Refinements to Utf8String and Utf8StringTest

2026-03-18 13:53:22 +00:00 · 2019-10-02 15:00:35 -07:00
parent afaccd551b
commit 7d38ded957
2 changed files with 81 additions and 30 deletions
--- a/java/src/main/java/com/azure/data/cosmos/core/Utf8String.java
+++ b/java/src/main/java/com/azure/data/cosmos/core/Utf8String.java
@@ -478,15 +478,20 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
     * @param start the start index, inclusive.
     * @param end   the end index, exclusive.
     * @return the specified subsequence
-     * @throws IndexOutOfBoundsException, if
-     *                                    <p><ul>
-     *                                    <li>{@code start} or {@code end} are negative,
-     *                                    <li>{@code end} is greater than {@code length()}, or
-     *                                    <li>{@code start} is greater than {@code end}.
-     *                                    </ul></p>
+     * @throws IllegalArgumentException  if the values of {@code start} or {@code end} would cause a code point to be
+     *                                   split into a surrogate pair. This exception will only be thrown on sequences
+     *                                   containing 4-byte UTF-8 encodings. Whether the exception is thrown on sequences
+     *                                   containing 4-byte UTF-8 encodings depends on the values of {@code start} and
+     *                                   {@code end}. To avoid this exception at the cost of data conversion and memory
+     *                                   allocation, convert this {@link Utf8String} to a {@link String} and call
+     *                                   {@link String#subSequence}.
+     * @throws IndexOutOfBoundsException if {@code start} or {@code end} are negative, {@code end} is greater than
+     *                                   {@link #length()}, <li>{@code start} is greater than {@code end}, or
+     *                                   {@link #isNull()} is {@code true}.
     */
+    @Nonnull
    @Override
-    public CharSequence subSequence(int start, int end) {
+    public CharSequence subSequence(final int start, final int end) {

        final int length = this.length();

@@ -499,6 +504,10 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
            return EMPTY;
        }

+        if (start == 0 && end == length) {
+            return this;
+        }
+
        final int encodedLength = this.buffer.writerIndex();
        final int i, n;

@@ -508,11 +517,22 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
        } else {
            final UTF16CodeUnitCounter counter = new UTF16CodeUnitCounter(start);
            i = this.buffer.forEachByte(0, encodedLength, counter);
+            checkArgument(counter.index == counter.end, "start: %s, end: %s, counter: %s", start, end, counter);
            n = encodedLength - i;
        }

-        final int j = this.buffer.forEachByte(i, n, new UTF16CodeUnitCounter(end - start));
-        return fromUnsafe(this.buffer.slice(i, j >= 0 ? j - i : n));
+        final int j;
+
+        if (end == length) {
+            j = encodedLength;
+        } else {
+            final UTF16CodeUnitCounter counter = new UTF16CodeUnitCounter(end - start);
+            j = this.buffer.forEachByte(i, n, counter);
+            checkArgument(counter.index == counter.end, "start: %s, end: %s, counter: %s", start, end, counter);
+            assert j >= 0;
+        }
+
+        return fromUnsafe(this.buffer.slice(i, j - i));
    }

    @Override
@@ -588,11 +608,10 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
            //  Bits 08-15 = 0b10YYYYYY (byte 3)
            //  Bits 00-07 = 0b10XXXXXX (byte 4)
            //
-            // The corresponding UTF-16 code point can be viewed as a 21-bit integer,
-            // 0bVVVZZZZZZYYYYYYXXXXXX. Hence, we map the UTF-8 code units into 3 bytes with the first
-            // 4 bits coming from the first (high order) byte, the next 6 bits from the second byte,
-            // the next 6 bits from the third byte, and the last 6 bits from the fourth (low order)
-            // byte.
+            // The corresponding code point can be viewed as a 21-bit integer, 0bVVVZZZZZZYYYYYYXXXXXX. Hence, we map
+            // the UTF-8 code units into 3 bytes with the first 3 bits coming from the first (high order) byte, the
+            // next 6 bits from the second byte, the next 6 bits from the third byte, and the last 6 bits from the
+            // fourth (low order) byte.

            final int b1 = characterEncoding & 0b00000111_00000000_00000000_00000000;
            final int b2 = characterEncoding & 0b00000000_00111111_00000000_00000000;
@@ -613,10 +632,9 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
            //  Bits 08-15 = 0b10YYYYYY (byte 2)
            //  Bits 00-07 = 0b10XXXXXX (byte 3)
            //
-            // The corresponding UTF-16 code point can be viewed as a 16-bit integer,
-            // 0bZZZZYYYYYYXXXXXX. Hence, we map the UTF-8 code units into 2 bytes with the first 3
-            // bits coming from the first (high order) byte, the next 6 bits from the second (mid order)
-            // byte, and the last 6 bits from the third (low order) byte.
+            // The corresponding code point can be viewed as a 16-bit integer, 0bZZZZYYYYYYXXXXXX. Hence, we map the
+            // UTF-8 code units into 2 bytes with the first 3 bits coming from the first (high order) byte, the next 6
+            // bits from the second (mid order) byte, and the last 6 bits from the third (low order) byte.

            final int b1 = characterEncoding & 0b000011110000000000000000;
            final int b2 = characterEncoding & 0b000000000011111100000000;
@@ -636,9 +654,9 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
            //  Bits 08-15 = 0b110YYYYY (byte 1)
            //  Bits 00-07 = 0b10XXXXXX (byte 2)
            //
-            // The corresponding UTF-16 code point can be viewed as an 11-bit integer, 0bYYYYYXXXXXX.
-            // Hence, we map the UTF-8 code units into 1 byte with the first 5 bits coming from the
-            // first (high order) byte and the final 6 bits coming from the second (low order) byte
+            // The corresponding code point can be viewed as an 11-bit integer, 0bYYYYYXXXXXX. Hence, we map the UTF-8
+            // code units into 1 byte with the first 5 bits coming from the first (high order) byte and the final 6
+            // bits coming from the second (low order) byte.

            final int b1 = characterEncoding & 0b0001111100000000;
            final int b2 = characterEncoding & 0b0000000000111111;
@@ -792,8 +810,13 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable

        @JsonProperty
        private final int end;
+
        @JsonProperty
        private int count = 0;
+
+        @JsonProperty
+        private int index = 0;
+
        @JsonProperty
        private int skip = 0;

@@ -815,6 +838,7 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
            }

            final int leadingByte = value & 0xFF;
+            this.index = this.count;

            if (leadingByte < 0x7F) {
                // UTF-8-1 = 0x00-7F
@@ -841,10 +865,6 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
            return this.count <= this.end;
        }

-        public int skip() {
-            return this.skip;
-        }
-
        public int value() {
            return this.count;
        }
@@ -936,8 +956,8 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
                    }

                    if (0xF0 <= leadingByte && leadingByte <= 0xF4) {
-                        // UTF8-4 = 0xF0 0x90-BF 2( UTF8-tail ) / 0xF1-F3 3( UTF8-tail ) / 0xF4 0x80-8F 2( UTF8-tail )
-                        this.codePoint = leadingByte << 3 * Byte.SIZE;
+                        // UTF-8-4 = 0xF0 0x90-BF 2( UTF8-tail ) / 0xF1-F3 3( UTF8-tail ) / 0xF4 0x80-8F 2( UTF8-tail )
+                        this.codePoint = leadingByte << (3 * Byte.SIZE);
                        this.shift = 2 * Byte.SIZE;
                        return true;
                    }
--- a/java/src/test/java/com/azure/data/cosmos/core/Utf8StringTest.java
+++ b/java/src/test/java/com/azure/data/cosmos/core/Utf8StringTest.java
@@ -16,6 +16,7 @@ import java.util.Optional;

 import static org.testng.Assert.assertEquals;
 import static org.testng.Assert.assertFalse;
+import static org.testng.Assert.assertNotNull;
 import static org.testng.Assert.assertSame;
 import static org.testng.Assert.assertThrows;
 import static org.testng.Assert.assertTrue;
@@ -170,10 +171,40 @@ public class Utf8StringTest {

        Utf8String value = Utf8String.fromUnsafe(item.byteBuf());

+        for (int start = 0, end = start + 1; end <= value.length(); end++) {
+            try {
+                final Utf8String actual = (Utf8String)value.subSequence(start, end);
+                assertNotNull(actual);
+                assertFalse(actual.isNull());
+                assertEquals(actual.toUtf16(), item.value.subSequence(start, end));
+            } catch (IllegalArgumentException error) {
+                // TODO: DANOBLE: assertions
+                System.out.println(error.toString());
+            }
+        }
+
+        for (int start = 1, end = start + 1; end <= value.length(); end++) {
+            try {
+                final Utf8String actual = (Utf8String)value.subSequence(start, end);
+                assertNotNull(actual);
+                assertFalse(actual.isNull());
+                assertEquals(actual.toUtf16(), item.value.subSequence(start, end));
+            } catch (IllegalArgumentException error) {
+                // TODO: DANOBLE: assertions
+                System.out.println(error.toString());
+            }
+        }
+
        for (int start = 0, end = value.length(); start <= end; start++, end--) {
-            String actual = ((Utf8String)value.subSequence(start, end)).toUtf16();
-            String expected = (String)item.value.subSequence(start, end);
-            assertEquals(actual, expected);
+            try {
+                final Utf8String actual = (Utf8String) value.subSequence(start, end);
+                assertNotNull(actual);
+                assertFalse(actual.isNull());
+                assertEquals(actual.toUtf16(), item.value.subSequence(start, end));
+            } catch (IllegalArgumentException error) {
+                // TODO: DANOBLE: assertions
+                System.out.println(error.toString());
+            }
        }
    }