diff --git a/java/src/main/java/com/azure/data/cosmos/core/Utf8String.java b/java/src/main/java/com/azure/data/cosmos/core/Utf8String.java index 2af8a85..060d037 100644 --- a/java/src/main/java/com/azure/data/cosmos/core/Utf8String.java +++ b/java/src/main/java/com/azure/data/cosmos/core/Utf8String.java @@ -478,15 +478,20 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable * @param start the start index, inclusive. * @param end the end index, exclusive. * @return the specified subsequence - * @throws IndexOutOfBoundsException, if - *

+ * @throws IllegalArgumentException if {@code start} or {@code end} falls between the two UTF-16 code units of a + * surrogate pair and would therefore split a code point. This exception can only be + * thrown on sequences containing 4-byte UTF-8 encodings, and then only when + * {@code start} or {@code end} lands in the middle of such an encoding's surrogate + * pair. To avoid this exception at the cost of data conversion and memory + * allocation, convert this {@link Utf8String} to a {@link String} and call + * {@link String#subSequence}. + * @throws IndexOutOfBoundsException if {@code start} or {@code end} are negative, {@code end} is greater than + * {@link #length()},
  • {@code start} is greater than {@code end}, or + * {@link #isNull()} is {@code true}. */ + @Nonnull @Override - public CharSequence subSequence(int start, int end) { + public CharSequence subSequence(final int start, final int end) { final int length = this.length(); @@ -499,6 +504,10 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable return EMPTY; } + if (start == 0 && end == length) { + return this; + } + final int encodedLength = this.buffer.writerIndex(); final int i, n; @@ -508,11 +517,22 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable } else { final UTF16CodeUnitCounter counter = new UTF16CodeUnitCounter(start); i = this.buffer.forEachByte(0, encodedLength, counter); + checkArgument(counter.index == counter.end, "start: %s, end: %s, counter: %s", start, end, counter); n = encodedLength - i; } - final int j = this.buffer.forEachByte(i, n, new UTF16CodeUnitCounter(end - start)); - return fromUnsafe(this.buffer.slice(i, j >= 0 ? j - i : n)); + final int j; + + if (end == length) { + j = encodedLength; + } else { + final UTF16CodeUnitCounter counter = new UTF16CodeUnitCounter(end - start); + j = this.buffer.forEachByte(i, n, counter); + checkArgument(counter.index == counter.end, "start: %s, end: %s, counter: %s", start, end, counter); + assert j >= 0; + } + + return fromUnsafe(this.buffer.slice(i, j - i)); } @Override @@ -588,11 +608,10 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable // Bits 08-15 = 0b10YYYYYY (byte 3) // Bits 00-07 = 0b10XXXXXX (byte 4) // - // The corresponding UTF-16 code point can be viewed as a 21-bit integer, - // 0bVVVZZZZZZYYYYYYXXXXXX. Hence, we map the UTF-8 code units into 3 bytes with the first - // 4 bits coming from the first (high order) byte, the next 6 bits from the second byte, - // the next 6 bits from the third byte, and the last 6 bits from the fourth (low order) - // byte. 
+ // The corresponding code point can be viewed as a 21-bit integer, 0bVVVZZZZZZYYYYYYXXXXXX. Hence, we map + // the UTF-8 code units into 3 bytes with the first 3 bits coming from the first (high order) byte, the + // next 6 bits from the second byte, the next 6 bits from the third byte, and the last 6 bits from the + // fourth (low order) byte. final int b1 = characterEncoding & 0b00000111_00000000_00000000_00000000; final int b2 = characterEncoding & 0b00000000_00111111_00000000_00000000; @@ -613,10 +632,9 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable // Bits 08-15 = 0b10YYYYYY (byte 2) // Bits 00-07 = 0b10XXXXXX (byte 3) // - // The corresponding UTF-16 code point can be viewed as a 16-bit integer, - // 0bZZZZYYYYYYXXXXXX. Hence, we map the UTF-8 code units into 2 bytes with the first 3 - // bits coming from the first (high order) byte, the next 6 bits from the second (mid order) - // byte, and the last 6 bits from the third (low order) byte. + // The corresponding code point can be viewed as a 16-bit integer, 0bZZZZYYYYYYXXXXXX. Hence, we map the + // UTF-8 code units into 2 bytes with the first 4 bits coming from the first (high order) byte, the next 6 + // bits from the second (mid order) byte, and the last 6 bits from the third (low order) byte. final int b1 = characterEncoding & 0b000011110000000000000000; final int b2 = characterEncoding & 0b000000000011111100000000; @@ -636,9 +654,9 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable // Bits 08-15 = 0b110YYYYY (byte 1) // Bits 00-07 = 0b10XXXXXX (byte 2) // - // The corresponding UTF-16 code point can be viewed as an 11-bit integer, 0bYYYYYXXXXXX. - // Hence, we map the UTF-8 code units into 1 byte with the first 5 bits coming from the - // first (high order) byte and the final 6 bits coming from the second (low order) byte + // The corresponding code point can be viewed as an 11-bit integer, 0bYYYYYXXXXXX.
Hence, we map the UTF-8 + // code units into 1 byte with the first 5 bits coming from the first (high order) byte and the final 6 + // bits coming from the second (low order) byte. final int b1 = characterEncoding & 0b0001111100000000; final int b2 = characterEncoding & 0b0000000000111111; @@ -792,8 +810,13 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable @JsonProperty private final int end; + @JsonProperty private int count = 0; + + @JsonProperty + private int index = 0; + @JsonProperty private int skip = 0; @@ -815,6 +838,7 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable } final int leadingByte = value & 0xFF; + this.index = this.count; if (leadingByte < 0x7F) { // UTF-8-1 = 0x00-7F @@ -841,10 +865,6 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable return this.count <= this.end; } - public int skip() { - return this.skip; - } - public int value() { return this.count; } @@ -936,8 +956,8 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable } if (0xF0 <= leadingByte && leadingByte <= 0xF4) { - // UTF8-4 = 0xF0 0x90-BF 2( UTF8-tail ) / 0xF1-F3 3( UTF8-tail ) / 0xF4 0x80-8F 2( UTF8-tail ) - this.codePoint = leadingByte << 3 * Byte.SIZE; + // UTF-8-4 = 0xF0 0x90-BF 2( UTF8-tail ) / 0xF1-F3 3( UTF8-tail ) / 0xF4 0x80-8F 2( UTF8-tail ) + this.codePoint = leadingByte << (3 * Byte.SIZE); this.shift = 2 * Byte.SIZE; return true; } diff --git a/java/src/test/java/com/azure/data/cosmos/core/Utf8StringTest.java b/java/src/test/java/com/azure/data/cosmos/core/Utf8StringTest.java index ba984e7..9c64fd6 100644 --- a/java/src/test/java/com/azure/data/cosmos/core/Utf8StringTest.java +++ b/java/src/test/java/com/azure/data/cosmos/core/Utf8StringTest.java @@ -16,6 +16,7 @@ import java.util.Optional; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; import static 
org.testng.Assert.assertSame; import static org.testng.Assert.assertThrows; import static org.testng.Assert.assertTrue; @@ -170,10 +171,40 @@ public class Utf8StringTest { Utf8String value = Utf8String.fromUnsafe(item.byteBuf()); + for (int start = 0, end = start + 1; end <= value.length(); end++) { + try { + final Utf8String actual = (Utf8String)value.subSequence(start, end); + assertNotNull(actual); + assertFalse(actual.isNull()); + assertEquals(actual.toUtf16(), item.value.subSequence(start, end)); + } catch (IllegalArgumentException error) { + // TODO: DANOBLE: assertions + System.out.println(error.toString()); + } + } + + for (int start = 1, end = start + 1; end <= value.length(); end++) { + try { + final Utf8String actual = (Utf8String)value.subSequence(start, end); + assertNotNull(actual); + assertFalse(actual.isNull()); + assertEquals(actual.toUtf16(), item.value.subSequence(start, end)); + } catch (IllegalArgumentException error) { + // TODO: DANOBLE: assertions + System.out.println(error.toString()); + } + } + for (int start = 0, end = value.length(); start <= end; start++, end--) { - String actual = ((Utf8String)value.subSequence(start, end)).toUtf16(); - String expected = (String)item.value.subSequence(start, end); - assertEquals(actual, expected); + try { + final Utf8String actual = (Utf8String) value.subSequence(start, end); + assertNotNull(actual); + assertFalse(actual.isNull()); + assertEquals(actual.toUtf16(), item.value.subSequence(start, end)); + } catch (IllegalArgumentException error) { + // TODO: DANOBLE: assertions + System.out.println(error.toString()); + } } }