diff --git a/java/src/main/java/com/azure/data/cosmos/core/Utf8String.java b/java/src/main/java/com/azure/data/cosmos/core/Utf8String.java index 073acfc..e8be3bf 100644 --- a/java/src/main/java/com/azure/data/cosmos/core/Utf8String.java +++ b/java/src/main/java/com/azure/data/cosmos/core/Utf8String.java @@ -9,25 +9,22 @@ import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.databind.DeserializationContext; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.SerializerProvider; +import com.fasterxml.jackson.databind.annotation.JsonDeserialize; import com.fasterxml.jackson.databind.annotation.JsonSerialize; import com.fasterxml.jackson.databind.deser.std.StdDeserializer; import com.fasterxml.jackson.databind.node.JsonNodeType; import com.fasterxml.jackson.databind.ser.std.StdSerializer; import com.google.common.base.Objects; import com.google.common.base.Suppliers; -import com.google.common.base.Utf8; import io.netty.buffer.ByteBuf; import io.netty.buffer.ByteBufHolder; -import io.netty.buffer.ByteBufUtil; import io.netty.buffer.Unpooled; import io.netty.util.ByteProcessor; import it.unimi.dsi.fastutil.ints.IntIterator; -import org.checkerframework.checker.initialization.qual.NotOnlyInitialized; import javax.annotation.Nonnull; import javax.annotation.Nullable; import java.io.IOException; -import java.nio.charset.StandardCharsets; import java.util.NoSuchElementException; import java.util.Optional; import java.util.PrimitiveIterator; @@ -43,6 +40,7 @@ import static com.google.common.base.Preconditions.checkState; import static com.google.common.base.Strings.lenientFormat; import static java.nio.charset.StandardCharsets.UTF_8; +@JsonDeserialize(using = Utf8String.JsonDeserializer.class) @JsonSerialize(using = Utf8String.JsonSerializer.class) @SuppressWarnings("UnstableApiUsage") public final class Utf8String implements ByteBufHolder, CharSequence, Comparable { @@ -147,8 +145,12 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable public final int compareTo(final String other) { - if (this.buffer == null) { - return other == null ? 0 : -1; + if (null == other) { + return null == this.buffer ? 0 : 1; + } + + if (null == this.buffer) { + return -1; } PrimitiveIterator.OfInt t = this.codePoints().iterator(); @@ -408,6 +410,80 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable return new Utf8String(Unpooled.wrappedBuffer(string.getBytes(UTF_8))); } + private static int toCodePoint(int characterEncoding) { + + if ((characterEncoding & 0b11111000_00000000_00000000_00000000) == 0b11110000_00000000_00000000_00000000) { + + // Map 4-byte UTF-8 encoding to code point in the [0x10000, 0x0FFFF] range + // + // UTF-8 encodings in this range have this bit pattern: + // + // Bits 24-31 = 0b11110VVV (byte 1) + // Bits 16-23 = 0b10ZZZZZZ (byte 2) + // Bits 08-15 = 0b10YYYYYY (byte 3) + // Bits 00-07 = 0b10XXXXXX (byte 4) + // + // The corresponding UTF-16 code point can be viewed as a 21-bit integer, + // 0bVVVZZZZZZYYYYYYXXXXXX. Hence, we map the UTF-8 code units into 3 bytes with the first + // 4 bits coming from the first (high order) byte, the next 6 bits from the second byte, + // the next 6 bits from the third byte, and the last 6 bits from the fourth (low order) + // byte. + + final int b1 = characterEncoding & 0b00000111_00000000_00000000_00000000; + final int b2 = characterEncoding & 0b00000000_00111111_00000000_00000000; + final int b3 = characterEncoding & 0b00000000_00000000_00111111_00000000; + final int b4 = characterEncoding & 0b00000000_00000000_00000000_00111111; + + return (b1 >> 6) | (b2 >> 4) | (b3 >> 2) | b4; + } + + if ((characterEncoding & 0b11111111_11110000_00000000_00000000) == 0b00000000_11100000_00000000_00000000) { + + // Map 3-byte UTF-8 encoding to code point in the [0x0800, 0xFFFF] range + // + // UTF-8 encodings in this range have this bit pattern: + // + // Bits 24-31 = 0b00000000 + // Bits 16-23 = 0b1110ZZZZ (byte 1) + // Bits 08-15 = 0b10YYYYYY (byte 2) + // Bits 00-07 = 0b10XXXXXX (byte 3) + // + // The corresponding UTF-16 code point can be viewed as a 16-bit integer, + // 0bZZZZYYYYYYXXXXXX. Hence, we map the UTF-8 code units into 2 bytes with the first 3 + // bits coming from the first (high order) byte, the next 6 bits from the second (mid order) + // byte, and the last 6 bits from the third (low order) byte. + + final int b1 = characterEncoding & 0b000011110000000000000000; + final int b2 = characterEncoding & 0b000000000011111100000000; + final int b3 = characterEncoding & 0b000000000000000000111111; + + return (b1 >> 4) | (b2 >> 2) | b3; + } + + if ((characterEncoding & 0b11111111_11111111_11100000_00000000) == 0b00000000_00000000_11000000_00000000) { + + // Map 2-byte UTF-8 character encoding to code point in the [0x0080, 0x07FF] range + // + // UTF-8 Encodings in this this range have this bit pattern: + // + // Bits 24-31 = 0b00000000 + // Bits 16-23 = 0b00000000 + // Bits 08-15 = 0b110YYYYY (byte 1) + // Bits 00-07 = 0b10XXXXXX (byte 2) + // + // The corresponding UTF-16 code point can be viewed as an 11-bit integer, 0bYYYYYXXXXXX. + // Hence, we map the UTF-8 code units into 1 byte with the first 5 bits coming from the + // first (high order) byte and the final 6 bits coming from the second (low order) byte + + final int b1 = characterEncoding & 0b0001111100000000; + final int b2 = characterEncoding & 0b0000000000111111; + + return (b1 >> 2) | b2; + } + + return -1; + } + private static final class CodePointIterator extends UTF8CodePointGetter implements IntIterator.OfInt { private final ByteBuf buffer; @@ -445,9 +521,9 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable } } - static final class Deserializer extends StdDeserializer { + static final class JsonDeserializer extends StdDeserializer { - private Deserializer() { + private JsonDeserializer() { super(Utf8String.class); } @@ -597,7 +673,6 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable private static final int REPLACEMENT_CHARACTER = 0xFFFD; private int codePoint = -1; - private int length = -1; private int shift = -1; /** @@ -626,84 +701,9 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable // End of code point sequence of length 2-4 - this.codePoint |= (value & 0xFF); this.shift = -1; - - switch (this.length) { - default: { - assert false : lenientFormat("codePoint: 0b%s, length: %s, shift: 0", - Integer.toBinaryString(this.codePoint), - this.length); - this.codePoint = REPLACEMENT_CHARACTER; - return false; - } - case 2: { - - // Map UTF-8 encoding to UTF-16 code point in the [0x0080, 0x07FF] range - // - // UTF-8 Encodings in this this range have this bit pattern: - // - // Bits 08-15 = 0b110YYYYY (byte 1) - // Bits 00-07 = 0b10XXXXXX (byte 2) - // - // The corresponding UTF-16 code point can be viewed as an 11-bit integer, 0bYYYYYXXXXXX. - // Hence, we map the UTF-8 code units into 1 byte with the first 5 bits coming from the - // first (high order) byte and the final 6 bits coming from the second (low order) byte - - final int b1 = this.codePoint & 0b0001111100000000; - final int b2 = this.codePoint & 0b0000000000111111; - - this.codePoint = (b1 >> 2) | b2; - break; - } - case 3: { - - // Map UTF-8 encoding to UTF-16 code point in the [0x0800, 0xFFFF] range - // - // UTF-8 encodings in this range have this bit pattern: - // - // Bits 16-23 = 0b1110ZZZZ (byte 1) - // Bits 08-15 = 0b10YYYYYY (byte 2) - // Bits 00-07 = 0b10XXXXXX (byte 3) - // - // The corresponding UTF-16 code point can be viewed as a 16-bit integer, - // 0bZZZZYYYYYYXXXXXX. Hence, we map the UTF-8 code units into 2 bytes with the first 3 - // bits coming from the first (high order) byte, the next 6 bits from the second (mid order) - // byte, and the last 6 bits from the third (low order) byte. - - final int b1 = this.codePoint & 0b000011110000000000000000; - final int b2 = this.codePoint & 0b000000000011111100000000; - final int b3 = this.codePoint & 0b000000000000000000111111; - - this.codePoint = (b1 >> 4) | (b2 >> 2) | b3; - break; - } - case 4: { - - // Map UTF-8 encoding to UTF-16 code point in the [0x10000, 0x0FFFF] range - // - // UTF-8 encodings in this range have this bit pattern: - // - // Bits 24-32 = 0b11110VVV (byte 1) - // Bits 16-23 = 0b10ZZZZZZ (byte 2) - // Bits 08-15 = 0b10YYYYYY (byte 3) - // Bits 00-07 = 0b10XXXXXX (byte 4) - // - // The corresponding UTF-16 code point can be viewed as a 21-bit integer, - // 0bVVVZZZZZZYYYYYYXXXXXX. Hence, we map the UTF-8 code units into 3 bytes with the first - // 4 bits coming from the first (high order) byte, the next 6 bits from the second byte, - // the next 6 bits from the third byte, and the last 6 bits from the fourth (low order) - // byte. - - final int b1 = this.codePoint & 0b00000111000000000000000000000000; - final int b2 = this.codePoint & 0b00000000001111110000000000000000; - final int b3 = this.codePoint & 0b00000000000000000011111100000000; - final int b4 = this.codePoint & 0b00000000000000000000000000111111; - - this.codePoint = (b1 >> 6) | (b2 >> 4) | (b3 >> 2) | b4; - break; - } - } + this.codePoint |= (value & 0xFF); + this.codePoint = toCodePoint(this.codePoint); if (!Character.isDefined(this.codePoint)) { this.codePoint = REPLACEMENT_CHARACTER; @@ -720,14 +720,12 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable if (leadingByte < 0x7F) { // UTF-8-1 = 0x00-7F this.codePoint = leadingByte; - this.length = 1; return false; } if (0xC2 <= leadingByte && leadingByte <= 0xDF) { // UTF8-8-2 = 0xC2-DF UTF8-tail this.codePoint = leadingByte << Byte.SIZE; - this.length = 2; this.shift = 0; return true; } @@ -735,7 +733,6 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable if (0xE0 <= leadingByte && leadingByte <= 0xEF) { // UTF-8-3 = 0xE0 0xA0-BF UTF8-tail / 0xE1-EC 2(UTF8-tail) / 0xED 0x80-9F UTF8-tail / 0xEE-EF 2(UTF8-tail) this.codePoint = leadingByte << 2 * Byte.SIZE; - this.length = 3; this.shift = Byte.SIZE; return true; } @@ -743,7 +740,6 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable if (0xF0 <= leadingByte && leadingByte <= 0xF4) { // UTF8-4 = 0xF0 0x90-BF 2( UTF8-tail ) / 0xF1-F3 3( UTF8-tail ) / 0xF4 0x80-8F 2( UTF8-tail ) this.codePoint = leadingByte << 3 * Byte.SIZE; - this.length = 4; this.shift = 2 * Byte.SIZE; return true; } @@ -770,13 +766,13 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable * This {@link #process(byte)} method reads a single code point at a time. The first byte read following * construction of an instance of this class must be a leading byte. This is used to determine the number of * single-byte UTF-8 code units in the code point. The {@link #process(byte)} method returns {@code false} when - * an undefined code point is encountered. + * an undefined code point is encountered as determined by {@link Character#isDefined(int)}}. * * @see RFC 3629: UTF-8, a transformation format of ISO 10646 */ private static class UTF8CodePointValidator implements ByteProcessor { - private int codePoint = 0; + private int codePoint = -1; private int shift = -1; /** @@ -796,7 +792,7 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable // Next unit of code point sequence - this.codePoint |= (value & 0xFF << this.shift); + this.codePoint |= ((value & 0xFF) << this.shift); this.shift -= Byte.SIZE; return true; } @@ -804,8 +800,9 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable // End of code point sequence - this.codePoint |= value & 0xFF; + this.codePoint |= (value & 0xFF); this.shift = -1; + this.codePoint = toCodePoint(this.codePoint); return Character.isDefined(this.codePoint); } @@ -813,7 +810,7 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable // Start of code point sequence - final int leadingByte = value & 0xFF; + final int leadingByte = (value & 0xFF); if (leadingByte < 0x7F) { // UTF-8-1 = 0x00-7F @@ -838,7 +835,7 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable if (0xF0 <= leadingByte && leadingByte <= 0xF4) { // UTF8-4 = 0xF0 0x90-BF 2( UTF8-tail ) / 0xF1-F3 3( UTF8-tail ) / 0xF4 0x80-8F 2( UTF8-tail ) this.codePoint = leadingByte << 3 * Byte.SIZE; - this.shift = 3 * Byte.SIZE; + this.shift = 2 * Byte.SIZE; return true; } diff --git a/java/src/test/java/com/azure/data/cosmos/core/Utf8StringTest.java b/java/src/test/java/com/azure/data/cosmos/core/Utf8StringTest.java index aaf424f..f69645c 100644 --- a/java/src/test/java/com/azure/data/cosmos/core/Utf8StringTest.java +++ b/java/src/test/java/com/azure/data/cosmos/core/Utf8StringTest.java @@ -15,6 +15,7 @@ import java.util.Optional; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertSame; +import static org.testng.Assert.assertThrows; import static org.testng.Assert.assertTrue; public class Utf8StringTest { @@ -55,12 +56,32 @@ public class Utf8StringTest { assertEquals(value.codePoints().iterator(), item.value().codePoints().iterator()); } - @Test - public void testCompareTo() { - } + @SuppressWarnings("EqualsWithItself") + @Test(dataProvider = "unicodeTextDataProvider") + public void testCompareTo(UnicodeTextItem item) { - @Test - public void testTestCompareTo() { + Utf8String value = Utf8String.transcodeUtf16(item.value()); + assertEquals(value.compareTo(value), 0); + assertEquals(value.compareTo(item.value()), 0); + + Utf8String unsafe = Utf8String.fromUnsafe(item.byteBuf()); + assertEquals(unsafe.compareTo(value), 0); + assertEquals(unsafe.compareTo(item.value()), 0); + + Optional optional = Utf8String.from(item.byteBuf()); + assertTrue(optional.isPresent()); + assertEquals(optional.get().compareTo(value), 0); + assertEquals(optional.get().compareTo(item.value()), 0); + + assertThrows(NullPointerException.class, () -> { + //noinspection ConstantConditions,ResultOfMethodCallIgnored + value.compareTo((Utf8String) null); + }); + + assertEquals(Utf8String.NULL.compareTo(item.value()), -1); + assertEquals(Utf8String.NULL.compareTo((String) null), 0); + assertEquals(Utf8String.EMPTY.compareTo((String) null), 1); + assertEquals(Utf8String.EMPTY.compareTo(Utf8String.NULL), 1); } @Test @@ -115,17 +136,22 @@ public class Utf8StringTest { private static Iterator unicodeTextData() { ImmutableList items = ImmutableList.of( - // English + // US ASCII (7-bit encoding) + // ..English new UnicodeTextItem("The quick brown fox jumps over the lazy dog."), - // German + // ISO-8859-1 (8-bit encoding that adds Latin-1 supplement to US ASCII) + // ..German new UnicodeTextItem("Der schnelle braune Fuchs springt über den faulen Hund."), - // Swedish + // ..Swedish new UnicodeTextItem("Den snabbbruna räven hoppar över den lata hunden."), - // Greek + // ISO 8859-7 (11-bit encoding that covers the Greek and Coptic alphabets) + // ..Greek new UnicodeTextItem("Η γρήγορη καφέ αλεπού πηδάει πάνω από το τεμπέλικο σκυλί."), - // Japanese + // Katakana code block (16-bit encoding containing katakana characters for the Japanese and Ainu languages) + // ..Japanese new UnicodeTextItem("速い茶色のキツネは怠laな犬を飛び越えます。"), - // Deseret alphabet + // Deseret code block (21-bit encoding containing an English alphabet invented by the LDS Church) + // ..Deseret new UnicodeTextItem("\uD801\uDC10\uD801\uDC2F\uD801\uDC4A\uD801\uDC2C, \uD801\uDC38\uD801\uDC35 \uD801\uDC2A\uD801\uDC49 \uD801\uDC4F?") ); @@ -150,6 +176,11 @@ public class Utf8StringTest { return Unpooled.wrappedBuffer(this.buffer); } + @Override + public String toString() { + return this.value.toString(); + } + public String value() { return this.value; }