Added and debugged some tests

2026-03-19 06:13:19 +00:00 · 2019-09-15 00:12:00 -07:00
parent f1959ba5b3
commit 0d1ec61cee
2 changed files with 194 additions and 22 deletions
--- a/java/src/main/java/com/azure/data/cosmos/core/Utf8String.java
+++ b/java/src/main/java/com/azure/data/cosmos/core/Utf8String.java
@@ -18,13 +18,16 @@ import com.google.common.base.Suppliers;
 import com.google.common.base.Utf8;
 import io.netty.buffer.ByteBuf;
 import io.netty.buffer.ByteBufHolder;
+import io.netty.buffer.ByteBufUtil;
 import io.netty.buffer.Unpooled;
 import io.netty.util.ByteProcessor;
 import it.unimi.dsi.fastutil.ints.IntIterator;
+import org.checkerframework.checker.initialization.qual.NotOnlyInitialized;

 import javax.annotation.Nonnull;
 import javax.annotation.Nullable;
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.NoSuchElementException;
 import java.util.Optional;
 import java.util.PrimitiveIterator;
@@ -246,8 +249,15 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
     */
    @Nonnull
    public static Optional<Utf8String> from(@Nonnull final ByteBuf buffer) {
+
        checkNotNull(buffer, "expected non-null buffer");
-        return Utf8.isWellFormed(buffer.array()) ? Optional.of(new Utf8String(buffer)) : Optional.empty();
+
+        if (buffer.writerIndex() == 0) {
+            return Optional.of(EMPTY);
+        }
+
+        int index = buffer.forEachByte(0, buffer.writerIndex(), new UTF8CodePointValidator());
+        return index >= 0 ? Optional.empty() : Optional.of(new Utf8String(buffer));
    }

    /**
@@ -262,7 +272,7 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
    @Nonnull
    public static Utf8String fromUnsafe(@Nonnull ByteBuf buffer) {
        checkNotNull(buffer, "expected non-null buffer");
-        return new Utf8String(buffer);
+        return buffer.writerIndex() == 0 ? EMPTY : new Utf8String(buffer);
    }

    @Override
@@ -356,6 +366,7 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
    }

    @Override
+    @Nonnull
    public String toString() {
        return this.buffer.getCharSequence(0, this.buffer.writerIndex(), UTF_8).toString();
    }
@@ -381,27 +392,20 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
     * <p>
     * This method must transcode the UTF-16 into UTF-8 which both requires allocation and is a size of data operation.
     *
-     * @param string A UTF-16 encoding string or {@code null}
-     * @return A new {@link Utf8String} or {@code null}, if {@code string} is {@code null}
+     * @param string A UTF-16 encoded string or {@code null}.
+     *
+     * @return A new {@link Utf8String}, Utf8String.EMPTY, {@code string} is empty, or Utf8String.NULL, if
+     * {@code string} is {@code null}.
     */
-    @Nullable
+    @Nonnull
    public static Utf8String transcodeUtf16(@Nullable final String string) {
-
        if (string == null) {
-            return null;
+            return NULL;
        }
-
        if (string.isEmpty()) {
            return EMPTY;
        }
-
-        final int length = Utf8.encodedLength(string);
-        final ByteBuf buffer = Unpooled.wrappedBuffer(new byte[length]);
-        final int count = buffer.writeCharSequence(string, UTF_8);
-
-        checkState(count == length, "count: %s, length: %s", count, length);
-
-        return new Utf8String(buffer);
+        return new Utf8String(Unpooled.wrappedBuffer(string.getBytes(UTF_8)));
    }

    private static final class CodePointIterator extends UTF8CodePointGetter implements IntIterator.OfInt {
@@ -417,7 +421,7 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable

        @Override
        public boolean hasNext() {
-            return this.length > 0;
+            return 0 <= this.start && this.start < this.length;
        }

        /**
@@ -433,8 +437,9 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
                throw new NoSuchElementException();
            }

-            this.start = this.buffer.forEachByte(this.start, this.length, this);
-            this.length -= this.start;
+            final int index = this.buffer.forEachByte(this.start, this.length - this.start, this);
+            assert index >= 0;
+            this.start = index + 1;

            return this.codePoint();
        }
@@ -676,4 +681,97 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
            return this.codePoint;
        }
    }
+
+    /**
+     * A {@link ByteProcessor} used to validate a UTF-8 encoded strings.
+     * <p>
+     * This {@link #process(byte)} method reads a single code point at a time. The first byte read following
+     * construction of an instance of this class must be a leading byte. This is used to determine the number of
+     * single-byte UTF-8 code units in the code point. The {@link #process(byte)} method returns {@code false} when
+     * an undefined code point is encountered.
+     *
+     * @see <a href="https://tools.ietf.org/html/rfc3629">RFC 3629: UTF-8, a transformation format of ISO 10646</a>
+     */
+    private static class UTF8CodePointValidator implements ByteProcessor {
+
+        private int codePoint = 0;
+        private int shift = -1;
+
+        /**
+         * Processes the next code unit in a UTF-8 code point sequence.
+         *
+         * @param value a {@code byte} representing the next code unit in a UTF-8 code point sequence.
+         *
+         * @return {@code false} if the current code unit signals the end of an undefined code point; otherwise, a value
+         * of {@code true}.
+         */
+        @Override
+        public boolean process(byte value) {
+
+            switch (this.shift) {
+
+                default: {
+
+                    // Next unit of code point sequence
+
+                    this.codePoint |= (value & 0xFF << this.shift);
+                    this.shift -= Byte.SIZE;
+                    return true;
+                }
+                case 0: {
+
+                    // End of code point sequence
+
+                    this.codePoint |= value & 0xFF;
+                    this.shift = -1;
+
+                    return Character.isDefined(this.codePoint);
+                }
+                case -1: {
+
+                    // Start of code point sequence
+
+                    final int leadingByte = value & 0xFF;
+
+                    if (leadingByte < 0x7F) {
+                        // UTF-8-1 = 0x00-7F
+                        this.codePoint = leadingByte;
+                        return true;
+                    }
+
+                    if (0xC2 <= leadingByte && leadingByte <= 0xDF) {
+                        // UTF8-8-2 = 0xC2-DF UTF8-tail
+                        this.codePoint = leadingByte << Byte.SIZE;
+                        this.shift = 0;
+                        return true;
+                    }
+
+                    if (0xE0 <= leadingByte && leadingByte <= 0xEF) {
+                        // UTF-8-3 = 0xE0 0xA0-BF UTF8-tail / 0xE1-EC 2(UTF8-tail) / 0xED 0x80-9F UTF8-tail / 0xEE-EF 2(UTF8-tail)
+                        this.codePoint = leadingByte << 2 * Byte.SIZE;
+                        this.shift = Byte.SIZE;
+                        return true;
+                    }
+
+                    if (0xF0 <= leadingByte && leadingByte <= 0xF4) {
+                        // UTF8-4 = 0xF0 0x90-BF 2( UTF8-tail ) / 0xF1-F3 3( UTF8-tail ) / 0xF4 0x80-8F 2( UTF8-tail )
+                        this.codePoint = leadingByte << 3 * Byte.SIZE;
+                        this.shift = 3 * Byte.SIZE;
+                        return true;
+                    }
+
+                    return false;
+                }
+            }
+        }
+
+        /**
+         * Returns the value of the most-recently read code point.
+         *
+         * @return value of the most-recently read code point.
+         */
+        int codePoint() {
+            return this.codePoint;
+        }
+    }
 }
--- a/java/src/test/java/com/azure/data/cosmos/core/Utf8StringTest.java
+++ b/java/src/test/java/com/azure/data/cosmos/core/Utf8StringTest.java
@@ -3,26 +3,56 @@

 package com.azure.data.cosmos.core;

+import com.google.common.collect.ImmutableList;
+import io.netty.buffer.ByteBuf;
+import io.netty.buffer.Unpooled;
+import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;

-import static org.testng.Assert.*;
+import java.nio.charset.StandardCharsets;
+import java.util.Iterator;
+import java.util.Optional;
+
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertSame;
+import static org.testng.Assert.assertTrue;

 public class Utf8StringTest {

    @Test
    public void testIsEmpty() {
+
+        Utf8String value;
+
+        value = Utf8String.transcodeUtf16("");
+        assertTrue(value.isEmpty());
+        assertSame(value, Utf8String.EMPTY);
+
+        value = Utf8String.fromUnsafe(Unpooled.EMPTY_BUFFER);
+        assertTrue(value.isEmpty());
+        assertSame(value, Utf8String.EMPTY);
+
+        Optional<Utf8String> optional = Utf8String.from(Unpooled.EMPTY_BUFFER);
+        assertTrue(optional.isPresent());
+        assertTrue(optional.get().isEmpty());
+        assertSame(optional.get(), Utf8String.EMPTY);
    }

    @Test
    public void testIsNull() {
+        Utf8String value = Utf8String.transcodeUtf16(null);
+        assertTrue(value.isNull());
+        assertSame(value, Utf8String.NULL);
    }

    @Test
    public void testChars() {
    }

-    @Test
-    public void testCodePoints() {
+    @Test(dataProvider = "unicodeTextDataProvider")
+    public void testCodePoints(UnicodeTextItem item) {
+        Utf8String value = Utf8String.transcodeUtf16(item.value());
+        assertEquals(value.codePoints().iterator(), item.value().codePoints().iterator());
    }

    @Test
@@ -80,4 +110,48 @@ public class Utf8StringTest {
    @Test
    public void testTranscodeUtf16() {
    }
+
+    @DataProvider(name = "unicodeTextDataProvider")
+    private static Iterator<Object[]> unicodeTextData() {
+
+        ImmutableList<UnicodeTextItem> items = ImmutableList.of(
+            // English
+            new UnicodeTextItem("The quick brown fox jumps over the lazy dog."),
+            // German
+            new UnicodeTextItem("Der schnelle braune Fuchs springt über den faulen Hund."),
+            // Swedish
+            new UnicodeTextItem("Den snabbbruna räven hoppar över den lata hunden."),
+            // Greek
+            new UnicodeTextItem("Η γρήγορη καφέ αλεπού πηδάει πάνω από το τεμπέλικο σκυλί."),
+            // Japanese
+            new UnicodeTextItem("速い茶色のキツネは怠laな犬を飛び越えます。"),
+            // Deseret alphabet
+            new UnicodeTextItem("\uD801\uDC10\uD801\uDC2F\uD801\uDC4A\uD801\uDC2C, \uD801\uDC38\uD801\uDC35 \uD801\uDC2A\uD801\uDC49 \uD801\uDC4F?")
+        );
+
+        return items.stream().map(item -> new Object[] { item }).iterator();
+    }
+
+    private static class UnicodeTextItem {
+
+        private final byte[] buffer;
+        private final String value;
+
+        UnicodeTextItem(String value) {
+            this.buffer = value.getBytes(StandardCharsets.UTF_8);
+            this.value = value;
+        }
+
+        public byte[] buffer() {
+            return this.buffer;
+        }
+
+        public ByteBuf byteBuf() {
+            return Unpooled.wrappedBuffer(this.buffer);
+        }
+
+        public String value() {
+            return this.value;
+        }
+    }
 }