Added and debugged some tests

This commit is contained in:
David Noble
2019-09-15 00:12:00 -07:00
parent f1959ba5b3
commit 0d1ec61cee
2 changed files with 194 additions and 22 deletions

View File

@@ -18,13 +18,16 @@ import com.google.common.base.Suppliers;
import com.google.common.base.Utf8;
import io.netty.buffer.ByteBuf;
import io.netty.buffer.ByteBufHolder;
import io.netty.buffer.ByteBufUtil;
import io.netty.buffer.Unpooled;
import io.netty.util.ByteProcessor;
import it.unimi.dsi.fastutil.ints.IntIterator;
import org.checkerframework.checker.initialization.qual.NotOnlyInitialized;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.PrimitiveIterator;
@@ -246,8 +249,15 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
*/
@Nonnull
public static Optional<Utf8String> from(@Nonnull final ByteBuf buffer) {

    checkNotNull(buffer, "expected non-null buffer");
    final int length = buffer.writerIndex();

    // A buffer with nothing written maps to the shared EMPTY instance; no validation is needed.
    if (length == 0) {
        return Optional.of(EMPTY);
    }

    // Validate in place with a ByteProcessor instead of buffer.array(), which is only available for buffers backed
    // by an accessible byte array. forEachByte yields the index of the first byte the validator rejected, or -1
    // when every byte in [0, length) was accepted.
    final int index = buffer.forEachByte(0, length, new UTF8CodePointValidator());

    // NOTE(review): a multi-byte sequence that is cut off by the end of the buffer never causes the validator to
    // return false, so such input is currently accepted here — confirm whether that needs an explicit check.
    return index >= 0 ? Optional.empty() : Optional.of(new Utf8String(buffer));
}
/**
@@ -262,7 +272,7 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
@Nonnull
public static Utf8String fromUnsafe(@Nonnull ByteBuf buffer) {
    checkNotNull(buffer, "expected non-null buffer");
    // No validation is performed on the bytes. A buffer with nothing written is mapped to the shared EMPTY
    // instance so that empty strings are always the same object.
    return buffer.writerIndex() == 0 ? EMPTY : new Utf8String(buffer);
}
@Override
@@ -356,6 +366,7 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
}
@Override
@Nonnull
public String toString() {
    // Decode every byte written to the underlying buffer, starting at index 0, as UTF-8.
    final int length = this.buffer.writerIndex();
    final CharSequence decoded = this.buffer.getCharSequence(0, length, UTF_8);
    return decoded.toString();
}
@@ -381,27 +392,20 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
* <p>
* This method must transcode the UTF-16 into UTF-8 which both requires allocation and is a size of data operation.
*
* @param string A UTF-16 encoding string or {@code null}
* @return A new {@link Utf8String} or {@code null}, if {@code string} is {@code null}
* @param string A UTF-16 encoded string or {@code null}.
*
* @return A new {@link Utf8String}; {@code Utf8String.EMPTY}, if {@code string} is empty; or
* {@code Utf8String.NULL}, if {@code string} is {@code null}.
*/
@Nonnull
public static Utf8String transcodeUtf16(@Nullable final String string) {

    // A null input is represented by the NULL sentinel so that this method never returns null.
    if (string == null) {
        return NULL;
    }

    // An empty input maps to the shared EMPTY instance; nothing needs to be allocated.
    if (string.isEmpty()) {
        return EMPTY;
    }

    // String.getBytes(Charset) sizes and fills the array in one step. It substitutes the charset's replacement
    // bytes for malformed input such as an unpaired surrogate, so the wrapped bytes are always well-formed UTF-8.
    return new Utf8String(Unpooled.wrappedBuffer(string.getBytes(UTF_8)));
}
private static final class CodePointIterator extends UTF8CodePointGetter implements IntIterator.OfInt {
@@ -417,7 +421,7 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
@Override
public boolean hasNext() {
    // next() scans from this.start and then sets this.start to one past the last byte it consumed, so another
    // code point is available exactly while start is a valid index below length.
    // NOTE(review): start and length are initialized outside this view — confirm length is an exclusive end
    // index rather than a remaining-byte count.
    return 0 <= this.start && this.start < this.length;
}
/**
@@ -433,8 +437,9 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
throw new NoSuchElementException();
}
this.start = this.buffer.forEachByte(this.start, this.length, this);
this.length -= this.start;
final int index = this.buffer.forEachByte(this.start, this.length - this.start, this);
assert index >= 0;
this.start = index + 1;
return this.codePoint();
}
@@ -676,4 +681,97 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
return this.codePoint;
}
}
/**
 * A {@link ByteProcessor} used to validate a UTF-8 encoded string.
 * <p>
 * The {@link #process(byte)} method consumes one code unit (byte) at a time. The first byte seen after construction,
 * and the first byte following each completed code point, must be a leading byte; it determines how many
 * continuation bytes follow. The {@link #process(byte)} method returns {@code false} at the first byte that makes
 * the input ill-formed or that completes an undefined code point.
 * <p>
 * A processor only sees bytes that exist, so input that ends in the middle of a multi-byte sequence is not rejected
 * by {@link #process(byte)}. Callers that need to detect truncation should check {@link #isComplete()} after the
 * last byte has been processed.
 *
 * @see <a href="https://tools.ietf.org/html/rfc3629">RFC 3629: UTF-8, a transformation format of ISO 10646</a>
 */
private static class UTF8CodePointValidator implements ByteProcessor {

    /** Scalar value decoded so far for the current (or most recently completed) code point. */
    private int codePoint = 0;

    /** Smallest scalar value that legitimately requires the current sequence length; guards against overlong forms. */
    private int minimum = 0;

    /** Number of continuation bytes still expected for the current code point; zero between code points. */
    private int remaining = 0;

    /**
     * Processes the next code unit in a UTF-8 code point sequence.
     *
     * @param value a {@code byte} representing the next code unit in a UTF-8 code point sequence.
     *
     * @return {@code false} if the code unit is not permitted at this position, or if it completes a code point
     * that is overlong, a surrogate, out of range, or undefined; otherwise, a value of {@code true}.
     */
    @Override
    public boolean process(byte value) {

        final int unit = value & 0xFF;

        if (this.remaining == 0) {
            // Start of code point sequence
            if (unit <= 0x7F) {
                // UTF8-1 = 0x00-7F (0x7F itself is a valid single-byte code point)
                this.codePoint = unit;
                return true;
            }
            if (0xC2 <= unit && unit <= 0xDF) {
                // UTF8-2 = 0xC2-DF UTF8-tail
                return this.begin(unit & 0x1F, 1, 0x80);
            }
            if (0xE0 <= unit && unit <= 0xEF) {
                // UTF8-3 = 0xE0 0xA0-BF UTF8-tail / 0xE1-EC 2(UTF8-tail) / 0xED 0x80-9F UTF8-tail / 0xEE-EF 2(UTF8-tail)
                return this.begin(unit & 0x0F, 2, 0x800);
            }
            if (0xF0 <= unit && unit <= 0xF4) {
                // UTF8-4 = 0xF0 0x90-BF 2( UTF8-tail ) / 0xF1-F3 3( UTF8-tail ) / 0xF4 0x80-8F 2( UTF8-tail )
                return this.begin(unit & 0x07, 3, 0x10000);
            }
            // 0x80-BF (stray continuation), 0xC0-C1 (always overlong), and 0xF5-FF can never start a sequence
            return false;
        }

        // Every byte after the leading byte must have the form 10xxxxxx
        if ((unit & 0xC0) != 0x80) {
            this.remaining = 0;
            return false;
        }

        // Each continuation byte contributes its low six bits to the scalar value
        this.codePoint = (this.codePoint << 6) | (unit & 0x3F);

        if (--this.remaining > 0) {
            // Next unit of code point sequence
            return true;
        }

        // End of code point sequence. The range checks below are equivalent to the second-byte restrictions that
        // RFC 3629 places on the 0xE0, 0xED, 0xF0, and 0xF4 leading bytes.
        final boolean surrogate = Character.MIN_SURROGATE <= this.codePoint
            && this.codePoint <= Character.MAX_SURROGATE;

        return this.minimum <= this.codePoint
            && this.codePoint <= Character.MAX_CODE_POINT
            && !surrogate
            && Character.isDefined(this.codePoint);
    }

    /**
     * Returns the value of the most-recently read code point.
     * <p>
     * While a multi-byte sequence is still in progress the value is only partially decoded.
     *
     * @return value of the most-recently read code point.
     */
    int codePoint() {
        return this.codePoint;
    }

    /**
     * Indicates whether the bytes processed so far end on a code point boundary.
     *
     * @return {@code true} if no continuation bytes are outstanding; otherwise {@code false}.
     */
    boolean isComplete() {
        return this.remaining == 0;
    }

    /** Records the payload bits of a leading byte and the shape of the multi-byte sequence it introduces. */
    private boolean begin(final int leadingBits, final int continuationCount, final int minimumCodePoint) {
        this.codePoint = leadingBits;
        this.remaining = continuationCount;
        this.minimum = minimumCodePoint;
        return true;
    }
}
}

View File

@@ -3,26 +3,56 @@
package com.azure.data.cosmos.core;
import com.google.common.collect.ImmutableList;
import io.netty.buffer.ByteBuf;
import io.netty.buffer.Unpooled;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import static org.testng.Assert.*;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.Optional;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertSame;
import static org.testng.Assert.assertTrue;
public class Utf8StringTest {
@Test
public void testIsEmpty() {

    // Transcoding an empty UTF-16 string yields the shared EMPTY instance
    final Utf8String transcoded = Utf8String.transcodeUtf16("");
    assertTrue(transcoded.isEmpty());
    assertSame(transcoded, Utf8String.EMPTY);

    // Wrapping an empty buffer without validation yields the shared EMPTY instance
    final Utf8String wrapped = Utf8String.fromUnsafe(Unpooled.EMPTY_BUFFER);
    assertTrue(wrapped.isEmpty());
    assertSame(wrapped, Utf8String.EMPTY);

    // Wrapping an empty buffer with validation yields a present Optional holding the shared EMPTY instance
    final Optional<Utf8String> validated = Utf8String.from(Unpooled.EMPTY_BUFFER);
    assertTrue(validated.isPresent());

    final Utf8String unwrapped = validated.get();
    assertTrue(unwrapped.isEmpty());
    assertSame(validated.get(), Utf8String.EMPTY);
}
@Test
public void testIsNull() {
    // Transcoding a null UTF-16 string must produce the NULL sentinel rather than a null reference
    final Utf8String transcoded = Utf8String.transcodeUtf16(null);
    final boolean reportsNull = transcoded.isNull();
    assertTrue(reportsNull);
    assertSame(transcoded, Utf8String.NULL);
}
@Test
public void testChars() {
    // Placeholder: the body is empty, so this test makes no assertions and passes vacuously.
}
@Test
public void testCodePoints() {
@Test(dataProvider = "unicodeTextDataProvider")
public void testCodePoints(UnicodeTextItem item) {
Utf8String value = Utf8String.transcodeUtf16(item.value());
assertEquals(value.codePoints().iterator(), item.value().codePoints().iterator());
}
@Test
@@ -80,4 +110,48 @@ public class Utf8StringTest {
@Test
public void testTranscodeUtf16() {
    // Placeholder: the body is empty, so this test makes no assertions and passes vacuously.
}
/**
 * Supplies sample sentences in several scripts, one {@link UnicodeTextItem} per test invocation.
 * <p>
 * The samples span single-byte through four-byte UTF-8 encodings; the Deseret sample is written with UTF-16
 * surrogate pairs.
 *
 * @return an iterator over single-element argument arrays, each holding one {@link UnicodeTextItem}.
 */
@DataProvider(name = "unicodeTextDataProvider")
private static Iterator<Object[]> unicodeTextData() {
    return ImmutableList.of(
            // English
            new UnicodeTextItem("The quick brown fox jumps over the lazy dog."),
            // German
            new UnicodeTextItem("Der schnelle braune Fuchs springt über den faulen Hund."),
            // Swedish
            new UnicodeTextItem("Den snabbbruna räven hoppar över den lata hunden."),
            // Greek
            new UnicodeTextItem("Η γρήγορη καφέ αλεπού πηδάει πάνω από το τεμπέλικο σκυλί."),
            // Japanese
            new UnicodeTextItem("速い茶色のキツネは怠laな犬を飛び越えます。"),
            // Deseret alphabet
            new UnicodeTextItem("\uD801\uDC10\uD801\uDC2F\uD801\uDC4A\uD801\uDC2C, \uD801\uDC38\uD801\uDC35 \uD801\uDC2A\uD801\uDC49 \uD801\uDC4F?"))
        .stream()
        .map(sample -> new Object[] { sample })
        .iterator();
}
/**
 * Test fixture pairing a UTF-16 {@link String} with its UTF-8 encoding.
 */
private static class UnicodeTextItem {

    private final String value;
    private final byte[] buffer;

    /**
     * Captures the text and encodes it as UTF-8 once, up front.
     *
     * @param text the sample text.
     */
    UnicodeTextItem(String text) {
        this.value = text;
        this.buffer = text.getBytes(StandardCharsets.UTF_8);
    }

    /** @return the UTF-8 bytes of the sample text (the internal array itself, not a copy). */
    public byte[] buffer() {
        return this.buffer;
    }

    /** @return a new {@link ByteBuf} wrapping the UTF-8 bytes of the sample text. */
    public ByteBuf byteBuf() {
        final byte[] encoded = this.buffer;
        return Unpooled.wrappedBuffer(encoded);
    }

    /** @return the sample text as originally supplied. */
    public String value() {
        return this.value;
    }
}
}