mirror of
https://github.com/microsoft/HybridRow.git
synced 2026-01-26 21:03:14 +00:00
Added and debugged some tests
This commit is contained in:
@@ -18,13 +18,16 @@ import com.google.common.base.Suppliers;
|
|||||||
import com.google.common.base.Utf8;
|
import com.google.common.base.Utf8;
|
||||||
import io.netty.buffer.ByteBuf;
|
import io.netty.buffer.ByteBuf;
|
||||||
import io.netty.buffer.ByteBufHolder;
|
import io.netty.buffer.ByteBufHolder;
|
||||||
|
import io.netty.buffer.ByteBufUtil;
|
||||||
import io.netty.buffer.Unpooled;
|
import io.netty.buffer.Unpooled;
|
||||||
import io.netty.util.ByteProcessor;
|
import io.netty.util.ByteProcessor;
|
||||||
import it.unimi.dsi.fastutil.ints.IntIterator;
|
import it.unimi.dsi.fastutil.ints.IntIterator;
|
||||||
|
import org.checkerframework.checker.initialization.qual.NotOnlyInitialized;
|
||||||
|
|
||||||
import javax.annotation.Nonnull;
|
import javax.annotation.Nonnull;
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.NoSuchElementException;
|
import java.util.NoSuchElementException;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.PrimitiveIterator;
|
import java.util.PrimitiveIterator;
|
||||||
@@ -246,8 +249,15 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
|
|||||||
*/
|
*/
|
||||||
@Nonnull
|
@Nonnull
|
||||||
public static Optional<Utf8String> from(@Nonnull final ByteBuf buffer) {
|
public static Optional<Utf8String> from(@Nonnull final ByteBuf buffer) {
|
||||||
|
|
||||||
checkNotNull(buffer, "expected non-null buffer");
|
checkNotNull(buffer, "expected non-null buffer");
|
||||||
return Utf8.isWellFormed(buffer.array()) ? Optional.of(new Utf8String(buffer)) : Optional.empty();
|
|
||||||
|
if (buffer.writerIndex() == 0) {
|
||||||
|
return Optional.of(EMPTY);
|
||||||
|
}
|
||||||
|
|
||||||
|
int index = buffer.forEachByte(0, buffer.writerIndex(), new UTF8CodePointValidator());
|
||||||
|
return index >= 0 ? Optional.empty() : Optional.of(new Utf8String(buffer));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -262,7 +272,7 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
|
|||||||
@Nonnull
|
@Nonnull
|
||||||
public static Utf8String fromUnsafe(@Nonnull ByteBuf buffer) {
|
public static Utf8String fromUnsafe(@Nonnull ByteBuf buffer) {
|
||||||
checkNotNull(buffer, "expected non-null buffer");
|
checkNotNull(buffer, "expected non-null buffer");
|
||||||
return new Utf8String(buffer);
|
return buffer.writerIndex() == 0 ? EMPTY : new Utf8String(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@@ -356,6 +366,7 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@Nonnull
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return this.buffer.getCharSequence(0, this.buffer.writerIndex(), UTF_8).toString();
|
return this.buffer.getCharSequence(0, this.buffer.writerIndex(), UTF_8).toString();
|
||||||
}
|
}
|
||||||
@@ -381,27 +392,20 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
|
|||||||
* <p>
|
* <p>
|
||||||
* This method must transcode the UTF-16 into UTF-8 which both requires allocation and is a size of data operation.
|
* This method must transcode the UTF-16 into UTF-8 which both requires allocation and is a size of data operation.
|
||||||
*
|
*
|
||||||
* @param string A UTF-16 encoding string or {@code null}
|
* @param string A UTF-16 encoded string or {@code null}.
|
||||||
* @return A new {@link Utf8String} or {@code null}, if {@code string} is {@code null}
|
*
|
||||||
|
* @return A new {@link Utf8String}, Utf8String.EMPTY, {@code string} is empty, or Utf8String.NULL, if
|
||||||
|
* {@code string} is {@code null}.
|
||||||
*/
|
*/
|
||||||
@Nullable
|
@Nonnull
|
||||||
public static Utf8String transcodeUtf16(@Nullable final String string) {
|
public static Utf8String transcodeUtf16(@Nullable final String string) {
|
||||||
|
|
||||||
if (string == null) {
|
if (string == null) {
|
||||||
return null;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (string.isEmpty()) {
|
if (string.isEmpty()) {
|
||||||
return EMPTY;
|
return EMPTY;
|
||||||
}
|
}
|
||||||
|
return new Utf8String(Unpooled.wrappedBuffer(string.getBytes(UTF_8)));
|
||||||
final int length = Utf8.encodedLength(string);
|
|
||||||
final ByteBuf buffer = Unpooled.wrappedBuffer(new byte[length]);
|
|
||||||
final int count = buffer.writeCharSequence(string, UTF_8);
|
|
||||||
|
|
||||||
checkState(count == length, "count: %s, length: %s", count, length);
|
|
||||||
|
|
||||||
return new Utf8String(buffer);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final class CodePointIterator extends UTF8CodePointGetter implements IntIterator.OfInt {
|
private static final class CodePointIterator extends UTF8CodePointGetter implements IntIterator.OfInt {
|
||||||
@@ -417,7 +421,7 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean hasNext() {
|
public boolean hasNext() {
|
||||||
return this.length > 0;
|
return 0 <= this.start && this.start < this.length;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -433,8 +437,9 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
|
|||||||
throw new NoSuchElementException();
|
throw new NoSuchElementException();
|
||||||
}
|
}
|
||||||
|
|
||||||
this.start = this.buffer.forEachByte(this.start, this.length, this);
|
final int index = this.buffer.forEachByte(this.start, this.length - this.start, this);
|
||||||
this.length -= this.start;
|
assert index >= 0;
|
||||||
|
this.start = index + 1;
|
||||||
|
|
||||||
return this.codePoint();
|
return this.codePoint();
|
||||||
}
|
}
|
||||||
@@ -676,4 +681,97 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
|
|||||||
return this.codePoint;
|
return this.codePoint;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link ByteProcessor} used to validate a UTF-8 encoded strings.
|
||||||
|
* <p>
|
||||||
|
* This {@link #process(byte)} method reads a single code point at a time. The first byte read following
|
||||||
|
* construction of an instance of this class must be a leading byte. This is used to determine the number of
|
||||||
|
* single-byte UTF-8 code units in the code point. The {@link #process(byte)} method returns {@code false} when
|
||||||
|
* an undefined code point is encountered.
|
||||||
|
*
|
||||||
|
* @see <a href="https://tools.ietf.org/html/rfc3629">RFC 3629: UTF-8, a transformation format of ISO 10646</a>
|
||||||
|
*/
|
||||||
|
private static class UTF8CodePointValidator implements ByteProcessor {
|
||||||
|
|
||||||
|
private int codePoint = 0;
|
||||||
|
private int shift = -1;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Processes the next code unit in a UTF-8 code point sequence.
|
||||||
|
*
|
||||||
|
* @param value a {@code byte} representing the next code unit in a UTF-8 code point sequence.
|
||||||
|
*
|
||||||
|
* @return {@code false} if the current code unit signals the end of an undefined code point; otherwise, a value
|
||||||
|
* of {@code true}.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public boolean process(byte value) {
|
||||||
|
|
||||||
|
switch (this.shift) {
|
||||||
|
|
||||||
|
default: {
|
||||||
|
|
||||||
|
// Next unit of code point sequence
|
||||||
|
|
||||||
|
this.codePoint |= (value & 0xFF << this.shift);
|
||||||
|
this.shift -= Byte.SIZE;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
case 0: {
|
||||||
|
|
||||||
|
// End of code point sequence
|
||||||
|
|
||||||
|
this.codePoint |= value & 0xFF;
|
||||||
|
this.shift = -1;
|
||||||
|
|
||||||
|
return Character.isDefined(this.codePoint);
|
||||||
|
}
|
||||||
|
case -1: {
|
||||||
|
|
||||||
|
// Start of code point sequence
|
||||||
|
|
||||||
|
final int leadingByte = value & 0xFF;
|
||||||
|
|
||||||
|
if (leadingByte < 0x7F) {
|
||||||
|
// UTF-8-1 = 0x00-7F
|
||||||
|
this.codePoint = leadingByte;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (0xC2 <= leadingByte && leadingByte <= 0xDF) {
|
||||||
|
// UTF8-8-2 = 0xC2-DF UTF8-tail
|
||||||
|
this.codePoint = leadingByte << Byte.SIZE;
|
||||||
|
this.shift = 0;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (0xE0 <= leadingByte && leadingByte <= 0xEF) {
|
||||||
|
// UTF-8-3 = 0xE0 0xA0-BF UTF8-tail / 0xE1-EC 2(UTF8-tail) / 0xED 0x80-9F UTF8-tail / 0xEE-EF 2(UTF8-tail)
|
||||||
|
this.codePoint = leadingByte << 2 * Byte.SIZE;
|
||||||
|
this.shift = Byte.SIZE;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (0xF0 <= leadingByte && leadingByte <= 0xF4) {
|
||||||
|
// UTF8-4 = 0xF0 0x90-BF 2( UTF8-tail ) / 0xF1-F3 3( UTF8-tail ) / 0xF4 0x80-8F 2( UTF8-tail )
|
||||||
|
this.codePoint = leadingByte << 3 * Byte.SIZE;
|
||||||
|
this.shift = 3 * Byte.SIZE;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the value of the most-recently read code point.
|
||||||
|
*
|
||||||
|
* @return value of the most-recently read code point.
|
||||||
|
*/
|
||||||
|
int codePoint() {
|
||||||
|
return this.codePoint;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,26 +3,56 @@
|
|||||||
|
|
||||||
package com.azure.data.cosmos.core;
|
package com.azure.data.cosmos.core;
|
||||||
|
|
||||||
|
import com.google.common.collect.ImmutableList;
|
||||||
|
import io.netty.buffer.ByteBuf;
|
||||||
|
import io.netty.buffer.Unpooled;
|
||||||
|
import org.testng.annotations.DataProvider;
|
||||||
import org.testng.annotations.Test;
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
import static org.testng.Assert.*;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import static org.testng.Assert.assertEquals;
|
||||||
|
import static org.testng.Assert.assertSame;
|
||||||
|
import static org.testng.Assert.assertTrue;
|
||||||
|
|
||||||
public class Utf8StringTest {
|
public class Utf8StringTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testIsEmpty() {
|
public void testIsEmpty() {
|
||||||
|
|
||||||
|
Utf8String value;
|
||||||
|
|
||||||
|
value = Utf8String.transcodeUtf16("");
|
||||||
|
assertTrue(value.isEmpty());
|
||||||
|
assertSame(value, Utf8String.EMPTY);
|
||||||
|
|
||||||
|
value = Utf8String.fromUnsafe(Unpooled.EMPTY_BUFFER);
|
||||||
|
assertTrue(value.isEmpty());
|
||||||
|
assertSame(value, Utf8String.EMPTY);
|
||||||
|
|
||||||
|
Optional<Utf8String> optional = Utf8String.from(Unpooled.EMPTY_BUFFER);
|
||||||
|
assertTrue(optional.isPresent());
|
||||||
|
assertTrue(optional.get().isEmpty());
|
||||||
|
assertSame(optional.get(), Utf8String.EMPTY);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testIsNull() {
|
public void testIsNull() {
|
||||||
|
Utf8String value = Utf8String.transcodeUtf16(null);
|
||||||
|
assertTrue(value.isNull());
|
||||||
|
assertSame(value, Utf8String.NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testChars() {
|
public void testChars() {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test(dataProvider = "unicodeTextDataProvider")
|
||||||
public void testCodePoints() {
|
public void testCodePoints(UnicodeTextItem item) {
|
||||||
|
Utf8String value = Utf8String.transcodeUtf16(item.value());
|
||||||
|
assertEquals(value.codePoints().iterator(), item.value().codePoints().iterator());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -80,4 +110,48 @@ public class Utf8StringTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testTranscodeUtf16() {
|
public void testTranscodeUtf16() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@DataProvider(name = "unicodeTextDataProvider")
|
||||||
|
private static Iterator<Object[]> unicodeTextData() {
|
||||||
|
|
||||||
|
ImmutableList<UnicodeTextItem> items = ImmutableList.of(
|
||||||
|
// English
|
||||||
|
new UnicodeTextItem("The quick brown fox jumps over the lazy dog."),
|
||||||
|
// German
|
||||||
|
new UnicodeTextItem("Der schnelle braune Fuchs springt über den faulen Hund."),
|
||||||
|
// Swedish
|
||||||
|
new UnicodeTextItem("Den snabbbruna räven hoppar över den lata hunden."),
|
||||||
|
// Greek
|
||||||
|
new UnicodeTextItem("Η γρήγορη καφέ αλεπού πηδάει πάνω από το τεμπέλικο σκυλί."),
|
||||||
|
// Japanese
|
||||||
|
new UnicodeTextItem("速い茶色のキツネは怠laな犬を飛び越えます。"),
|
||||||
|
// Deseret alphabet
|
||||||
|
new UnicodeTextItem("\uD801\uDC10\uD801\uDC2F\uD801\uDC4A\uD801\uDC2C, \uD801\uDC38\uD801\uDC35 \uD801\uDC2A\uD801\uDC49 \uD801\uDC4F?")
|
||||||
|
);
|
||||||
|
|
||||||
|
return items.stream().map(item -> new Object[] { item }).iterator();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class UnicodeTextItem {
|
||||||
|
|
||||||
|
private final byte[] buffer;
|
||||||
|
private final String value;
|
||||||
|
|
||||||
|
UnicodeTextItem(String value) {
|
||||||
|
this.buffer = value.getBytes(StandardCharsets.UTF_8);
|
||||||
|
this.value = value;
|
||||||
|
}
|
||||||
|
|
||||||
|
public byte[] buffer() {
|
||||||
|
return this.buffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public ByteBuf byteBuf() {
|
||||||
|
return Unpooled.wrappedBuffer(this.buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String value() {
|
||||||
|
return this.value;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user