mirror of
https://github.com/microsoft/HybridRow.git
synced 2026-01-26 12:53:12 +00:00
Refinements to Utf8String and Utf8StringTest
This commit is contained in:
@@ -478,15 +478,20 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
|
|||||||
* @param start the start index, inclusive.
|
* @param start the start index, inclusive.
|
||||||
* @param end the end index, exclusive.
|
* @param end the end index, exclusive.
|
||||||
* @return the specified subsequence
|
* @return the specified subsequence
|
||||||
* @throws IndexOutOfBoundsException, if
|
* @throws IllegalArgumentException if the values of {@code start} or {@code end} would cause a code point to be
|
||||||
* <p><ul>
|
* split into a surrogate pair. This exception will only be thrown on sequences
|
||||||
* <li>{@code start} or {@code end} are negative,
|
* containing 4-byte UTF-8 encodings. Whether the exception is thrown on sequences
|
||||||
* <li>{@code end} is greater than {@code length()}, or
|
* containing 4-byte UTF-8 encodings depends on the values of {@code start} and
|
||||||
* <li>{@code start} is greater than {@code end}.
|
* {@code end}. To avoid this exception at the cost of data conversion and memory
|
||||||
* </ul></p>
|
* allocation, convert this {@link Utf8String} to a {@link String} and call
|
||||||
|
* {@link String#subSequence}.
|
||||||
|
* @throws IndexOutOfBoundsException if {@code start} or {@code end} are negative, {@code end} is greater than
|
||||||
|
* {@link #length()}, {@code start} is greater than {@code end}, or
|
||||||
|
* {@link #isNull()} is {@code true}.
|
||||||
*/
|
*/
|
||||||
|
@Nonnull
|
||||||
@Override
|
@Override
|
||||||
public CharSequence subSequence(int start, int end) {
|
public CharSequence subSequence(final int start, final int end) {
|
||||||
|
|
||||||
final int length = this.length();
|
final int length = this.length();
|
||||||
|
|
||||||
@@ -499,6 +504,10 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
|
|||||||
return EMPTY;
|
return EMPTY;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (start == 0 && end == length) {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
final int encodedLength = this.buffer.writerIndex();
|
final int encodedLength = this.buffer.writerIndex();
|
||||||
final int i, n;
|
final int i, n;
|
||||||
|
|
||||||
@@ -508,11 +517,22 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
|
|||||||
} else {
|
} else {
|
||||||
final UTF16CodeUnitCounter counter = new UTF16CodeUnitCounter(start);
|
final UTF16CodeUnitCounter counter = new UTF16CodeUnitCounter(start);
|
||||||
i = this.buffer.forEachByte(0, encodedLength, counter);
|
i = this.buffer.forEachByte(0, encodedLength, counter);
|
||||||
|
checkArgument(counter.index == counter.end, "start: %s, end: %s, counter: %s", start, end, counter);
|
||||||
n = encodedLength - i;
|
n = encodedLength - i;
|
||||||
}
|
}
|
||||||
|
|
||||||
final int j = this.buffer.forEachByte(i, n, new UTF16CodeUnitCounter(end - start));
|
final int j;
|
||||||
return fromUnsafe(this.buffer.slice(i, j >= 0 ? j - i : n));
|
|
||||||
|
if (end == length) {
|
||||||
|
j = encodedLength;
|
||||||
|
} else {
|
||||||
|
final UTF16CodeUnitCounter counter = new UTF16CodeUnitCounter(end - start);
|
||||||
|
j = this.buffer.forEachByte(i, n, counter);
|
||||||
|
checkArgument(counter.index == counter.end, "start: %s, end: %s, counter: %s", start, end, counter);
|
||||||
|
assert j >= 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return fromUnsafe(this.buffer.slice(i, j - i));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@@ -588,11 +608,10 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
|
|||||||
// Bits 08-15 = 0b10YYYYYY (byte 3)
|
// Bits 08-15 = 0b10YYYYYY (byte 3)
|
||||||
// Bits 00-07 = 0b10XXXXXX (byte 4)
|
// Bits 00-07 = 0b10XXXXXX (byte 4)
|
||||||
//
|
//
|
||||||
// The corresponding UTF-16 code point can be viewed as a 21-bit integer,
|
// The corresponding code point can be viewed as a 21-bit integer, 0bVVVZZZZZZYYYYYYXXXXXX. Hence, we map
|
||||||
// 0bVVVZZZZZZYYYYYYXXXXXX. Hence, we map the UTF-8 code units into 3 bytes with the first
|
// the UTF-8 code units into 3 bytes with the first 3 bits coming from the first (high order) byte, the
|
||||||
// 4 bits coming from the first (high order) byte, the next 6 bits from the second byte,
|
// next 6 bits from the second byte, the next 6 bits from the third byte, and the last 6 bits from the
|
||||||
// the next 6 bits from the third byte, and the last 6 bits from the fourth (low order)
|
// fourth (low order) byte.
|
||||||
// byte.
|
|
||||||
|
|
||||||
final int b1 = characterEncoding & 0b00000111_00000000_00000000_00000000;
|
final int b1 = characterEncoding & 0b00000111_00000000_00000000_00000000;
|
||||||
final int b2 = characterEncoding & 0b00000000_00111111_00000000_00000000;
|
final int b2 = characterEncoding & 0b00000000_00111111_00000000_00000000;
|
||||||
@@ -613,10 +632,9 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
|
|||||||
// Bits 08-15 = 0b10YYYYYY (byte 2)
|
// Bits 08-15 = 0b10YYYYYY (byte 2)
|
||||||
// Bits 00-07 = 0b10XXXXXX (byte 3)
|
// Bits 00-07 = 0b10XXXXXX (byte 3)
|
||||||
//
|
//
|
||||||
// The corresponding UTF-16 code point can be viewed as a 16-bit integer,
|
// The corresponding code point can be viewed as a 16-bit integer, 0bZZZZYYYYYYXXXXXX. Hence, we map the
|
||||||
// 0bZZZZYYYYYYXXXXXX. Hence, we map the UTF-8 code units into 2 bytes with the first 3
|
// UTF-8 code units into 2 bytes with the first 4 bits coming from the first (high order) byte, the next 6
|
||||||
// bits coming from the first (high order) byte, the next 6 bits from the second (mid order)
|
// bits from the second (mid order) byte, and the last 6 bits from the third (low order) byte.
|
||||||
// byte, and the last 6 bits from the third (low order) byte.
|
|
||||||
|
|
||||||
final int b1 = characterEncoding & 0b000011110000000000000000;
|
final int b1 = characterEncoding & 0b000011110000000000000000;
|
||||||
final int b2 = characterEncoding & 0b000000000011111100000000;
|
final int b2 = characterEncoding & 0b000000000011111100000000;
|
||||||
@@ -636,9 +654,9 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
|
|||||||
// Bits 08-15 = 0b110YYYYY (byte 1)
|
// Bits 08-15 = 0b110YYYYY (byte 1)
|
||||||
// Bits 00-07 = 0b10XXXXXX (byte 2)
|
// Bits 00-07 = 0b10XXXXXX (byte 2)
|
||||||
//
|
//
|
||||||
// The corresponding UTF-16 code point can be viewed as an 11-bit integer, 0bYYYYYXXXXXX.
|
// The corresponding code point can be viewed as an 11-bit integer, 0bYYYYYXXXXXX. Hence, we map the UTF-8
|
||||||
// Hence, we map the UTF-8 code units into 1 byte with the first 5 bits coming from the
|
// code units into 2 bytes with the first 5 bits coming from the first (high order) byte and the final 6
|
||||||
// first (high order) byte and the final 6 bits coming from the second (low order) byte
|
// bits coming from the second (low order) byte.
|
||||||
|
|
||||||
final int b1 = characterEncoding & 0b0001111100000000;
|
final int b1 = characterEncoding & 0b0001111100000000;
|
||||||
final int b2 = characterEncoding & 0b0000000000111111;
|
final int b2 = characterEncoding & 0b0000000000111111;
|
||||||
@@ -792,8 +810,13 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
|
|||||||
|
|
||||||
@JsonProperty
|
@JsonProperty
|
||||||
private final int end;
|
private final int end;
|
||||||
|
|
||||||
@JsonProperty
|
@JsonProperty
|
||||||
private int count = 0;
|
private int count = 0;
|
||||||
|
|
||||||
|
@JsonProperty
|
||||||
|
private int index = 0;
|
||||||
|
|
||||||
@JsonProperty
|
@JsonProperty
|
||||||
private int skip = 0;
|
private int skip = 0;
|
||||||
|
|
||||||
@@ -815,6 +838,7 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
|
|||||||
}
|
}
|
||||||
|
|
||||||
final int leadingByte = value & 0xFF;
|
final int leadingByte = value & 0xFF;
|
||||||
|
this.index = this.count;
|
||||||
|
|
||||||
if (leadingByte < 0x7F) {
|
if (leadingByte < 0x7F) {
|
||||||
// UTF-8-1 = 0x00-7F
|
// UTF-8-1 = 0x00-7F
|
||||||
@@ -841,10 +865,6 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
|
|||||||
return this.count <= this.end;
|
return this.count <= this.end;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int skip() {
|
|
||||||
return this.skip;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int value() {
|
public int value() {
|
||||||
return this.count;
|
return this.count;
|
||||||
}
|
}
|
||||||
@@ -936,8 +956,8 @@ public final class Utf8String implements ByteBufHolder, CharSequence, Comparable
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (0xF0 <= leadingByte && leadingByte <= 0xF4) {
|
if (0xF0 <= leadingByte && leadingByte <= 0xF4) {
|
||||||
// UTF8-4 = 0xF0 0x90-BF 2( UTF8-tail ) / 0xF1-F3 3( UTF8-tail ) / 0xF4 0x80-8F 2( UTF8-tail )
|
// UTF-8-4 = 0xF0 0x90-BF 2( UTF8-tail ) / 0xF1-F3 3( UTF8-tail ) / 0xF4 0x80-8F 2( UTF8-tail )
|
||||||
this.codePoint = leadingByte << 3 * Byte.SIZE;
|
this.codePoint = leadingByte << (3 * Byte.SIZE);
|
||||||
this.shift = 2 * Byte.SIZE;
|
this.shift = 2 * Byte.SIZE;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ import java.util.Optional;
|
|||||||
|
|
||||||
import static org.testng.Assert.assertEquals;
|
import static org.testng.Assert.assertEquals;
|
||||||
import static org.testng.Assert.assertFalse;
|
import static org.testng.Assert.assertFalse;
|
||||||
|
import static org.testng.Assert.assertNotNull;
|
||||||
import static org.testng.Assert.assertSame;
|
import static org.testng.Assert.assertSame;
|
||||||
import static org.testng.Assert.assertThrows;
|
import static org.testng.Assert.assertThrows;
|
||||||
import static org.testng.Assert.assertTrue;
|
import static org.testng.Assert.assertTrue;
|
||||||
@@ -170,10 +171,40 @@ public class Utf8StringTest {
|
|||||||
|
|
||||||
Utf8String value = Utf8String.fromUnsafe(item.byteBuf());
|
Utf8String value = Utf8String.fromUnsafe(item.byteBuf());
|
||||||
|
|
||||||
|
for (int start = 0, end = start + 1; end <= value.length(); end++) {
|
||||||
|
try {
|
||||||
|
final Utf8String actual = (Utf8String)value.subSequence(start, end);
|
||||||
|
assertNotNull(actual);
|
||||||
|
assertFalse(actual.isNull());
|
||||||
|
assertEquals(actual.toUtf16(), item.value.subSequence(start, end));
|
||||||
|
} catch (IllegalArgumentException error) {
|
||||||
|
// TODO: DANOBLE: assertions
|
||||||
|
System.out.println(error.toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int start = 1, end = start + 1; end <= value.length(); end++) {
|
||||||
|
try {
|
||||||
|
final Utf8String actual = (Utf8String)value.subSequence(start, end);
|
||||||
|
assertNotNull(actual);
|
||||||
|
assertFalse(actual.isNull());
|
||||||
|
assertEquals(actual.toUtf16(), item.value.subSequence(start, end));
|
||||||
|
} catch (IllegalArgumentException error) {
|
||||||
|
// TODO: DANOBLE: assertions
|
||||||
|
System.out.println(error.toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for (int start = 0, end = value.length(); start <= end; start++, end--) {
|
for (int start = 0, end = value.length(); start <= end; start++, end--) {
|
||||||
String actual = ((Utf8String)value.subSequence(start, end)).toUtf16();
|
try {
|
||||||
String expected = (String)item.value.subSequence(start, end);
|
final Utf8String actual = (Utf8String) value.subSequence(start, end);
|
||||||
assertEquals(actual, expected);
|
assertNotNull(actual);
|
||||||
|
assertFalse(actual.isNull());
|
||||||
|
assertEquals(actual.toUtf16(), item.value.subSequence(start, end));
|
||||||
|
} catch (IllegalArgumentException error) {
|
||||||
|
// TODO: DANOBLE: assertions
|
||||||
|
System.out.println(error.toString());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user