mirror of
https://github.com/Azure/cosmos-explorer.git
synced 2026-01-22 11:14:05 +00:00
* Fixes an issue where tab titles were not truncating when characters used 4 bytes for encoding. * Changed substringUtf method to be more accurate and added comments
75 lines
2.9 KiB
TypeScript
75 lines
2.9 KiB
TypeScript
/**
 * Removes every U+0020 space character from a string.
 *
 * Only the plain space character is removed; tabs, newlines and other whitespace are kept.
 *
 * @param inputString The string to strip. May be undefined.
 * @returns The input without spaces. If the input is not a string at runtime (for example
 * undefined), it is returned unchanged.
 */
export function stripSpacesFromString(inputString?: string): string | undefined {
  // A single typeof check covers undefined as well as any non-string value that slipped past the type system.
  if (typeof inputString !== "string") {
    return inputString;
  }

  return inputString.replace(/ /g, "");
}
|
|
|
|
/**
 * Implementation of endsWith which works for IE (it does not rely on String.prototype.endsWith).
 *
 * @param stringToTest The string whose ending is inspected
 * @param suffix The text that @param stringToTest is expected to end with
 * @returns true when @param stringToTest ends with @param suffix. An empty suffix always matches.
 */
export function endsWith(stringToTest: string, suffix: string): boolean {
  // The only position at which the suffix can sit and still touch the end of the string.
  const expectedIndex = stringToTest.length - suffix.length;

  // A negative index means the suffix is longer than the string, so it cannot match.
  return expectedIndex >= 0 && stringToTest.indexOf(suffix, expectedIndex) === expectedIndex;
}
|
|
|
|
/**
 * Implementation of startsWith that does not rely on String.prototype.startsWith.
 *
 * @param stringToTest The string whose beginning is inspected
 * @param prefix The text that @param stringToTest is expected to begin with
 * @returns true when @param stringToTest begins with @param prefix. An empty prefix always matches.
 */
export function startsWith(stringToTest: string, prefix: string): boolean {
  // lastIndexOf with fromIndex 0 only considers a match at index 0, so a missing prefix
  // does not cause a scan of the whole string the way indexOf(prefix) would.
  return stringToTest.lastIndexOf(prefix, 0) === 0;
}
|
|
|
|
/**
 * Returns the input number of characters from a desired string but takes into account characters encoded with different byte sizes.
 *
 * A "character" here is a whole Unicode code point, so a character that occupies two UTF-16
 * code units (for example most emoji) counts as one and is never cut in half. An unpaired
 * surrogate counts as one character and is returned as-is.
 *
 * The requested window is clamped to the text: asking for more characters than are available
 * returns whatever is there, and a window that lies entirely outside the text (or a
 * @param numChars that is zero, negative or NaN) returns an empty string.
 *
 * @param text The text from which to return the subset
 * @param startChar The starting character from @param text (zero-based)
 * @param numChars The number of characters to return starting from @param startChar
 * @returns The resulting slice of characters
 */
export const substringUtf = (text: string, startChar: number, numChars: number): string => {
  // Written as a negated comparison so that NaN is rejected along with zero and negative counts.
  if (!(numChars > 0)) {
    return "";
  }

  const endChar = startChar + numChars;
  let result = "";
  let charIndex = 0;

  // Iterating a string yields one full code point per step, which handles characters of
  // every encoded width without inspecting individual bytes.
  for (const character of text) {
    if (charIndex >= endChar) {
      break;
    }
    if (charIndex >= startChar) {
      result += character;
    }
    charIndex++;
  }

  return result;
};
|