mirror of
https://github.com/Azure/cosmos-explorer.git
synced 2025-12-19 00:41:31 +00:00
Fixes an issue where tab titles were not truncating when characters used 4 bytes for encoding (#2254)
* Fixes an issue where tab titles were not truncated when characters used 4 bytes for encoding.
* Changed the substringUtf method to be more accurate and added comments.
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
import { OpenTab } from "Contracts/ActionContracts";
|
||||
import { KeyboardActionGroup, clearKeyboardActionGroup } from "KeyboardShortcuts";
|
||||
import { substringUtf } from "Utils/StringUtils";
|
||||
import * as ko from "knockout";
|
||||
import * as Constants from "../../Common/Constants";
|
||||
import * as ThemeUtility from "../../Common/ThemeUtility";
|
||||
@@ -154,13 +155,13 @@ export default class TabsBase extends WaitsForTemplateViewModel {
|
||||
const db = this.database?.id();
|
||||
if (coll) {
|
||||
if (coll.length > 8) {
|
||||
return coll.slice(0, 5) + "…" + options.title;
|
||||
return substringUtf(coll, 0, 5) + "…" + options.title;
|
||||
} else {
|
||||
return coll + "." + options.title;
|
||||
}
|
||||
} else if (db) {
|
||||
if (db.length > 8) {
|
||||
return db.slice(0, 5) + "…" + options.title;
|
||||
return substringUtf(db, 0, 5) + "…" + options.title;
|
||||
} else {
|
||||
return db + "." + options.title;
|
||||
}
|
||||
|
||||
@@ -26,5 +26,22 @@ describe("StringUtils", () => {
|
||||
const transformedString: string | undefined = StringUtils.stripSpacesFromString("");
|
||||
expect(transformedString).toBe("");
|
||||
});
|
||||
|
||||
it("should return the right number of characters regardless of bytes used per character", () => {
|
||||
// Tried to use a sample of characters across the range for each of the individual byte lengths
|
||||
const ascii = "!,n~!,n~!,n~";
|
||||
const twoByteCharacters = "Āā߿܀Āā߿܀Āā߿܀";
|
||||
const threeByteCharacters = "ࠀ倀ꀀࠀ倀ꀀࠀ倀ꀀ";
|
||||
const fourByteCharacters = "𐀀𐔀𐨀𐿶𐀀𐔀𐨀𐿶𐀀𐔀𐨀𐿶";
|
||||
// Used a random character generator for each of the different byte-lengths of characters for the mixed tests
|
||||
const mixedByteSizes = "Yח䙶𫶾eԚ疿𱺿]߉ꗫ𢆤*ɉ貸𪡑";
|
||||
|
||||
expect(StringUtils.substringUtf(ascii, 0, 5)).toBe("!,n~!");
|
||||
expect(StringUtils.substringUtf(twoByteCharacters, 0, 5)).toBe("Āā߿܀Ā");
|
||||
expect(StringUtils.substringUtf(threeByteCharacters, 0, 5)).toBe("ࠀ倀ꀀࠀ");
|
||||
expect(StringUtils.substringUtf(fourByteCharacters, 0, 5)).toBe("𐀀𐔀𐨀𐿶𐀀");
|
||||
expect(StringUtils.substringUtf(mixedByteSizes, 0, 5)).toBe("Yח䙶𫶾e");
|
||||
expect(StringUtils.substringUtf(mixedByteSizes, 4, 4)).toBe("eԚ疿𱺿");
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -17,3 +17,58 @@ export function endsWith(stringToTest: string, suffix: string): boolean {
|
||||
export function startsWith(stringToTest: string, prefix: string): boolean {
|
||||
return stringToTest.indexOf(prefix) === 0;
|
||||
}
|
||||
|
||||
/**
 * Returns up to `numChars` characters of `text`, starting at character `startChar`, where a
 * "character" is a whole Unicode code point rather than a UTF-16 code unit. Characters that need
 * 4 bytes in UTF-8 (stored as a surrogate pair in a JavaScript string) are counted once and are
 * never cut in half, unlike `String.prototype.slice`.
 *
 * Unpaired surrogates count as one character each and are returned unchanged.
 *
 * @param text The text from which to return the subset
 * @param startChar The index of the first character to return (zero-based); negative values are treated as 0
 * @param numChars The maximum number of characters to return starting from `startChar`
 * @returns The resulting slice of characters. It is shorter than `numChars` (possibly empty) when
 * `text` ends first, and empty when `numChars` is zero or negative.
 */
export const substringUtf = (text: string, startChar: number, numChars: number): string => {
  const firstChar = Math.max(0, startChar);
  const endChar = startChar + numChars; // exclusive

  // Written as a negated ">" so that NaN arguments also produce an empty result.
  if (!(endChar > firstChar)) {
    return "";
  }

  let unitIndex = 0; // position in UTF-16 code units
  let charIndex = 0; // position in whole characters (code points)

  // Skip the characters that precede the requested range.
  while (charIndex < firstChar && unitIndex < text.length) {
    unitIndex += codeUnitsAt(text, unitIndex);
    charIndex++;
  }
  const startUnit = unitIndex;

  // Consume the requested characters, stopping early if the text runs out.
  while (charIndex < endChar && unitIndex < text.length) {
    unitIndex += codeUnitsAt(text, unitIndex);
    charIndex++;
  }

  return text.slice(startUnit, unitIndex);
};

/**
 * Returns how many UTF-16 code units the character starting at `index` occupies:
 * 2 for a well-formed surrogate pair (high surrogate followed by low surrogate), otherwise 1.
 */
function codeUnitsAt(text: string, index: number): number {
  const lead = text.charCodeAt(index);
  if (lead >= 0xd800 && lead <= 0xdbff && index + 1 < text.length) {
    const trail = text.charCodeAt(index + 1);
    if (trail >= 0xdc00 && trail <= 0xdfff) {
      return 2;
    }
  }
  return 1;
}
|
||||
|
||||
Reference in New Issue
Block a user