Fixes an issue where tab titles were not truncating when characters used 4 bytes for encoding (#2254)

* Fixes an issue where tab titles were not truncating when characters used 4 bytes for encoding.

* Changed substringUtf method to be more accurate and added comments
This commit is contained in:
vchske
2025-11-24 12:36:03 -08:00
committed by GitHub
parent 0fac59967a
commit 490309b403
3 changed files with 75 additions and 2 deletions

View File

@@ -1,5 +1,6 @@
import { OpenTab } from "Contracts/ActionContracts"; import { OpenTab } from "Contracts/ActionContracts";
import { KeyboardActionGroup, clearKeyboardActionGroup } from "KeyboardShortcuts"; import { KeyboardActionGroup, clearKeyboardActionGroup } from "KeyboardShortcuts";
import { substringUtf } from "Utils/StringUtils";
import * as ko from "knockout"; import * as ko from "knockout";
import * as Constants from "../../Common/Constants"; import * as Constants from "../../Common/Constants";
import * as ThemeUtility from "../../Common/ThemeUtility"; import * as ThemeUtility from "../../Common/ThemeUtility";
@@ -154,13 +155,13 @@ export default class TabsBase extends WaitsForTemplateViewModel {
const db = this.database?.id(); const db = this.database?.id();
if (coll) { if (coll) {
if (coll.length > 8) { if (coll.length > 8) {
return coll.slice(0, 5) + "…" + options.title; return substringUtf(coll, 0, 5) + "…" + options.title;
} else { } else {
return coll + "." + options.title; return coll + "." + options.title;
} }
} else if (db) { } else if (db) {
if (db.length > 8) { if (db.length > 8) {
return db.slice(0, 5) + "…" + options.title; return substringUtf(db, 0, 5) + "…" + options.title;
} else { } else {
return db + "." + options.title; return db + "." + options.title;
} }

View File

@@ -26,5 +26,22 @@ describe("StringUtils", () => {
const transformedString: string | undefined = StringUtils.stripSpacesFromString(""); const transformedString: string | undefined = StringUtils.stripSpacesFromString("");
expect(transformedString).toBe(""); expect(transformedString).toBe("");
}); });
it("should return the right number of characters regardless of bytes used per character", () => {
// Tried to use a sample of characters across the range for each of the individual byte lengths
const ascii = "!,n~!,n~!,n~";
const twoByteCharacters = "Āā߿܀Āā߿܀Āā߿܀";
const threeByteCharacters = "ࠀ倀ꀀࠀ倀ꀀࠀ倀ꀀ";
const fourByteCharacters = "𐀀𐔀𐨀𐿶𐀀𐔀𐨀𐿶𐀀𐔀𐨀𐿶";
// Used a random character generator for each of the different byte-lengths of characters for the mixed tests
const mixedByteSizes = "Yח䙶𫶾eԚ疿𱺿]߉ꗫ𢆤*ɉ貸𪡑";
expect(StringUtils.substringUtf(ascii, 0, 5)).toBe("!,n~!");
expect(StringUtils.substringUtf(twoByteCharacters, 0, 5)).toBe("Āā߿܀Ā");
expect(StringUtils.substringUtf(threeByteCharacters, 0, 5)).toBe("ࠀ倀ꀀࠀ");
expect(StringUtils.substringUtf(fourByteCharacters, 0, 5)).toBe("𐀀𐔀𐨀𐿶𐀀");
expect(StringUtils.substringUtf(mixedByteSizes, 0, 5)).toBe("Yח䙶𫶾e");
expect(StringUtils.substringUtf(mixedByteSizes, 4, 4)).toBe("eԚ疿𱺿");
});
}); });
}); });

View File

@@ -17,3 +17,58 @@ export function endsWith(stringToTest: string, suffix: string): boolean {
export function startsWith(stringToTest: string, prefix: string): boolean { export function startsWith(stringToTest: string, prefix: string): boolean {
return stringToTest.indexOf(prefix) === 0; return stringToTest.indexOf(prefix) === 0;
} }
/**
 * Returns a run of characters from a string, counting whole Unicode code points rather than
 * UTF-16 code units, so characters that need four bytes in UTF-8 (a surrogate pair in a
 * JavaScript string) count as one character and are never split in half.
 *
 * Out-of-range requests are clamped instead of throwing: asking for more characters than the
 * text contains returns whatever is available, and a non-positive `numChars` returns "".
 *
 * @param text The text from which to return the subset
 * @param startChar The zero-based index of the first character to return; values below 0 are treated as 0
 * @param numChars The number of characters to return, counted from `startChar`
 * @returns The resulting slice of characters
 */
export const substringUtf = (text: string, startChar: number, numChars: number): string => {
  const firstChar = Math.max(0, startChar);
  // Exclusive end index, computed from the caller's startChar so a negative start still
  // consumes part of the requested count.
  const endChar = startChar + numChars;

  let result = "";
  let charIndex = 0;
  // The string iterator yields one full code point per step (surrogate pairs stay together),
  // which removes the need to encode to UTF-8 and inspect lead bytes by hand.
  for (const char of text) {
    if (charIndex >= endChar) {
      // Stop early so only the needed prefix of the text is walked.
      break;
    }
    if (charIndex >= firstChar) {
      result += char;
    }
    charIndex++;
  }
  return result;
};