Fixes an issue where tab titles were not truncating when characters used 4 bytes for encoding (#2254)

* Fixes an issue where tab titles were not truncating when characters used 4 bytes for encoding.
* Changed the substringUtf method to be more accurate and added comments.
2026-07-21 21:07:23 +01:00 · 2025-11-24 12:36:03 -08:00
parent 0fac59967a
commit 490309b403
3 changed files with 75 additions and 2 deletions
@@ -1,5 +1,6 @@
 import { OpenTab } from "Contracts/ActionContracts";
 import { KeyboardActionGroup, clearKeyboardActionGroup } from "KeyboardShortcuts";
 import { substringUtf } from "Utils/StringUtils";
 import * as ko from "knockout";
 import * as Constants from "../../Common/Constants";
 import * as ThemeUtility from "../../Common/ThemeUtility";
@@ -154,13 +155,13 @@ export default class TabsBase extends WaitsForTemplateViewModel {
    const db = this.database?.id();
    if (coll) {
      if (coll.length > 8) {
-        return coll.slice(0, 5) + "…" + options.title;
+        return substringUtf(coll, 0, 5) + "…" + options.title;
      } else {
        return coll + "." + options.title;
      }
    } else if (db) {
      if (db.length > 8) {
-        return db.slice(0, 5) + "…" + options.title;
+        return substringUtf(db, 0, 5) + "…" + options.title;
      } else {
        return db + "." + options.title;
      }
@@ -26,5 +26,22 @@ describe("StringUtils", () => {
      const transformedString: string | undefined = StringUtils.stripSpacesFromString("");
      expect(transformedString).toBe("");
    });
    it("should return the right number of characters regardless of bytes used per character", () => {
      // Each sample repeats characters drawn from across the code point range for one UTF-8 encoded length
      const ascii = "!,n~!,n~!,n~";
      const twoByteCharacters = "Āā߿܀Āā߿܀Āā߿܀";
      const threeByteCharacters = "ࠀ倀ꀀࠀ倀ꀀࠀ倀ꀀ";
      const fourByteCharacters = "𐀀𐔀𐨀𐿶𐀀𐔀𐨀𐿶𐀀𐔀𐨀𐿶";
      // Randomly generated characters that cycle through 1-, 2-, 3- and 4-byte encodings
      const mixedByteSizes = "Yח䙶𫶾eԚ疿𱺿]߉ꗫ𢆤*ɉ貸𪡑";
      // Asking for five characters must yield exactly five, whatever their encoded width
      expect(StringUtils.substringUtf(ascii, 0, 5)).toBe("!,n~!");
      expect(StringUtils.substringUtf(twoByteCharacters, 0, 5)).toBe("Āā߿܀Ā");
      expect(StringUtils.substringUtf(threeByteCharacters, 0, 5)).toBe("ࠀ倀ꀀࠀ倀");
      expect(StringUtils.substringUtf(fourByteCharacters, 0, 5)).toBe("𐀀𐔀𐨀𐿶𐀀");
      expect(StringUtils.substringUtf(mixedByteSizes, 0, 5)).toBe("Yח䙶𫶾e");
      // A non-zero start offset is counted in characters as well, not in bytes or UTF-16 code units
      expect(StringUtils.substringUtf(mixedByteSizes, 4, 4)).toBe("eԚ疿𱺿");
    });
  });
 });
@@ -17,3 +17,58 @@ export function endsWith(stringToTest: string, suffix: string): boolean {
 /**
 * Reports whether a string begins with the given prefix.
 * @param stringToTest The string to inspect
 * @param prefix The text expected at the very start of @param stringToTest
 * @returns true when @param prefix occurs at index 0 of @param stringToTest (an empty prefix always matches)
 */
 export function startsWith(stringToTest: string, prefix: string): boolean {
  // Searching backwards from index 0 leaves the start of the string as the only possible match position
  const matchIndex = stringToTest.lastIndexOf(prefix, 0);
  return matchIndex === 0;
 }
 /**
 * Returns a run of characters from a string, counting whole Unicode code points rather than UTF-16 code units.
 * A character that needs four bytes in UTF-8 (a surrogate pair in a JavaScript string) therefore counts as one
 * character and is never cut in half.
 *
 * The requested window is clamped to the text, in the same spirit as String.prototype.slice: asking for more
 * characters than are available returns what exists, and a window that lies entirely outside the text (or a
 * @param numChars that is zero, negative or NaN) returns an empty string. Nothing is thrown.
 * @param text The text from which to return the subset
 * @param startChar The starting character from @param text (zero-based)
 * @param numChars The number of characters to return starting from @param startChar
 * @returns The resulting slice of characters
 */
 export const substringUtf = (text: string, startChar: number, numChars: number): string => {
  // Written as a negated comparison so that NaN is rejected along with zero and negative counts
  if (!(numChars > 0)) {
    return "";
  }
  // The window is [startChar, endChar) in code point positions; a negative startChar simply shortens it
  const endChar = startChar + numChars;
  let result = "";
  let charIndex = 0;
  // Iterating a string yields one full code point per step, so surrogate pairs stay together
  for (const char of text) {
    if (charIndex >= endChar) {
      break;
    }
    if (charIndex >= startChar) {
      result += char;
    }
    charIndex++;
  }
  return result;
 };