diff --git a/src/Explorer/Tabs/TabsBase.ts b/src/Explorer/Tabs/TabsBase.ts index 2b97fed3e..2602b672d 100644 --- a/src/Explorer/Tabs/TabsBase.ts +++ b/src/Explorer/Tabs/TabsBase.ts @@ -1,5 +1,6 @@ import { OpenTab } from "Contracts/ActionContracts"; import { KeyboardActionGroup, clearKeyboardActionGroup } from "KeyboardShortcuts"; +import { substringUtf } from "Utils/StringUtils"; import * as ko from "knockout"; import * as Constants from "../../Common/Constants"; import * as ThemeUtility from "../../Common/ThemeUtility"; @@ -154,13 +155,13 @@ export default class TabsBase extends WaitsForTemplateViewModel { const db = this.database?.id(); if (coll) { if (coll.length > 8) { - return coll.slice(0, 5) + "…" + options.title; + return substringUtf(coll, 0, 5) + "…" + options.title; } else { return coll + "." + options.title; } } else if (db) { if (db.length > 8) { - return db.slice(0, 5) + "…" + options.title; + return substringUtf(db, 0, 5) + "…" + options.title; } else { return db + "." + options.title; } diff --git a/src/Utils/StringUtils.test.ts b/src/Utils/StringUtils.test.ts index bd626f492..408556f07 100644 --- a/src/Utils/StringUtils.test.ts +++ b/src/Utils/StringUtils.test.ts @@ -26,5 +26,22 @@ describe("StringUtils", () => { const transformedString: string | undefined = StringUtils.stripSpacesFromString(""); expect(transformedString).toBe(""); }); + + it("should return the right number of characters regardless of bytes used per character", () => { + // Tried to use a sample of characters across the range for each of the individual byte lengths + const ascii = "!,n~!,n~!,n~"; + const twoByteCharacters = "Āā߿܀Āā߿܀Āā߿܀"; + const threeByteCharacters = "ࠀ倀ꀀࠀ倀ꀀࠀ倀ꀀ"; + const fourByteCharacters = "𐀀𐔀𐨀𐿶𐀀𐔀𐨀𐿶𐀀𐔀𐨀𐿶"; + // Used a random character generator for each of the different byte-lengths of characters for the mixed tests + const mixedByteSizes = "Yח䙶𫶾eԚ疿𱺿]߉ꗫ𢆤*ɉ貸𪡑"; + + expect(StringUtils.substringUtf(ascii, 0, 5)).toBe("!,n~!"); + 
expect(StringUtils.substringUtf(twoByteCharacters, 0, 5)).toBe("Āā߿܀Ā"); + expect(StringUtils.substringUtf(threeByteCharacters, 0, 5)).toBe("ࠀ倀ꀀࠀ"); + expect(StringUtils.substringUtf(fourByteCharacters, 0, 5)).toBe("𐀀𐔀𐨀𐿶𐀀"); + expect(StringUtils.substringUtf(mixedByteSizes, 0, 5)).toBe("Yח䙶𫶾e"); + expect(StringUtils.substringUtf(mixedByteSizes, 4, 4)).toBe("eԚ疿𱺿"); + }); }); }); diff --git a/src/Utils/StringUtils.ts b/src/Utils/StringUtils.ts index 02ceba5f2..3e0d68ba2 100644 --- a/src/Utils/StringUtils.ts +++ b/src/Utils/StringUtils.ts @@ -17,3 +17,58 @@ export function endsWith(stringToTest: string, suffix: string): boolean { export function startsWith(stringToTest: string, prefix: string): boolean { return stringToTest.indexOf(prefix) === 0; } + +/** + * Returns the input number of characters from a desired string but takes into account characters encoded with different byte sizes. + * @param text The text from which to return the subset + * @param startChar The starting character from @param text (zero-based) + * @param numChars The number of characters to return starting from @param startChar + * @returns The resulting slice of characters + */ +export const substringUtf = (text: string, startChar: number, numChars: number) => { + const encoded = new TextEncoder().encode(text); + + let currentChar = 0; + let currentByte = 0; + let startByte = 0; + for (; currentChar < startChar + numChars; ) { + if (currentChar === startChar) { + startByte = currentByte; + } + + /* + Unicode is utf encoded using 1, 2, 3, or 4 bytes + In a byte array, we know how many bytes the character is encoded based on the first byte because it + was developed such that the first byte's range never occurs in any other byte. Subsequent bytes are + always within 128 and 191. 
So in binary it breaks down like this: + 1 byte: 0xxxxxxx + 2 bytes: 110xxxxx 10xxxxxx + 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx + 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + */ + switch (true) { + // The original ASCII set is between 0 (00000000) and 127 (01111111) and those only take up one byte + case encoded[currentByte] >= 0 && encoded[currentByte] <= 127: + currentByte++; + break; + // But if the first byte is within 192 (11000000) and 223 (11011111) then we know the character is two bytes: + case encoded[currentByte] >= 192 && encoded[currentByte] <= 223: + currentByte = currentByte + 2; + break; + // If the first byte is anything within 224 (11100000) and 239 (11101111) then the character is three bytes + case encoded[currentByte] >= 224 && encoded[currentByte] <= 239: + currentByte = currentByte + 3; + break; + // If the first byte is anything within 240 (11110000) and 247 (11110111) then the character is four bytes + case encoded[currentByte] >= 240 && encoded[currentByte] <= 247: + currentByte = currentByte + 4; + break; + // Anything else is an error for now + default: + throw new Error("Unrecognized character"); + } + currentChar++; + } + + return new TextDecoder().decode(encoded.slice(startByte, currentByte)); +};