From f1408a3ce17d4f86008540c48a6edd8d315210e6 Mon Sep 17 00:00:00 2001 From: Charles Packer Date: Sat, 31 Jan 2026 20:12:48 -0800 Subject: [PATCH] fix(cli): smarter mojibake detection to preserve valid Unicode (#764) Co-authored-by: Letta --- src/cli/components/Text.tsx | 70 ++++++++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 16 deletions(-) diff --git a/src/cli/components/Text.tsx b/src/cli/components/Text.tsx index aa125d2..0fd0ebe 100644 --- a/src/cli/components/Text.tsx +++ b/src/cli/components/Text.tsx @@ -4,6 +4,56 @@ import type { ReactNode } from "react"; const isBun = typeof Bun !== "undefined"; const decoder = new TextDecoder("utf-8", { fatal: false }); +function isContinuationByte(byte: number): boolean { + return byte >= 0x80 && byte <= 0xbf; +} + +function looksLikeMojibake(value: string): boolean { + let sawUtf8Sequence = false; + + for (let i = 0; i < value.length; i++) { + const byte = value.charCodeAt(i); + + // If any code unit is outside byte range, it's real Unicode already. + if (byte > 0xff) return false; + + if (byte >= 0xc2 && byte <= 0xdf) { + if (i + 1 < value.length && isContinuationByte(value.charCodeAt(i + 1))) { + sawUtf8Sequence = true; + i += 1; + continue; + } + } + + if (byte >= 0xe0 && byte <= 0xef) { + if ( + i + 2 < value.length && + isContinuationByte(value.charCodeAt(i + 1)) && + isContinuationByte(value.charCodeAt(i + 2)) + ) { + sawUtf8Sequence = true; + i += 2; + continue; + } + } + + if (byte >= 0xf0 && byte <= 0xf4) { + if ( + i + 3 < value.length && + isContinuationByte(value.charCodeAt(i + 1)) && + isContinuationByte(value.charCodeAt(i + 2)) && + isContinuationByte(value.charCodeAt(i + 3)) + ) { + sawUtf8Sequence = true; + i += 3; + continue; + } + } + } + + return sawUtf8Sequence; +} + function fixBunEncoding(value: ReactNode): ReactNode { if (!isBun) return value; @@ -11,25 +61,13 @@ function fixBunEncoding(value: ReactNode): ReactNode { // Quick check: if no non-ASCII characters, return as-is if (!/[\x80-\xFF]/.test(value)) return value; - const bytes: number[] = []; + if (!looksLikeMojibake(value)) return value; + const bytes = new Uint8Array(value.length); for (let i = 0; i < value.length; i++) { - const code = value.charCodeAt(i); - - // Check for 2-byte UTF-8 sequence: 0xC2 followed by 0x80-0xBF - if (code === 0xc2 && i + 1 < value.length) { - const nextCode = value.charCodeAt(i + 1); - if (nextCode >= 0x80 && nextCode <= 0xbf) { - bytes.push(0xc2, nextCode); - i++; - continue; - } - } - - bytes.push(code); + bytes[i] = value.charCodeAt(i); } - - return decoder.decode(new Uint8Array(bytes)); + return decoder.decode(bytes); } // Handle arrays of children