fix(cli): smarter mojibake detection to preserve valid Unicode (#764)

Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
Charles Packer
2026-01-31 20:12:48 -08:00
committed by GitHub
parent 639c3ff49d
commit f1408a3ce1

View File

@@ -4,6 +4,56 @@ import type { ReactNode } from "react";
// True when the `Bun` global exists, i.e. the code is running under the Bun runtime.
const isBun = typeof Bun !== "undefined";
// Shared UTF-8 decoder. With `fatal: false`, malformed byte sequences are
// decoded leniently (replacement characters) instead of throwing.
const decoder = new TextDecoder("utf-8", { fatal: false });
/**
 * Tells whether `byte` falls in the UTF-8 continuation-byte range.
 *
 * @param byte - Numeric value to classify (typically a code unit read with `charCodeAt`).
 * @returns `true` only for values from 0x80 through 0xBF inclusive.
 */
function isContinuationByte(byte: number): boolean {
  // Anything below the range (ASCII) is rejected up front.
  if (byte < 0x80) return false;
  // What remains qualifies only if it does not exceed the upper bound.
  return byte <= 0xbf;
}
/**
 * Heuristically decides whether `value` is UTF-8 text whose individual bytes
 * were each stored as a separate UTF-16 code unit ("mojibake"), so that
 * re-reading the code units as bytes and decoding them as UTF-8 would recover
 * the intended text.
 *
 * The check is strict: every non-ASCII code unit must belong to a well-formed
 * UTF-8 sequence. If any code unit in 0x80–0xFF is left over (a stray
 * continuation byte, an invalid or truncated lead byte, an overlong form or an
 * encoded surrogate), the string is reported as NOT mojibake, because a lenient
 * byte-wise decode would replace those characters with U+FFFD and destroy
 * legitimate Latin-1 characters such as a real "é".
 *
 * @param value - String to inspect.
 * @returns `true` only when all code units are <= 0xFF, at least one multi-byte
 *   UTF-8 sequence is present, and no non-ASCII code unit lies outside such a
 *   sequence; `false` otherwise (including for pure-ASCII and empty strings).
 */
function looksLikeMojibake(value: string): boolean {
  const length = value.length;
  let sawUtf8Sequence = false;
  let i = 0;

  while (i < length) {
    const byte = value.charCodeAt(i);

    // If any code unit is outside byte range, it's real Unicode already.
    if (byte > 0xff) return false;

    // Plain ASCII is valid in both interpretations; it carries no signal.
    if (byte < 0x80) {
      i += 1;
      continue;
    }

    // Classify the lead byte: how many continuation bytes must follow, and
    // which range the first of them is restricted to (this rules out overlong
    // encodings, surrogates and code points above U+10FFFF).
    let trailing: number;
    let minSecond = 0x80;
    let maxSecond = 0xbf;
    if (byte >= 0xc2 && byte <= 0xdf) {
      trailing = 1;
    } else if (byte >= 0xe0 && byte <= 0xef) {
      trailing = 2;
      if (byte === 0xe0) minSecond = 0xa0; // below this would be overlong
      else if (byte === 0xed) maxSecond = 0x9f; // above this encodes a surrogate
    } else if (byte >= 0xf0 && byte <= 0xf4) {
      trailing = 3;
      if (byte === 0xf0) minSecond = 0x90; // below this would be overlong
      else if (byte === 0xf4) maxSecond = 0x8f; // above this exceeds U+10FFFF
    } else {
      // Stray continuation byte (0x80–0xBF) or a byte that can never start a
      // UTF-8 sequence (0xC0, 0xC1, 0xF5–0xFF): this is genuine Latin-1 text.
      return false;
    }

    // The sequence must fit entirely inside the string.
    if (i + trailing >= length) return false;

    const second = value.charCodeAt(i + 1);
    if (second < minSecond || second > maxSecond) return false;
    for (let offset = 1; offset <= trailing; offset++) {
      if (!isContinuationByte(value.charCodeAt(i + offset))) return false;
    }

    sawUtf8Sequence = true;
    i += trailing + 1;
  }

  return sawUtf8Sequence;
}
function fixBunEncoding(value: ReactNode): ReactNode {
if (!isBun) return value;
@@ -11,25 +61,13 @@ function fixBunEncoding(value: ReactNode): ReactNode {
// Quick check: if no non-ASCII characters, return as-is
if (!/[\x80-\xFF]/.test(value)) return value;
const bytes: number[] = [];
if (!looksLikeMojibake(value)) return value;
const bytes = new Uint8Array(value.length);
for (let i = 0; i < value.length; i++) {
const code = value.charCodeAt(i);
// Check for 2-byte UTF-8 sequence: 0xC2 followed by 0x80-0xBF
if (code === 0xc2 && i + 1 < value.length) {
const nextCode = value.charCodeAt(i + 1);
if (nextCode >= 0x80 && nextCode <= 0xbf) {
bytes.push(0xc2, nextCode);
i++;
continue;
}
}
bytes.push(code);
bytes[i] = value.charCodeAt(i);
}
return decoder.decode(new Uint8Array(bytes));
return decoder.decode(bytes);
}
// Handle arrays of children