fix: sanitize null bytes before persisting in db [LET-6710] (#8147)

fix: sanitize null bytes before persisting in db
This commit is contained in:
cthomas
2025-12-30 10:02:37 -08:00
committed by Caren Thomas
parent c192600e8c
commit b83e7c1cf9

View File

@@ -167,9 +167,15 @@ class PassageManager:
if np_embedding.shape[0] != MAX_EMBEDDING_DIM:
embedding = np.pad(np_embedding, (0, MAX_EMBEDDING_DIM - np_embedding.shape[0]), mode="constant").tolist()
# Sanitize text to remove null bytes which PostgreSQL rejects
text = data["text"]
if text and "\x00" in text:
text = text.replace("\x00", "")
logger.warning(f"Removed null bytes from passage text (length: {len(data['text'])} -> {len(text)})")
common_fields = {
"id": data.get("id"),
"text": data["text"],
"text": text,
"embedding": embedding,
"embedding_config": data["embedding_config"],
"organization_id": data["organization_id"],
@@ -228,9 +234,15 @@ class PassageManager:
if np_embedding.shape[0] != MAX_EMBEDDING_DIM:
embedding = np.pad(np_embedding, (0, MAX_EMBEDDING_DIM - np_embedding.shape[0]), mode="constant").tolist()
# Sanitize text to remove null bytes which PostgreSQL rejects
text = data["text"]
if text and "\x00" in text:
text = text.replace("\x00", "")
logger.warning(f"Removed null bytes from passage text (length: {len(data['text'])} -> {len(text)})")
common_fields = {
"id": data.get("id"),
"text": data["text"],
"text": text,
"embedding": embedding,
"embedding_config": data["embedding_config"],
"organization_id": data["organization_id"],