From d80043c4aa2758a6d92894d9adaa39331e3167c0 Mon Sep 17 00:00:00 2001
From: Matt <matt@Matt-Surface.local>
Date: Tue, 17 Feb 2026 11:34:05 +0100
Subject: [PATCH] Strip null bytes from extracted text to fix PostgreSQL UTF-8
 errors

Some PDFs contain \x00 null bytes in their text which PostgreSQL rejects
with "invalid byte sequence for encoding UTF8: 0x00". Sanitize extracted
text in both document-analyzer and file-content-extractor services.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/server/services/document-analyzer.ts      | 13 ++++++++++---
 src/server/services/file-content-extractor.ts |  6 +++---
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/server/services/document-analyzer.ts b/src/server/services/document-analyzer.ts
index 55c0ef1..1913ae3 100644
--- a/src/server/services/document-analyzer.ts
+++ b/src/server/services/document-analyzer.ts
@@ -32,6 +32,13 @@ function isAnalyzableMimeType(mimeType: string): boolean {
   return ANALYZABLE_MIME_TYPES.some((t) => mimeType.startsWith(t))
 }
 
+/**
+ * Returns `text` with every NUL (\x00) character removed; PostgreSQL rejects \x00 in UTF-8 text columns ("invalid byte sequence for encoding UTF8: 0x00").
+ */
+function sanitizeText(text: string): string {
+  return text.replace(/\0/g, '')
+}
+
 // ─── Types ──────────────────────────────────────────────────────────────────
 
 export type AnalysisResult = {
@@ -124,7 +131,7 @@ export async function analyzeFileContent(
 
     if (mimeType === 'application/pdf') {
       const parsed = await parsePdf(buffer)
-      text = parsed.text
+      text = sanitizeText(parsed.text)
       pageCount = parsed.pageCount
     } else if (
       mimeType ===
@@ -132,10 +139,10 @@ export async function analyzeFileContent(
       mimeType === 'application/msword'
     ) {
       const parsed = await parseDocx(buffer)
-      text = parsed.text
+      text = sanitizeText(parsed.text)
     } else {
       // Text-based files (plain text, CSV, markdown, HTML, RTF)
-      text = buffer.toString('utf-8')
+      text = sanitizeText(buffer.toString('utf-8'))
     }
 
     result.pageCount = pageCount
diff --git a/src/server/services/file-content-extractor.ts b/src/server/services/file-content-extractor.ts
index 623ea3f..816e370 100644
--- a/src/server/services/file-content-extractor.ts
+++ b/src/server/services/file-content-extractor.ts
@@ -62,17 +62,17 @@ export async function extractFileContent(
       const { extractText, getDocumentProxy } = await import('unpdf')
       const pdf = await getDocumentProxy(new Uint8Array(buffer))
       const result = await extractText(pdf, { mergePages: true })
-      text = result.text as string
+      text = (result.text as string).replace(/\0/g, '')
     } else if (
       mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
       mimeType === 'application/msword'
     ) {
       const mammoth = await import('mammoth')
       const result = await mammoth.extractRawText({ buffer })
-      text = result.value
+      text = result.value.replace(/\0/g, '')
     } else {
       // Text-based files
-      text = buffer.toString('utf-8')
+      text = buffer.toString('utf-8').replace(/\0/g, '')
     }
 
     // Truncate to limit