Strip null bytes from extracted text to fix PostgreSQL UTF-8 errors

Some PDFs contain \x00 null bytes in their text, which PostgreSQL rejects with "invalid byte sequence for encoding UTF8: 0x00". Sanitize extracted text in both the document-analyzer and file-content-extractor services. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 11:34:05 +01:00
parent 1a0525c108
commit d80043c4aa
2 changed files with 13 additions and 6 deletions
--- a/src/server/services/document-analyzer.ts
+++ b/src/server/services/document-analyzer.ts
@@ -32,6 +32,13 @@ function isAnalyzableMimeType(mimeType: string): boolean {
  return ANALYZABLE_MIME_TYPES.some((t) => mimeType.startsWith(t))
 }

+/**
+ * Return `text` with every null character (\x00) removed — PostgreSQL rejects \x00 in UTF-8 text columns.
+ */
+function sanitizeText(text: string): string {
+  return text.replace(/\0/g, '')
+}
+
 // ─── Types ──────────────────────────────────────────────────────────────────

 export type AnalysisResult = {
@@ -124,7 +131,7 @@ export async function analyzeFileContent(

    if (mimeType === 'application/pdf') {
      const parsed = await parsePdf(buffer)
-      text = parsed.text
+      text = sanitizeText(parsed.text)
      pageCount = parsed.pageCount
    } else if (
      mimeType ===
@@ -132,10 +139,10 @@ export async function analyzeFileContent(
      mimeType === 'application/msword'
    ) {
      const parsed = await parseDocx(buffer)
-      text = parsed.text
+      text = sanitizeText(parsed.text)
    } else {
      // Text-based files (plain text, CSV, markdown, HTML, RTF)
-      text = buffer.toString('utf-8')
+      text = sanitizeText(buffer.toString('utf-8'))
    }

    result.pageCount = pageCount