Strip null bytes from extracted text to fix PostgreSQL UTF-8 errors

Some PDFs contain \x00 null bytes in their extracted text, which PostgreSQL rejects with "invalid byte sequence for encoding UTF8: 0x00". Sanitize the extracted text in both the document-analyzer and file-content-extractor services.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 11:34:05 +01:00
parent 1a0525c108
commit d80043c4aa
2 changed files with 13 additions and 6 deletions
--- a/src/server/services/file-content-extractor.ts
+++ b/src/server/services/file-content-extractor.ts
@@ -62,17 +62,17 @@ export async function extractFileContent(
      const { extractText, getDocumentProxy } = await import('unpdf')
      const pdf = await getDocumentProxy(new Uint8Array(buffer))
      const result = await extractText(pdf, { mergePages: true })
-      text = result.text as string
+      text = (result.text as string).replace(/\0/g, '')
    } else if (
      mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
      mimeType === 'application/msword'
    ) {
      const mammoth = await import('mammoth')
      const result = await mammoth.extractRawText({ buffer })
-      text = result.value
+      text = result.value.replace(/\0/g, '')
    } else {
      // Text-based files
-      text = buffer.toString('utf-8')
+      text = buffer.toString('utf-8').replace(/\0/g, '')
    }

    // Truncate to limit