From d80043c4aa2758a6d92894d9adaa39331e3167c0 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 17 Feb 2026 11:34:05 +0100 Subject: [PATCH] Strip null bytes from extracted text to fix PostgreSQL UTF-8 errors Some PDFs contain \x00 null bytes in their text, which PostgreSQL rejects with 'invalid byte sequence for encoding "UTF8": 0x00'. Sanitize extracted text in both the document-analyzer and file-content-extractor services. Co-Authored-By: Claude Opus 4.6 --- src/server/services/document-analyzer.ts | 13 ++++++++++--- src/server/services/file-content-extractor.ts | 6 +++--- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/server/services/document-analyzer.ts b/src/server/services/document-analyzer.ts index 55c0ef1..1913ae3 100644 --- a/src/server/services/document-analyzer.ts +++ b/src/server/services/document-analyzer.ts @@ -32,6 +32,13 @@ function isAnalyzableMimeType(mimeType: string): boolean { return ANALYZABLE_MIME_TYPES.some((t) => mimeType.startsWith(t)) } +/** + * Strip null bytes from extracted text — PostgreSQL rejects \x00 in UTF-8 text columns. 
+ */ +function sanitizeText(text: string): string { + return text.replace(/\0/g, '') +} + // ─── Types ────────────────────────────────────────────────────────────────── export type AnalysisResult = { @@ -124,7 +131,7 @@ export async function analyzeFileContent( if (mimeType === 'application/pdf') { const parsed = await parsePdf(buffer) - text = parsed.text + text = sanitizeText(parsed.text) pageCount = parsed.pageCount } else if ( mimeType === @@ -132,10 +139,10 @@ export async function analyzeFileContent( mimeType === 'application/msword' ) { const parsed = await parseDocx(buffer) - text = parsed.text + text = sanitizeText(parsed.text) } else { // Text-based files (plain text, CSV, markdown, HTML, RTF) - text = buffer.toString('utf-8') + text = sanitizeText(buffer.toString('utf-8')) } result.pageCount = pageCount diff --git a/src/server/services/file-content-extractor.ts b/src/server/services/file-content-extractor.ts index 623ea3f..816e370 100644 --- a/src/server/services/file-content-extractor.ts +++ b/src/server/services/file-content-extractor.ts @@ -62,17 +62,17 @@ export async function extractFileContent( const { extractText, getDocumentProxy } = await import('unpdf') const pdf = await getDocumentProxy(new Uint8Array(buffer)) const result = await extractText(pdf, { mergePages: true }) - text = result.text as string + text = (result.text as string).replace(/\0/g, '') } else if ( mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' || mimeType === 'application/msword' ) { const mammoth = await import('mammoth') const result = await mammoth.extractRawText({ buffer }) - text = result.value + text = result.value.replace(/\0/g, '') } else { // Text-based files - text = buffer.toString('utf-8') + text = buffer.toString('utf-8').replace(/\0/g, '') } // Truncate to limit