Strip null bytes from extracted text to fix PostgreSQL UTF-8 errors
Some checks failed
Build and Push Docker Image / build (push) Has been cancelled
Some checks failed
Build and Push Docker Image / build (push) Has been cancelled
Some PDFs contain null bytes (\x00) in their extracted text, which PostgreSQL rejects with "invalid byte sequence for encoding UTF8: 0x00". Sanitize the extracted text in both the document-analyzer and file-content-extractor services.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -62,17 +62,17 @@ export async function extractFileContent(
|
||||
const { extractText, getDocumentProxy } = await import('unpdf')
|
||||
const pdf = await getDocumentProxy(new Uint8Array(buffer))
|
||||
const result = await extractText(pdf, { mergePages: true })
|
||||
text = result.text as string
|
||||
text = (result.text as string).replace(/\0/g, '')
|
||||
} else if (
|
||||
mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
|
||||
mimeType === 'application/msword'
|
||||
) {
|
||||
const mammoth = await import('mammoth')
|
||||
const result = await mammoth.extractRawText({ buffer })
|
||||
text = result.value
|
||||
text = result.value.replace(/\0/g, '')
|
||||
} else {
|
||||
// Text-based files
|
||||
text = buffer.toString('utf-8')
|
||||
text = buffer.toString('utf-8').replace(/\0/g, '')
|
||||
}
|
||||
|
||||
// Truncate to limit
|
||||
|
||||
Reference in New Issue
Block a user