Strip null bytes from extracted text to fix PostgreSQL UTF-8 errors
Some checks failed
Build and Push Docker Image / build (push) Has been cancelled
Some checks failed
Build and Push Docker Image / build (push) Has been cancelled
Some PDFs contain \x00 null bytes in their text, which PostgreSQL rejects with "invalid byte sequence for encoding UTF8: 0x00". Sanitize extracted text in both the document-analyzer and file-content-extractor services.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -32,6 +32,13 @@ function isAnalyzableMimeType(mimeType: string): boolean {
|
|||||||
return ANALYZABLE_MIME_TYPES.some((t) => mimeType.startsWith(t))
|
return ANALYZABLE_MIME_TYPES.some((t) => mimeType.startsWith(t))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Strip null bytes from extracted text — PostgreSQL rejects \x00 in UTF-8 text columns.
*
* Every NUL (U+0000) character in the input is removed; all other characters
* are left unchanged.
*
* @param text - Extracted text that may contain NUL characters.
* @returns The input text with every NUL character removed.
|
||||||
|
*/
|
||||||
|
function sanitizeText(text: string): string {
|
||||||
|
// The `g` flag makes replace() drop every NUL occurrence, not just the first one.
return text.replace(/\0/g, '')
|
||||||
|
}
|
||||||
|
|
||||||
// ─── Types ──────────────────────────────────────────────────────────────────
|
// ─── Types ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
export type AnalysisResult = {
|
export type AnalysisResult = {
|
||||||
@@ -124,7 +131,7 @@ export async function analyzeFileContent(
|
|||||||
|
|
||||||
if (mimeType === 'application/pdf') {
|
if (mimeType === 'application/pdf') {
|
||||||
const parsed = await parsePdf(buffer)
|
const parsed = await parsePdf(buffer)
|
||||||
text = parsed.text
|
text = sanitizeText(parsed.text)
|
||||||
pageCount = parsed.pageCount
|
pageCount = parsed.pageCount
|
||||||
} else if (
|
} else if (
|
||||||
mimeType ===
|
mimeType ===
|
||||||
@@ -132,10 +139,10 @@ export async function analyzeFileContent(
|
|||||||
mimeType === 'application/msword'
|
mimeType === 'application/msword'
|
||||||
) {
|
) {
|
||||||
const parsed = await parseDocx(buffer)
|
const parsed = await parseDocx(buffer)
|
||||||
text = parsed.text
|
text = sanitizeText(parsed.text)
|
||||||
} else {
|
} else {
|
||||||
// Text-based files (plain text, CSV, markdown, HTML, RTF)
|
// Text-based files (plain text, CSV, markdown, HTML, RTF)
|
||||||
text = buffer.toString('utf-8')
|
text = sanitizeText(buffer.toString('utf-8'))
|
||||||
}
|
}
|
||||||
|
|
||||||
result.pageCount = pageCount
|
result.pageCount = pageCount
|
||||||
|
|||||||
@@ -62,17 +62,17 @@ export async function extractFileContent(
|
|||||||
const { extractText, getDocumentProxy } = await import('unpdf')
|
const { extractText, getDocumentProxy } = await import('unpdf')
|
||||||
const pdf = await getDocumentProxy(new Uint8Array(buffer))
|
const pdf = await getDocumentProxy(new Uint8Array(buffer))
|
||||||
const result = await extractText(pdf, { mergePages: true })
|
const result = await extractText(pdf, { mergePages: true })
|
||||||
text = result.text as string
|
text = (result.text as string).replace(/\0/g, '')
|
||||||
} else if (
|
} else if (
|
||||||
mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
|
mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
|
||||||
mimeType === 'application/msword'
|
mimeType === 'application/msword'
|
||||||
) {
|
) {
|
||||||
const mammoth = await import('mammoth')
|
const mammoth = await import('mammoth')
|
||||||
const result = await mammoth.extractRawText({ buffer })
|
const result = await mammoth.extractRawText({ buffer })
|
||||||
text = result.value
|
text = result.value.replace(/\0/g, '')
|
||||||
} else {
|
} else {
|
||||||
// Text-based files
|
// Text-based files
|
||||||
text = buffer.toString('utf-8')
|
text = buffer.toString('utf-8').replace(/\0/g, '')
|
||||||
}
|
}
|
||||||
|
|
||||||
// Truncate to limit
|
// Truncate to limit
|
||||||
|
|||||||
Reference in New Issue
Block a user