Fix document analysis: switch to unpdf + mammoth for PDF/Word parsing
All checks were successful
Build and Push Docker Image / build (push) Successful in 11m26s

pdf-parse v2 requires DOMMatrix, a browser API that is unavailable in
Node.js, so it fails there. Replaced it with unpdf (a serverless PDF.js
build) for PDFs and mammoth for Word .docx files. Also fixed the same
broken pdf-parse usage in file-content-extractor.ts, which is used by
AI filtering.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt
2026-02-17 10:27:36 +01:00
parent c9640c6086
commit ed5e782f61
4 changed files with 298 additions and 26 deletions

View File

@@ -15,6 +15,8 @@ import { getStorageProvider } from '@/lib/storage'
const MAX_TEXT_PER_FILE = 50_000 // ~50KB of text per file
const PARSEABLE_MIME_TYPES = [
'application/pdf',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/msword',
'text/plain',
'text/csv',
'text/markdown',
@@ -57,11 +59,17 @@ export async function extractFileContent(
let text: string
if (mimeType === 'application/pdf') {
// Dynamic import to avoid loading pdf-parse when not needed
const pdfParseModule = await import('pdf-parse')
const pdfParse = typeof pdfParseModule === 'function' ? pdfParseModule : (pdfParseModule as any).default ?? pdfParseModule
const pdf = await pdfParse(buffer)
text = pdf.text
const { extractText, getDocumentProxy } = await import('unpdf')
const pdf = await getDocumentProxy(new Uint8Array(buffer))
const result = await extractText(pdf, { mergePages: true })
text = result.text as string
} else if (
mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
mimeType === 'application/msword'
) {
const mammoth = await import('mammoth')
const result = await mammoth.extractRawText({ buffer })
text = result.value
} else {
// Text-based files
text = buffer.toString('utf-8')