Fix document analysis: switch to unpdf + mammoth for PDF/Word parsing

pdf-parse v2 requires DOMMatrix (a browser API), so it fails in Node.js. Replaced it with unpdf (a serverless PDF.js build) for PDFs and mammoth for Word .docx files. Also fixed the same broken pdf-parse usage in file-content-extractor.ts, which is used by AI filtering.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 10:27:36 +01:00
parent c9640c6086
commit ed5e782f61
4 changed files with 298 additions and 26 deletions
--- a/src/server/services/file-content-extractor.ts
+++ b/src/server/services/file-content-extractor.ts
@@ -15,6 +15,8 @@ import { getStorageProvider } from '@/lib/storage'
 const MAX_TEXT_PER_FILE = 50_000 // ~50KB of text per file
 const PARSEABLE_MIME_TYPES = [
  'application/pdf',
+  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+  'application/msword',
  'text/plain',
  'text/csv',
  'text/markdown',
@@ -57,11 +59,17 @@ export async function extractFileContent(
    let text: string

    if (mimeType === 'application/pdf') {
-      // Dynamic import to avoid loading pdf-parse when not needed
-      const pdfParseModule = await import('pdf-parse')
-      const pdfParse = typeof pdfParseModule === 'function' ? pdfParseModule : (pdfParseModule as any).default ?? pdfParseModule
-      const pdf = await pdfParse(buffer)
-      text = pdf.text
+      const { extractText, getDocumentProxy } = await import('unpdf')
+      const pdf = await getDocumentProxy(new Uint8Array(buffer))
+      const result = await extractText(pdf, { mergePages: true })
+      text = result.text as string
+    } else if (
+      mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
+      mimeType === 'application/msword'
+    ) {
+      const mammoth = await import('mammoth')
+      const result = await mammoth.extractRawText({ buffer })
+      text = result.value
    } else {
      // Text-based files
      text = buffer.toString('utf-8')