Fix document analysis: switch to unpdf + mammoth for PDF/Word parsing
All checks were successful
Build and Push Docker Image / build (push) Successful in 11m26s

pdf-parse v2 requires DOMMatrix (browser API) which fails in Node.js.
Replaced with unpdf (serverless PDF.js build) for PDFs and mammoth for
Word .docx files. Also fixed the same broken pdf-parse usage in
file-content-extractor.ts used by AI filtering.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt
2026-02-17 10:27:36 +01:00
parent c9640c6086
commit ed5e782f61
4 changed files with 298 additions and 26 deletions

View File

@@ -6,17 +6,32 @@
* - Text preview (first ~2000 chars)
* - Language detection via franc
*
* Supports: PDF (via unpdf), Word .docx (via mammoth), plain text files.
* Runs optionally on upload (controlled by SystemSettings) and
* retroactively via admin endpoint.
*/
import { getStorageProvider } from '@/lib/storage'
import { isParseableMimeType } from './file-content-extractor'
import { prisma } from '@/lib/prisma'
// Maximum characters stored as the text preview for a file.
const TEXT_PREVIEW_LIMIT = 2000
// Number of files processed concurrently per batch during bulk analysis.
const BATCH_SIZE = 10
// MIME types we know how to extract text from. Matching is by prefix (see
// isAnalyzableMimeType), so variants carrying parameters such as
// "text/plain; charset=utf-8" are also accepted.
const ANALYZABLE_MIME_TYPES = [
  'application/pdf',
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // .docx
  'application/msword', // .doc (limited support)
  'text/plain',
  'text/csv',
  'text/markdown',
  'text/html',
  'application/rtf',
]

/**
 * Whether a file with the given MIME type can be content-analyzed.
 * Prefix match so parameterized types (e.g. charset suffixes) still qualify.
 */
function isAnalyzableMimeType(mimeType: string): boolean {
  for (const candidate of ANALYZABLE_MIME_TYPES) {
    if (mimeType.startsWith(candidate)) {
      return true
    }
  }
  return false
}
// ─── Types ──────────────────────────────────────────────────────────────────
export type AnalysisResult = {
@@ -32,8 +47,6 @@ export type AnalysisResult = {
/**
* Detect language using franc. Returns ISO 639-3 code and confidence.
* franc returns a distance-based score where lower = better match.
* We convert to 0-1 confidence where 1 = perfect match.
*/
async function detectLanguage(
text: string
@@ -42,7 +55,6 @@ async function detectLanguage(
return { lang: 'und', confidence: 0 }
}
// Use a reasonable sample for detection (first 5000 chars)
const sample = text.slice(0, 5000)
const { francAll } = await import('franc')
@@ -53,15 +65,31 @@ async function detectLanguage(
}
const topLang = results[0][0]
const topScore = results[0][1] // 1.0 = best match, 0.0 = worst
// franc scores: 1.0 is best match, scale drops from there
// Convert to a 0-1 confidence
const topScore = results[0][1]
const confidence = Math.max(0, Math.min(1, topScore))
return { lang: topLang, confidence: Math.round(confidence * 100) / 100 }
}
// ─── Parsers ─────────────────────────────────────────────────────────────────
/**
 * Extract text from a PDF buffer using unpdf (serverless PDF.js build,
 * works in Node.js without browser APIs like DOMMatrix).
 *
 * @param buffer - Raw PDF file contents.
 * @returns The merged text of all pages and the total page count.
 */
async function parsePdf(
  buffer: Buffer
): Promise<{ text: string; pageCount: number }> {
  // Dynamic import keeps unpdf out of the bundle until a PDF is actually parsed.
  const unpdf = await import('unpdf')
  const documentProxy = await unpdf.getDocumentProxy(new Uint8Array(buffer))
  const extracted = await unpdf.extractText(documentProxy, { mergePages: true })
  // With mergePages: true, unpdf returns a single string; the cast narrows
  // the string | string[] union in its signature.
  return { text: extracted.text as string, pageCount: extracted.totalPages }
}
/**
 * Extract raw text from a Word .docx buffer via mammoth.
 * All formatting is discarded; only the plain text content is returned.
 *
 * @param buffer - Raw .docx file contents.
 * @returns The extracted plain text.
 */
async function parseDocx(
  buffer: Buffer
): Promise<{ text: string }> {
  // Dynamic import so mammoth is only loaded when a Word file is analyzed.
  const { extractRawText } = await import('mammoth')
  const extraction = await extractRawText({ buffer })
  return { text: extraction.value }
}
// ─── Core Analysis ──────────────────────────────────────────────────────────
/**
@@ -83,7 +111,7 @@ export async function analyzeFileContent(
langConfidence: null,
}
if (!isParseableMimeType(mimeType)) {
if (!isAnalyzableMimeType(mimeType)) {
return { ...result, error: 'Unsupported mime type for analysis' }
}
@@ -95,14 +123,16 @@ export async function analyzeFileContent(
let pageCount: number | null = null
if (mimeType === 'application/pdf') {
const pdfParseModule = await import('pdf-parse')
const pdfParse =
typeof pdfParseModule === 'function'
? pdfParseModule
: (pdfParseModule as any).default ?? pdfParseModule
const pdf = await pdfParse(buffer)
text = pdf.text || ''
pageCount = pdf.numpages ?? null
const parsed = await parsePdf(buffer)
text = parsed.text
pageCount = parsed.pageCount
} else if (
mimeType ===
'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
mimeType === 'application/msword'
) {
const parsed = await parseDocx(buffer)
text = parsed.text
} else {
// Text-based files (plain text, CSV, markdown, HTML, RTF)
text = buffer.toString('utf-8')
@@ -227,8 +257,8 @@ export async function analyzeProjectFiles(
const batch = files.slice(i, i + BATCH_SIZE)
const results = await Promise.allSettled(
batch.map(async (file) => {
if (!isParseableMimeType(file.mimeType)) {
// Mark non-parseable files as analyzed with no data
if (!isAnalyzableMimeType(file.mimeType)) {
// Mark non-analyzable files as analyzed with no data
await prisma.projectFile.update({
where: { id: file.id },
data: { analyzedAt: new Date() },
@@ -302,7 +332,7 @@ export async function analyzeAllUnanalyzed(): Promise<{
const batch = files.slice(i, i + BATCH_SIZE)
const results = await Promise.allSettled(
batch.map(async (file) => {
if (!isParseableMimeType(file.mimeType)) {
if (!isAnalyzableMimeType(file.mimeType)) {
await prisma.projectFile.update({
where: { id: file.id },
data: { analyzedAt: new Date() },
@@ -359,7 +389,6 @@ export async function isAutoAnalysisEnabled(): Promise<boolean> {
const setting = await prisma.systemSettings.findUnique({
where: { key: 'file_analysis_auto_enabled' },
})
// Default to true if setting doesn't exist
return setting?.value !== 'false'
} catch {
return true

View File

@@ -15,6 +15,8 @@ import { getStorageProvider } from '@/lib/storage'
const MAX_TEXT_PER_FILE = 50_000 // ~50KB of text per file
const PARSEABLE_MIME_TYPES = [
'application/pdf',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/msword',
'text/plain',
'text/csv',
'text/markdown',
@@ -57,11 +59,17 @@ export async function extractFileContent(
let text: string
if (mimeType === 'application/pdf') {
// Dynamic import to avoid loading pdf-parse when not needed
const pdfParseModule = await import('pdf-parse')
const pdfParse = typeof pdfParseModule === 'function' ? pdfParseModule : (pdfParseModule as any).default ?? pdfParseModule
const pdf = await pdfParse(buffer)
text = pdf.text
const { extractText, getDocumentProxy } = await import('unpdf')
const pdf = await getDocumentProxy(new Uint8Array(buffer))
const result = await extractText(pdf, { mergePages: true })
text = result.text as string
} else if (
mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
mimeType === 'application/msword'
) {
const mammoth = await import('mammoth')
const result = await mammoth.extractRawText({ buffer })
text = result.value
} else {
// Text-based files
text = buffer.toString('utf-8')