Fix document analysis: switch to unpdf + mammoth for PDF/Word parsing
All checks were successful
Build and Push Docker Image / build (push) Successful in 11m26s

pdf-parse v2 requires DOMMatrix (browser API) which fails in Node.js.
Replaced with unpdf (serverless PDF.js build) for PDFs and mammoth for
Word .docx files. Also fixed the same broken pdf-parse usage in
file-content-extractor.ts used by AI filtering.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt
2026-02-17 10:27:36 +01:00
parent c9640c6086
commit ed5e782f61
4 changed files with 298 additions and 26 deletions

View File

@@ -6,17 +6,32 @@
* - Text preview (first ~2000 chars)
* - Language detection via franc
*
* Supports: PDF (via unpdf), Word .docx (via mammoth), plain text files.
* Runs optionally on upload (controlled by SystemSettings) and
* retroactively via admin endpoint.
*/
import { getStorageProvider } from '@/lib/storage'
import { isParseableMimeType } from './file-content-extractor'
import { prisma } from '@/lib/prisma'
// Maximum characters stored as the text preview for a file.
const TEXT_PREVIEW_LIMIT = 2000
// Number of files processed concurrently per batch during bulk analysis.
const BATCH_SIZE = 10
// MIME types we know how to extract text from. Matching is by prefix (see
// isAnalyzableMimeType), so variants carrying parameters such as
// "text/plain; charset=utf-8" are also accepted.
const ANALYZABLE_MIME_TYPES = [
  'application/pdf',
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // .docx
  'application/msword', // .doc (limited support)
  'text/plain',
  'text/csv',
  'text/markdown',
  'text/html',
  'application/rtf',
]

/**
 * Whether a file with the given MIME type can be content-analyzed.
 * Prefix match so parameterized types (e.g. charset suffixes) still qualify.
 */
function isAnalyzableMimeType(mimeType: string): boolean {
  for (const candidate of ANALYZABLE_MIME_TYPES) {
    if (mimeType.startsWith(candidate)) {
      return true
    }
  }
  return false
}
// ─── Types ──────────────────────────────────────────────────────────────────
export type AnalysisResult = {
@@ -32,8 +47,6 @@ export type AnalysisResult = {
/**
* Detect language using franc. Returns ISO 639-3 code and confidence.
* franc returns a distance-based score where lower = better match.
* We convert to 0-1 confidence where 1 = perfect match.
*/
async function detectLanguage(
text: string
@@ -42,7 +55,6 @@ async function detectLanguage(
return { lang: 'und', confidence: 0 }
}
// Use a reasonable sample for detection (first 5000 chars)
const sample = text.slice(0, 5000)
const { francAll } = await import('franc')
@@ -53,15 +65,31 @@ async function detectLanguage(
}
const topLang = results[0][0]
const topScore = results[0][1] // 1.0 = best match, 0.0 = worst
// franc scores: 1.0 is best match, scale drops from there
// Convert to a 0-1 confidence
const topScore = results[0][1]
const confidence = Math.max(0, Math.min(1, topScore))
return { lang: topLang, confidence: Math.round(confidence * 100) / 100 }
}
// ─── Parsers ─────────────────────────────────────────────────────────────────
/**
 * Extract text from a PDF buffer using unpdf (serverless PDF.js build,
 * works in Node.js without browser APIs like DOMMatrix).
 *
 * @param buffer - Raw PDF file contents.
 * @returns The merged text of all pages and the total page count.
 */
async function parsePdf(
  buffer: Buffer
): Promise<{ text: string; pageCount: number }> {
  // Dynamic import keeps unpdf out of the bundle until a PDF is actually parsed.
  const unpdf = await import('unpdf')
  const documentProxy = await unpdf.getDocumentProxy(new Uint8Array(buffer))
  const extracted = await unpdf.extractText(documentProxy, { mergePages: true })
  // With mergePages: true, unpdf returns a single string; the cast narrows
  // the string | string[] union in its signature.
  return { text: extracted.text as string, pageCount: extracted.totalPages }
}
/**
 * Extract raw text from a Word .docx buffer via mammoth.
 * All formatting is discarded; only the plain text content is returned.
 *
 * @param buffer - Raw .docx file contents.
 * @returns The extracted plain text.
 */
async function parseDocx(
  buffer: Buffer
): Promise<{ text: string }> {
  // Dynamic import so mammoth is only loaded when a Word file is analyzed.
  const { extractRawText } = await import('mammoth')
  const extraction = await extractRawText({ buffer })
  return { text: extraction.value }
}
// ─── Core Analysis ──────────────────────────────────────────────────────────
/**
@@ -83,7 +111,7 @@ export async function analyzeFileContent(
langConfidence: null,
}
if (!isParseableMimeType(mimeType)) {
if (!isAnalyzableMimeType(mimeType)) {
return { ...result, error: 'Unsupported mime type for analysis' }
}
@@ -95,14 +123,16 @@ export async function analyzeFileContent(
let pageCount: number | null = null
if (mimeType === 'application/pdf') {
const pdfParseModule = await import('pdf-parse')
const pdfParse =
typeof pdfParseModule === 'function'
? pdfParseModule
: (pdfParseModule as any).default ?? pdfParseModule
const pdf = await pdfParse(buffer)
text = pdf.text || ''
pageCount = pdf.numpages ?? null
const parsed = await parsePdf(buffer)
text = parsed.text
pageCount = parsed.pageCount
} else if (
mimeType ===
'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
mimeType === 'application/msword'
) {
const parsed = await parseDocx(buffer)
text = parsed.text
} else {
// Text-based files (plain text, CSV, markdown, HTML, RTF)
text = buffer.toString('utf-8')
@@ -227,8 +257,8 @@ export async function analyzeProjectFiles(
const batch = files.slice(i, i + BATCH_SIZE)
const results = await Promise.allSettled(
batch.map(async (file) => {
if (!isParseableMimeType(file.mimeType)) {
// Mark non-parseable files as analyzed with no data
if (!isAnalyzableMimeType(file.mimeType)) {
// Mark non-analyzable files as analyzed with no data
await prisma.projectFile.update({
where: { id: file.id },
data: { analyzedAt: new Date() },
@@ -302,7 +332,7 @@ export async function analyzeAllUnanalyzed(): Promise<{
const batch = files.slice(i, i + BATCH_SIZE)
const results = await Promise.allSettled(
batch.map(async (file) => {
if (!isParseableMimeType(file.mimeType)) {
if (!isAnalyzableMimeType(file.mimeType)) {
await prisma.projectFile.update({
where: { id: file.id },
data: { analyzedAt: new Date() },
@@ -359,7 +389,6 @@ export async function isAutoAnalysisEnabled(): Promise<boolean> {
const setting = await prisma.systemSettings.findUnique({
where: { key: 'file_analysis_auto_enabled' },
})
// Default to true if setting doesn't exist
return setting?.value !== 'false'
} catch {
return true

View File

@@ -15,6 +15,8 @@ import { getStorageProvider } from '@/lib/storage'
const MAX_TEXT_PER_FILE = 50_000 // ~50KB of text per file
const PARSEABLE_MIME_TYPES = [
'application/pdf',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/msword',
'text/plain',
'text/csv',
'text/markdown',
@@ -57,11 +59,17 @@ export async function extractFileContent(
let text: string
if (mimeType === 'application/pdf') {
// Dynamic import to avoid loading pdf-parse when not needed
const pdfParseModule = await import('pdf-parse')
const pdfParse = typeof pdfParseModule === 'function' ? pdfParseModule : (pdfParseModule as any).default ?? pdfParseModule
const pdf = await pdfParse(buffer)
text = pdf.text
const { extractText, getDocumentProxy } = await import('unpdf')
const pdf = await getDocumentProxy(new Uint8Array(buffer))
const result = await extractText(pdf, { mergePages: true })
text = result.text as string
} else if (
mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
mimeType === 'application/msword'
) {
const mammoth = await import('mammoth')
const result = await mammoth.extractRawText({ buffer })
text = result.value
} else {
// Text-based files
text = buffer.toString('utf-8')