From ed5e782f615f586b0ba0f69c075f8b417a6e458d Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 17 Feb 2026 10:27:36 +0100 Subject: [PATCH] Fix document analysis: switch to unpdf + mammoth for PDF/Word parsing pdf-parse v2 requires DOMMatrix (browser API) which fails in Node.js. Replaced with unpdf (serverless PDF.js build) for PDFs and mammoth for Word .docx files. Also fixed the same broken pdf-parse usage in file-content-extractor.ts used by AI filtering. Co-Authored-By: Claude Opus 4.6 --- package-lock.json | 233 ++++++++++++++++++ package.json | 2 + src/server/services/document-analyzer.ts | 71 ++++-- src/server/services/file-content-extractor.ts | 18 +- 4 files changed, 298 insertions(+), 26 deletions(-) diff --git a/package-lock.json b/package-lock.json index 5010892..35d6774 100644 --- a/package-lock.json +++ b/package-lock.json @@ -55,6 +55,7 @@ "jspdf-autotable": "^5.0.7", "leaflet": "^1.9.4", "lucide-react": "^0.563.0", + "mammoth": "^1.11.0", "minio": "^8.0.2", "motion": "^11.15.0", "next": "^15.1.0", @@ -75,6 +76,7 @@ "sonner": "^2.0.7", "superjson": "^2.2.2", "tailwind-merge": "^3.4.0", + "unpdf": "^1.4.0", "use-debounce": "^10.0.4", "zod": "^3.24.1" }, @@ -5472,6 +5474,15 @@ "url": "https://opencollective.com/vitest" } }, + "node_modules/@xmldom/xmldom": { + "version": "0.8.11", + "resolved": "https://registry.npmjs.org/@xmldom/xmldom/-/xmldom-0.8.11.tgz", + "integrity": "sha512-cQzWCtO6C8TQiYl1ruKNn2U6Ao4o4WBBcbL61yJl84x+j5sOWWFU9X7DpND8XZG3daDppSsigMdfAIl2upQBRw==", + "license": "MIT", + "engines": { + "node": ">=10.0.0" + } + }, "node_modules/@zxing/text-encoding": { "version": "0.9.0", "resolved": "https://registry.npmjs.org/@zxing/text-encoding/-/text-encoding-0.9.0.tgz", @@ -5823,6 +5834,26 @@ "node": ">= 0.6.0" } }, + "node_modules/base64-js": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", + "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, "node_modules/bcryptjs": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/bcryptjs/-/bcryptjs-3.0.3.tgz", @@ -5841,6 +5872,12 @@ "readable-stream": "^3.4.0" } }, + "node_modules/bluebird": { + "version": "3.4.7", + "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.4.7.tgz", + "integrity": "sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==", + "license": "MIT" + }, "node_modules/brace-expansion": { "version": "1.1.12", "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz", @@ -6251,6 +6288,12 @@ "url": "https://opencollective.com/core-js" } }, + "node_modules/core-util-is": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz", + "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==", + "license": "MIT" + }, "node_modules/country-flag-icons": { "version": "1.6.12", "resolved": "https://registry.npmjs.org/country-flag-icons/-/country-flag-icons-1.6.12.tgz", @@ -6654,6 +6697,12 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/dingbat-to-unicode": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dingbat-to-unicode/-/dingbat-to-unicode-1.0.1.tgz", + "integrity": "sha512-98l0sW87ZT58pU4i61wa2OHwxbiYSbuxsCBozaVnYX2iCnr3bLM3fIes1/ej7h1YdOKuKt/MLs706TVnALA65w==", + "license": "BSD-2-Clause" + }, "node_modules/doctrine": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/doctrine/-/doctrine-2.1.0.tgz", @@ -6690,6 +6739,15 @@ "url": "https://dotenvx.com" } }, + "node_modules/duck": { + "version": "0.1.12", + "resolved": "https://registry.npmjs.org/duck/-/duck-0.1.12.tgz", + "integrity": "sha512-wkctla1O6VfP89gQ+J/yDesM0S7B7XLXjKGzXxMDVFg7uEn706niAtyYovKbyq1oT9YwDcly721/iUWoc8MVRg==", + "license": "BSD", + "dependencies": { + "underscore": "^1.13.1" + } + }, "node_modules/dunder-proto": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", @@ -8380,6 +8438,12 @@ "node": ">= 4" } }, + "node_modules/immediate": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/immediate/-/immediate-3.0.6.tgz", + "integrity": "sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ==", + "license": "MIT" + }, "node_modules/immer": { "version": "10.2.0", "resolved": "https://registry.npmjs.org/immer/-/immer-10.2.0.tgz", @@ -9076,6 +9140,60 @@ "node": ">=4.0" } }, + "node_modules/jszip": { + "version": "3.10.1", + "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz", + "integrity": "sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==", + "license": "(MIT OR GPL-3.0-or-later)", + "dependencies": { + "lie": "~3.3.0", + "pako": "~1.0.2", + "readable-stream": "~2.3.6", + "setimmediate": "^1.0.5" + } + }, + "node_modules/jszip/node_modules/isarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", + "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==", + "license": "MIT" + }, + "node_modules/jszip/node_modules/pako": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz", + "integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==", + "license": "(MIT AND Zlib)" + }, + "node_modules/jszip/node_modules/readable-stream": { + "version": "2.3.8", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz", + "integrity": "sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==", + "license": "MIT", + "dependencies": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + } + }, + "node_modules/jszip/node_modules/safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==", + "license": "MIT" + }, + "node_modules/jszip/node_modules/string_decoder": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", + "license": "MIT", + "dependencies": { + "safe-buffer": "~5.1.0" + } + }, "node_modules/keyv": { "version": "4.5.4", "resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz", @@ -9153,6 +9271,15 @@ "integrity": "sha512-T/Cz6iLcsZdb5jDncDcUNhSAJ0VlSC9TnsqtBNdpkaAmy24/R1RhErtNWVWBrcUZKs9hSgaVsBkc7HxYnazIfw==", "license": "MIT" }, + "node_modules/lie": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/lie/-/lie-3.3.0.tgz", + "integrity": "sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==", + "license": "MIT", + "dependencies": { + "immediate": "~3.0.5" + } + }, "node_modules/lightningcss": { "version": "1.30.2", "resolved": "https://registry.npmjs.org/lightningcss/-/lightningcss-1.30.2.tgz", @@ -9467,6 +9594,17 @@ "loose-envify": "cli.js" } }, + "node_modules/lop": { + "version": "0.4.2", + "resolved": "https://registry.npmjs.org/lop/-/lop-0.4.2.tgz", + "integrity": "sha512-RefILVDQ4DKoRZsJ4Pj22TxE3omDO47yFpkIBoDKzkqPRISs5U1cnAdg/5583YPkWPaLIYHOKRMQSvjFsO26cw==", + "license": "BSD-2-Clause", + "dependencies": { + "duck": "^0.1.12", + "option": "~0.2.1", + "underscore": "^1.13.1" + } + }, "node_modules/lucide-react": { "version": "0.563.0", "resolved": "https://registry.npmjs.org/lucide-react/-/lucide-react-0.563.0.tgz", @@ -9485,6 +9623,39 @@ "@jridgewell/sourcemap-codec": "^1.5.5" } }, + "node_modules/mammoth": { + "version": "1.11.0", + "resolved": "https://registry.npmjs.org/mammoth/-/mammoth-1.11.0.tgz", + "integrity": "sha512-BcEqqY/BOwIcI1iR5tqyVlqc3KIaMRa4egSoK83YAVrBf6+yqdAAbtUcFDCWX8Zef8/fgNZ6rl4VUv+vVX8ddQ==", + "license": "BSD-2-Clause", + "dependencies": { + "@xmldom/xmldom": "^0.8.6", + "argparse": "~1.0.3", + "base64-js": "^1.5.1", + "bluebird": "~3.4.0", + "dingbat-to-unicode": "^1.0.1", + "jszip": "^3.7.1", + "lop": "^0.4.2", + "path-is-absolute": "^1.0.0", + "underscore": "^1.13.1", + "xmlbuilder": "^10.0.0" + }, + "bin": { + "mammoth": "bin/mammoth" + }, + "engines": { + "node": ">=12.0.0" + } + }, + "node_modules/mammoth/node_modules/argparse": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz", + "integrity": "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==", + "license": "MIT", + "dependencies": { + "sprintf-js": "~1.0.2" + } + }, "node_modules/markdown-it": { "version": "14.1.0", "resolved": "https://registry.npmjs.org/markdown-it/-/markdown-it-14.1.0.tgz", @@ -10899,6 +11070,12 @@ } } }, + "node_modules/option": { + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/option/-/option-0.2.4.tgz", + "integrity": "sha512-pkEqbDyl8ou5cpq+VsnQbe/WlEy5qS7xPzMS1U55OCG9KPvwFD46zDbxQIj3egJSFc3D+XhYOPUzz49zQAVy7A==", + "license": "BSD-2-Clause" + }, "node_modules/optionator": { "version": "0.9.4", "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz", @@ -11032,6 +11209,15 @@ "node": ">=8" } }, + "node_modules/path-is-absolute": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", + "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/path-key": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz", @@ -11352,6 +11538,12 @@ } } }, + "node_modules/process-nextick-args": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", + "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==", + "license": "MIT" + }, "node_modules/prop-types": { "version": "15.8.1", "resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.8.1.tgz", @@ -12512,6 +12704,12 @@ "node": ">= 0.4" } }, + "node_modules/setimmediate": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/setimmediate/-/setimmediate-1.0.5.tgz", + "integrity": "sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==", + "license": "MIT" + }, "node_modules/sharp": { "version": "0.34.5", "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.5.tgz", @@ -12701,6 +12899,12 @@ "node": ">=6" } }, + "node_modules/sprintf-js": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", + "integrity": "sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==", + "license": "BSD-3-Clause" + }, "node_modules/stable-hash": { "version": "0.0.5", "resolved": "https://registry.npmjs.org/stable-hash/-/stable-hash-0.0.5.tgz", @@ -13396,6 +13600,12 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/underscore": { + "version": "1.13.7", + "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.13.7.tgz", + "integrity": "sha512-GMXzWtsc57XAtguZgaQViUOzs0KTkk8ojr3/xAxXLITqf/3EMwxC0inyETfDFjH/Krbhuep0HNbbjI9i/q3F3g==", + "license": "MIT" + }, "node_modules/undici-types": { "version": "7.16.0", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz", @@ -13503,6 +13713,20 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/unpdf": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/unpdf/-/unpdf-1.4.0.tgz", + "integrity": "sha512-TahIk0xdH/4jh/MxfclzU79g40OyxtP00VnEUZdEkJoYtXAHWLiir6t3FC6z3vDqQTzc2ZHcla6uEiVTNjejuA==", + "license": "MIT", + "peerDependencies": { + "@napi-rs/canvas": "^0.1.69" + }, + "peerDependenciesMeta": { + "@napi-rs/canvas": { + "optional": true + } + } + }, "node_modules/unrs-resolver": { "version": "1.11.1", "resolved": "https://registry.npmjs.org/unrs-resolver/-/unrs-resolver-1.11.1.tgz", @@ -14123,6 +14347,15 @@ "node": ">=4.0" } }, + "node_modules/xmlbuilder": { + "version": "10.1.1", + "resolved": "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-10.1.1.tgz", + "integrity": "sha512-OyzrcFLL/nb6fMGHbiRDuPup9ljBycsdCypwuyg5AAHvyWzGfChJpCXMG88AGTIMFhGZ9RccFN1e6lhg3hkwKg==", + "license": "MIT", + "engines": { + "node": ">=4.0" + } + }, "node_modules/y-prosemirror": { "version": "1.3.7", "resolved": "https://registry.npmjs.org/y-prosemirror/-/y-prosemirror-1.3.7.tgz", diff --git a/package.json b/package.json index 533311e..ba9bc3e 100644 --- a/package.json +++ b/package.json @@ -68,6 +68,7 @@ "jspdf-autotable": "^5.0.7", "leaflet": "^1.9.4", "lucide-react": "^0.563.0", + "mammoth": "^1.11.0", "minio": "^8.0.2", "motion": "^11.15.0", "next": "^15.1.0", @@ -88,6 +89,7 @@ "sonner": "^2.0.7", "superjson": "^2.2.2", "tailwind-merge": "^3.4.0", + "unpdf": "^1.4.0", "use-debounce": "^10.0.4", "zod": "^3.24.1" }, diff --git a/src/server/services/document-analyzer.ts b/src/server/services/document-analyzer.ts index 1e1931b..55c0ef1 100644 --- a/src/server/services/document-analyzer.ts +++ b/src/server/services/document-analyzer.ts @@ -6,17 +6,32 @@ * - Text preview (first ~2000 chars) * - Language detection via franc * + * Supports: PDF (via unpdf), Word .docx (via mammoth), plain text files. * Runs optionally on upload (controlled by SystemSettings) and * retroactively via admin endpoint. */ import { getStorageProvider } from '@/lib/storage' -import { isParseableMimeType } from './file-content-extractor' import { prisma } from '@/lib/prisma' const TEXT_PREVIEW_LIMIT = 2000 const BATCH_SIZE = 10 +const ANALYZABLE_MIME_TYPES = [ + 'application/pdf', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // .docx + 'application/msword', // .doc (limited support) + 'text/plain', + 'text/csv', + 'text/markdown', + 'text/html', + 'application/rtf', +] + +function isAnalyzableMimeType(mimeType: string): boolean { + return ANALYZABLE_MIME_TYPES.some((t) => mimeType.startsWith(t)) +} + // ─── Types ────────────────────────────────────────────────────────────────── export type AnalysisResult = { @@ -32,8 +47,6 @@ export type AnalysisResult = { /** * Detect language using franc. Returns ISO 639-3 code and confidence. - * franc returns a distance-based score where lower = better match. - * We convert to 0-1 confidence where 1 = perfect match. */ async function detectLanguage( text: string @@ -42,7 +55,6 @@ async function detectLanguage( return { lang: 'und', confidence: 0 } } - // Use a reasonable sample for detection (first 5000 chars) const sample = text.slice(0, 5000) const { francAll } = await import('franc') @@ -53,15 +65,31 @@ async function detectLanguage( } const topLang = results[0][0] - const topScore = results[0][1] // 1.0 = best match, 0.0 = worst - - // franc scores: 1.0 is best match, scale drops from there - // Convert to a 0-1 confidence + const topScore = results[0][1] const confidence = Math.max(0, Math.min(1, topScore)) return { lang: topLang, confidence: Math.round(confidence * 100) / 100 } } +// ─── Parsers ───────────────────────────────────────────────────────────────── + +async function parsePdf( + buffer: Buffer +): Promise<{ text: string; pageCount: number }> { + const { extractText, getDocumentProxy } = await import('unpdf') + const pdf = await getDocumentProxy(new Uint8Array(buffer)) + const { totalPages, text } = await extractText(pdf, { mergePages: true }) + return { text: text as string, pageCount: totalPages } +} + +async function parseDocx( + buffer: Buffer +): Promise<{ text: string }> { + const mammoth = await import('mammoth') + const result = await mammoth.extractRawText({ buffer }) + return { text: result.value } +} + // ─── Core Analysis ────────────────────────────────────────────────────────── /** @@ -83,7 +111,7 @@ export async function analyzeFileContent( langConfidence: null, } - if (!isParseableMimeType(mimeType)) { + if (!isAnalyzableMimeType(mimeType)) { return { ...result, error: 'Unsupported mime type for analysis' } } @@ -95,14 +123,16 @@ export async function analyzeFileContent( let pageCount: number | null = null if (mimeType === 'application/pdf') { - const pdfParseModule = await import('pdf-parse') - const pdfParse = - typeof pdfParseModule === 'function' - ? pdfParseModule - : (pdfParseModule as any).default ?? pdfParseModule - const pdf = await pdfParse(buffer) - text = pdf.text || '' - pageCount = pdf.numpages ?? null + const parsed = await parsePdf(buffer) + text = parsed.text + pageCount = parsed.pageCount + } else if ( + mimeType === + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' || + mimeType === 'application/msword' + ) { + const parsed = await parseDocx(buffer) + text = parsed.text } else { // Text-based files (plain text, CSV, markdown, HTML, RTF) text = buffer.toString('utf-8') @@ -227,8 +257,8 @@ export async function analyzeProjectFiles( const batch = files.slice(i, i + BATCH_SIZE) const results = await Promise.allSettled( batch.map(async (file) => { - if (!isParseableMimeType(file.mimeType)) { - // Mark non-parseable files as analyzed with no data + if (!isAnalyzableMimeType(file.mimeType)) { + // Mark non-analyzable files as analyzed with no data await prisma.projectFile.update({ where: { id: file.id }, data: { analyzedAt: new Date() }, @@ -302,7 +332,7 @@ export async function analyzeAllUnanalyzed(): Promise<{ const batch = files.slice(i, i + BATCH_SIZE) const results = await Promise.allSettled( batch.map(async (file) => { - if (!isParseableMimeType(file.mimeType)) { + if (!isAnalyzableMimeType(file.mimeType)) { await prisma.projectFile.update({ where: { id: file.id }, data: { analyzedAt: new Date() }, @@ -359,7 +389,6 @@ export async function isAutoAnalysisEnabled(): Promise { const setting = await prisma.systemSettings.findUnique({ where: { key: 'file_analysis_auto_enabled' }, }) - // Default to true if setting doesn't exist return setting?.value !== 'false' } catch { return true diff --git a/src/server/services/file-content-extractor.ts b/src/server/services/file-content-extractor.ts index 9635c93..623ea3f 100644 --- a/src/server/services/file-content-extractor.ts +++ b/src/server/services/file-content-extractor.ts @@ -15,6 +15,8 @@ import { getStorageProvider } from '@/lib/storage' const MAX_TEXT_PER_FILE = 50_000 // ~50KB of text per file const PARSEABLE_MIME_TYPES = [ 'application/pdf', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/msword', 'text/plain', 'text/csv', 'text/markdown', @@ -57,11 +59,17 @@ export async function extractFileContent( let text: string if (mimeType === 'application/pdf') { - // Dynamic import to avoid loading pdf-parse when not needed - const pdfParseModule = await import('pdf-parse') - const pdfParse = typeof pdfParseModule === 'function' ? pdfParseModule : (pdfParseModule as any).default ?? pdfParseModule - const pdf = await pdfParse(buffer) - text = pdf.text + const { extractText, getDocumentProxy } = await import('unpdf') + const pdf = await getDocumentProxy(new Uint8Array(buffer)) + const result = await extractText(pdf, { mergePages: true }) + text = result.text as string + } else if ( + mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' || + mimeType === 'application/msword' + ) { + const mammoth = await import('mammoth') + const result = await mammoth.extractRawText({ buffer }) + text = result.value } else { // Text-based files text = buffer.toString('utf-8')