Fix document analysis: switch to unpdf + mammoth for PDF/Word parsing
All checks were successful
Build and Push Docker Image / build (push) Successful in 11m26s
pdf-parse v2 requires DOMMatrix (browser API) which fails in Node.js. Replaced with unpdf (serverless PDF.js build) for PDFs and mammoth for Word .docx files. Also fixed the same broken pdf-parse usage in file-content-extractor.ts used by AI filtering. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -6,17 +6,32 @@
|
||||
* - Text preview (first ~2000 chars)
|
||||
* - Language detection via franc
|
||||
*
|
||||
* Supports: PDF (via unpdf), Word .docx (via mammoth), plain text files.
|
||||
* Runs optionally on upload (controlled by SystemSettings) and
|
||||
* retroactively via admin endpoint.
|
||||
*/
|
||||
|
||||
import { getStorageProvider } from '@/lib/storage'
|
||||
import { isParseableMimeType } from './file-content-extractor'
|
||||
import { prisma } from '@/lib/prisma'
|
||||
|
||||
const TEXT_PREVIEW_LIMIT = 2000
|
||||
const BATCH_SIZE = 10
|
||||
|
||||
// Mime types this module can extract text/metadata from. Matching is
// prefix-based (see isAnalyzableMimeType), so parameterized variants such
// as 'text/plain; charset=utf-8' are accepted as well.
const ANALYZABLE_MIME_TYPES = [
  'application/pdf',
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // .docx
  'application/msword', // .doc (limited support)
  'text/plain',
  'text/csv',
  'text/markdown',
  'text/html',
  'application/rtf',
]
|
||||
|
||||
function isAnalyzableMimeType(mimeType: string): boolean {
|
||||
return ANALYZABLE_MIME_TYPES.some((t) => mimeType.startsWith(t))
|
||||
}
|
||||
|
||||
// ─── Types ──────────────────────────────────────────────────────────────────
|
||||
|
||||
export type AnalysisResult = {
|
||||
@@ -32,8 +47,6 @@ export type AnalysisResult = {
|
||||
|
||||
/**
|
||||
* Detect language using franc. Returns ISO 639-3 code and confidence.
|
||||
* franc returns a distance-based score where lower = better match.
|
||||
* We convert to 0-1 confidence where 1 = perfect match.
|
||||
*/
|
||||
async function detectLanguage(
|
||||
text: string
|
||||
@@ -42,7 +55,6 @@ async function detectLanguage(
|
||||
return { lang: 'und', confidence: 0 }
|
||||
}
|
||||
|
||||
// Use a reasonable sample for detection (first 5000 chars)
|
||||
const sample = text.slice(0, 5000)
|
||||
|
||||
const { francAll } = await import('franc')
|
||||
@@ -53,15 +65,31 @@ async function detectLanguage(
|
||||
}
|
||||
|
||||
const topLang = results[0][0]
|
||||
const topScore = results[0][1] // 1.0 = best match, 0.0 = worst
|
||||
|
||||
// franc scores: 1.0 is best match, scale drops from there
|
||||
// Convert to a 0-1 confidence
|
||||
const topScore = results[0][1]
|
||||
const confidence = Math.max(0, Math.min(1, topScore))
|
||||
|
||||
return { lang: topLang, confidence: Math.round(confidence * 100) / 100 }
|
||||
}
|
||||
|
||||
// ─── Parsers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
async function parsePdf(
|
||||
buffer: Buffer
|
||||
): Promise<{ text: string; pageCount: number }> {
|
||||
const { extractText, getDocumentProxy } = await import('unpdf')
|
||||
const pdf = await getDocumentProxy(new Uint8Array(buffer))
|
||||
const { totalPages, text } = await extractText(pdf, { mergePages: true })
|
||||
return { text: text as string, pageCount: totalPages }
|
||||
}
|
||||
|
||||
async function parseDocx(
|
||||
buffer: Buffer
|
||||
): Promise<{ text: string }> {
|
||||
const mammoth = await import('mammoth')
|
||||
const result = await mammoth.extractRawText({ buffer })
|
||||
return { text: result.value }
|
||||
}
|
||||
|
||||
// ─── Core Analysis ──────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
@@ -83,7 +111,7 @@ export async function analyzeFileContent(
|
||||
langConfidence: null,
|
||||
}
|
||||
|
||||
if (!isParseableMimeType(mimeType)) {
|
||||
if (!isAnalyzableMimeType(mimeType)) {
|
||||
return { ...result, error: 'Unsupported mime type for analysis' }
|
||||
}
|
||||
|
||||
@@ -95,14 +123,16 @@ export async function analyzeFileContent(
|
||||
let pageCount: number | null = null
|
||||
|
||||
if (mimeType === 'application/pdf') {
|
||||
const pdfParseModule = await import('pdf-parse')
|
||||
const pdfParse =
|
||||
typeof pdfParseModule === 'function'
|
||||
? pdfParseModule
|
||||
: (pdfParseModule as any).default ?? pdfParseModule
|
||||
const pdf = await pdfParse(buffer)
|
||||
text = pdf.text || ''
|
||||
pageCount = pdf.numpages ?? null
|
||||
const parsed = await parsePdf(buffer)
|
||||
text = parsed.text
|
||||
pageCount = parsed.pageCount
|
||||
} else if (
|
||||
mimeType ===
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
|
||||
mimeType === 'application/msword'
|
||||
) {
|
||||
const parsed = await parseDocx(buffer)
|
||||
text = parsed.text
|
||||
} else {
|
||||
// Text-based files (plain text, CSV, markdown, HTML, RTF)
|
||||
text = buffer.toString('utf-8')
|
||||
@@ -227,8 +257,8 @@ export async function analyzeProjectFiles(
|
||||
const batch = files.slice(i, i + BATCH_SIZE)
|
||||
const results = await Promise.allSettled(
|
||||
batch.map(async (file) => {
|
||||
if (!isParseableMimeType(file.mimeType)) {
|
||||
// Mark non-parseable files as analyzed with no data
|
||||
if (!isAnalyzableMimeType(file.mimeType)) {
|
||||
// Mark non-analyzable files as analyzed with no data
|
||||
await prisma.projectFile.update({
|
||||
where: { id: file.id },
|
||||
data: { analyzedAt: new Date() },
|
||||
@@ -302,7 +332,7 @@ export async function analyzeAllUnanalyzed(): Promise<{
|
||||
const batch = files.slice(i, i + BATCH_SIZE)
|
||||
const results = await Promise.allSettled(
|
||||
batch.map(async (file) => {
|
||||
if (!isParseableMimeType(file.mimeType)) {
|
||||
if (!isAnalyzableMimeType(file.mimeType)) {
|
||||
await prisma.projectFile.update({
|
||||
where: { id: file.id },
|
||||
data: { analyzedAt: new Date() },
|
||||
@@ -359,7 +389,6 @@ export async function isAutoAnalysisEnabled(): Promise<boolean> {
|
||||
const setting = await prisma.systemSettings.findUnique({
|
||||
where: { key: 'file_analysis_auto_enabled' },
|
||||
})
|
||||
// Default to true if setting doesn't exist
|
||||
return setting?.value !== 'false'
|
||||
} catch {
|
||||
return true
|
||||
|
||||
Reference in New Issue
Block a user