/**
 * Document Analyzer Service
 *
 * Extracts metadata from uploaded files:
 * - Page count (PDFs)
 * - Text preview (first ~2000 chars)
 * - Language detection via franc
 *
 * Supports: PDF (via unpdf), Word .docx (via mammoth), plain text files.
 * Runs optionally on upload (controlled by SystemSettings) and
 * retroactively via admin endpoint.
 */

import { getStorageProvider } from '@/lib/storage'
import { prisma } from '@/lib/prisma'

/** Maximum number of UTF-16 code units stored as the text preview. */
const TEXT_PREVIEW_LIMIT = 2000

/** Number of files analyzed concurrently by the batch operations. */
const BATCH_SIZE = 10

const PDF_MIME_TYPE = 'application/pdf'
const DOCX_MIME_TYPE =
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
const DOC_MIME_TYPE = 'application/msword'

const ANALYZABLE_MIME_TYPES: readonly string[] = [
  PDF_MIME_TYPE,
  DOCX_MIME_TYPE, // .docx
  DOC_MIME_TYPE, // .doc (limited support)
  'text/plain',
  'text/csv',
  'text/markdown',
  'text/html',
  'application/rtf',
]

/**
 * Reduce a mime type to its lower-cased base form, dropping any parameters
 * (`"Application/PDF; charset=binary"` -> `"application/pdf"`).
 */
function baseMimeType(mimeType: string): string {
  return (mimeType.split(';', 1)[0] ?? '').trim().toLowerCase()
}

/**
 * Whether a file of this mime type can be analyzed.
 *
 * Matching is by prefix so that parameterised values such as
 * `text/plain; charset=utf-8` are accepted, and is case-insensitive.
 */
function isAnalyzableMimeType(mimeType: string): boolean {
  const normalized = mimeType.trim().toLowerCase()
  return ANALYZABLE_MIME_TYPES.some((t) => normalized.startsWith(t))
}

/**
 * Strip null bytes from extracted text — PostgreSQL rejects \x00 in UTF-8 text columns.
 */
function sanitizeText(text: string): string {
  return text.replace(/\0/g, '')
}

/**
 * Build the stored preview: `null` for blank text, otherwise at most
 * TEXT_PREVIEW_LIMIT code units of the text.
 */
function buildTextPreview(text: string): string | null {
  if (!text.trim()) return null
  if (text.length <= TEXT_PREVIEW_LIMIT) return text

  let end = TEXT_PREVIEW_LIMIT
  // Do not cut a surrogate pair in half: a trailing lone high surrogate
  // is not valid text, so drop it and keep the preview one unit shorter.
  const lastUnit = text.charCodeAt(end - 1)
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1
  return text.slice(0, end)
}

// ─── Types ──────────────────────────────────────────────────────────────────

export type AnalysisResult = {
  fileId: string
  pageCount: number | null
  textPreview: string | null
  detectedLang: string | null
  langConfidence: number | null
  error?: string
}

/** The file columns every analysis operation needs. */
type AnalyzableFile = {
  id: string
  objectKey: string
  bucket: string
  mimeType: string
  fileName: string
}

/** Per-file outcome of a batch run. */
type FileOutcome = 'analyzed' | 'failed' | 'skipped'

/** Running totals of a batch run, keyed by outcome. */
type BatchCounts = Record<FileOutcome, number>

/** Prisma `select` shared by all queries that load files for analysis. */
const ANALYZABLE_FILE_SELECT = {
  id: true,
  objectKey: true,
  bucket: true,
  mimeType: true,
  fileName: true,
} as const

// ─── Language Detection ──────────────────────────────────────────────────────

/**
 * Detect language using franc. Returns ISO 639-3 code and confidence.
 *
 * Text shorter than 20 non-padding characters, or text franc cannot classify,
 * yields `{ lang: 'und', confidence: 0 }`. Only the first 5000 characters are
 * sampled.
 */
async function detectLanguage(
  text: string
): Promise<{ lang: string; confidence: number }> {
  const undetermined = { lang: 'und', confidence: 0 }

  if (!text || text.trim().length < 20) {
    return undetermined
  }

  const sample = text.slice(0, 5000)
  // Loaded lazily so the dependency is only paid for when detection runs.
  const { francAll } = await import('franc')
  const results = francAll(sample, { minLength: 10 })

  const top = results?.[0]
  if (!top || top[0] === 'und') {
    return undetermined
  }

  const [topLang, topScore] = top
  // NOTE(review): the top francAll score is used directly as the confidence —
  // confirm against the franc docs that it is meaningful for that purpose.
  const confidence = Math.max(0, Math.min(1, topScore))
  return { lang: topLang, confidence: Math.round(confidence * 100) / 100 }
}

// ─── Parsers ─────────────────────────────────────────────────────────────────

/** Extract the merged text and the page count of a PDF via unpdf. */
async function parsePdf(
  buffer: Buffer
): Promise<{ text: string; pageCount: number }> {
  const { extractText, getDocumentProxy } = await import('unpdf')
  const pdf = await getDocumentProxy(new Uint8Array(buffer))
  const { totalPages, text } = await extractText(pdf, { mergePages: true })
  // Checked at runtime instead of asserted: join if pages come back unmerged.
  const merged = Array.isArray(text) ? text.join('\n') : text
  return { text: merged, pageCount: totalPages }
}

/** Extract the raw text of a Word document via mammoth. */
async function parseDocx(buffer: Buffer): Promise<{ text: string }> {
  const mammoth = await import('mammoth')
  const result = await mammoth.extractRawText({ buffer })
  return { text: result.value }
}

/**
 * Pick the parser for a file and return its sanitized text plus, for PDFs,
 * the page count.
 */
async function extractContent(
  buffer: Buffer,
  mimeType: string
): Promise<{ text: string; pageCount: number | null }> {
  // Branch on the base type so a parameterised value that passed the prefix
  // check (e.g. "application/pdf; charset=binary") still reaches its parser
  // instead of being decoded as UTF-8 text.
  const base = baseMimeType(mimeType)

  if (base === PDF_MIME_TYPE) {
    const parsed = await parsePdf(buffer)
    return { text: sanitizeText(parsed.text), pageCount: parsed.pageCount }
  }

  if (base === DOCX_MIME_TYPE || base === DOC_MIME_TYPE) {
    const parsed = await parseDocx(buffer)
    return { text: sanitizeText(parsed.text), pageCount: null }
  }

  // Text-based files (plain text, CSV, markdown, HTML, RTF)
  return { text: sanitizeText(buffer.toString('utf-8')), pageCount: null }
}

// ─── Core Analysis ──────────────────────────────────────────────────────────

/**
 * Analyze a single file: extract page count, text preview, and detect language.
 * Downloads the file from storage, parses it, and returns results.
 *
 * Never throws: unsupported mime types and any download/parse failure are
 * reported through the `error` field of the returned result.
 *
 * @param objectKey - Storage key passed to the storage provider.
 * @param bucket - Accepted for interface compatibility; not used here.
 *   NOTE(review): confirm whether the storage provider should receive it.
 * @param mimeType - Mime type that selects the parser.
 * @param fileName - Used only in the warning logged on failure.
 * @param fileId - Echoed back as `fileId` in the result.
 */
export async function analyzeFileContent(
  objectKey: string,
  bucket: string,
  mimeType: string,
  fileName: string,
  fileId: string
): Promise<AnalysisResult> {
  const result: AnalysisResult = {
    fileId,
    pageCount: null,
    textPreview: null,
    detectedLang: null,
    langConfidence: null,
  }

  if (!isAnalyzableMimeType(mimeType)) {
    return { ...result, error: 'Unsupported mime type for analysis' }
  }

  try {
    const storage = await getStorageProvider()
    const buffer = await storage.getObject(objectKey)

    const { text, pageCount } = await extractContent(buffer, mimeType)
    result.pageCount = pageCount

    // Text preview
    result.textPreview = buildTextPreview(text)

    // Language detection
    if (text.trim().length >= 20) {
      const langResult = await detectLanguage(text)
      result.detectedLang = langResult.lang
      result.langConfidence = langResult.confidence
    }

    return result
  } catch (error) {
    console.warn(
      `[DocAnalyzer] Failed to analyze ${fileName}:`,
      error instanceof Error ? error.message : error
    )
    // Fields filled in before the failure are kept alongside the error.
    return {
      ...result,
      error: error instanceof Error ? error.message : 'Analysis failed',
    }
  }
}

// ─── DB-Integrated Operations ───────────────────────────────────────────────

/**
 * Write an analysis result to its ProjectFile row and stamp `analyzedAt`.
 *
 * NOTE(review): `analyzedAt` is set even when `result.error` is present, so
 * such files are no longer matched by the `analyzedAt: null` filter used by
 * analyzeAllUnanalyzed — confirm that never retrying them is intended.
 */
async function persistAnalysis(result: AnalysisResult): Promise<void> {
  await prisma.projectFile.update({
    where: { id: result.fileId },
    data: {
      pageCount: result.pageCount,
      textPreview: result.textPreview,
      detectedLang: result.detectedLang,
      langConfidence: result.langConfidence,
      analyzedAt: new Date(),
    },
  })
}

/**
 * Analyze one already-loaded file and persist the outcome.
 * Non-analyzable files only get `analyzedAt` stamped; other columns are left alone.
 */
async function analyzeAndPersist(file: AnalyzableFile): Promise<FileOutcome> {
  if (!isAnalyzableMimeType(file.mimeType)) {
    // Mark non-analyzable files as analyzed with no data
    await prisma.projectFile.update({
      where: { id: file.id },
      data: { analyzedAt: new Date() },
    })
    return 'skipped'
  }

  const result = await analyzeFileContent(
    file.objectKey,
    file.bucket,
    file.mimeType,
    file.fileName,
    file.id
  )
  await persistAnalysis(result)
  return result.error ? 'failed' : 'analyzed'
}

/**
 * Run analyzeAndPersist over `files`, BATCH_SIZE at a time, and tally outcomes.
 * A rejected file (e.g. a failed DB write) is logged and counted as failed
 * without aborting the rest of the run.
 *
 * @param onBatchDone - Optional hook called after each batch with the number
 *   of files processed so far and the running totals.
 */
async function processInBatches(
  files: readonly AnalyzableFile[],
  onBatchDone?: (processed: number, counts: Readonly<BatchCounts>) => void
): Promise<BatchCounts> {
  const counts: BatchCounts = { analyzed: 0, failed: 0, skipped: 0 }

  for (let start = 0; start < files.length; start += BATCH_SIZE) {
    const batch = files.slice(start, start + BATCH_SIZE)
    const settled = await Promise.allSettled(
      batch.map((file) => analyzeAndPersist(file))
    )

    settled.forEach((entry, idx) => {
      if (entry.status === 'fulfilled') {
        counts[entry.value]++
        return
      }
      counts.failed++
      const reason: unknown = entry.reason
      console.warn(
        `[DocAnalyzer] Failed to process file ${batch[idx]?.id ?? 'unknown'}:`,
        reason instanceof Error ? reason.message : reason
      )
    })

    onBatchDone?.(start + batch.length, counts)
  }

  return counts
}

/**
 * Analyze a single file by ID and persist results to DB.
 *
 * Returns a result with `error: 'File not found'` (and writes nothing) when no
 * row exists. Otherwise the result is persisted whether or not it has an error.
 */
export async function analyzeFile(fileId: string): Promise<AnalysisResult> {
  const file = await prisma.projectFile.findUnique({
    where: { id: fileId },
    select: ANALYZABLE_FILE_SELECT,
  })

  if (!file) {
    return {
      fileId,
      pageCount: null,
      textPreview: null,
      detectedLang: null,
      langConfidence: null,
      error: 'File not found',
    }
  }

  const result = await analyzeFileContent(
    file.objectKey,
    file.bucket,
    file.mimeType,
    file.fileName,
    file.id
  )

  // Persist results
  await persistAnalysis(result)

  return result
}

/**
 * Analyze a single file by ID with a delay (for post-upload use).
 * The delay accounts for presigned URL uploads where the file
 * may not be in storage yet when the DB record is created.
 *
 * @param delayMs - Milliseconds to wait before analyzing (default 3000).
 */
export async function analyzeFileDelayed(
  fileId: string,
  delayMs = 3000
): Promise<AnalysisResult> {
  await new Promise<void>((resolve) => setTimeout(resolve, delayMs))
  return analyzeFile(fileId)
}

/**
 * Analyze all files for a specific project.
 *
 * @returns Counts of analyzed and failed files plus the total number of files
 *   in the project. Skipped (non-analyzable) files count only toward `total`.
 */
export async function analyzeProjectFiles(
  projectId: string
): Promise<{ analyzed: number; failed: number; total: number }> {
  const files = await prisma.projectFile.findMany({
    where: { projectId },
    select: ANALYZABLE_FILE_SELECT,
  })

  // Process in batches
  const { analyzed, failed } = await processInBatches(files)

  return { analyzed, failed, total: files.length }
}

/**
 * Retroactive batch analysis: analyze all files that haven't been analyzed yet.
 * Returns counts. Processes in batches to avoid memory issues.
 *
 * Only files with `analyzedAt: null` in non-test projects are selected, newest
 * first. Progress is logged after every batch.
 */
export async function analyzeAllUnanalyzed(): Promise<{
  analyzed: number
  failed: number
  skipped: number
  total: number
}> {
  const files = await prisma.projectFile.findMany({
    where: { analyzedAt: null, project: { isTest: false } },
    select: ANALYZABLE_FILE_SELECT,
    orderBy: { createdAt: 'desc' },
  })

  const { analyzed, failed, skipped } = await processInBatches(
    files,
    (processed, counts) => {
      console.log(
        `[DocAnalyzer] Batch progress: ${processed}/${files.length} (${counts.analyzed} analyzed, ${counts.skipped} skipped, ${counts.failed} failed)`
      )
    }
  )

  return { analyzed, failed, skipped, total: files.length }
}

/**
 * Check if auto-analysis is enabled via SystemSettings.
 *
 * Enabled unless the `file_analysis_auto_enabled` setting is exactly the string
 * `'false'`; a missing setting or a failed lookup also counts as enabled.
 */
export async function isAutoAnalysisEnabled(): Promise<boolean> {
  try {
    const setting = await prisma.systemSettings.findUnique({
      where: { key: 'file_analysis_auto_enabled' },
    })
    return setting?.value !== 'false'
  } catch {
    // Deliberate best-effort: fall back to enabled when the lookup fails.
    return true
  }
}