/**
 * File Content Extractor
 *
 * Downloads files from storage and extracts text content for AI analysis.
 * Supports PDF, Word documents and text-based files (see PARSEABLE_MIME_TYPES).
 * Used when round config has aiParseFiles=true.
 *
 * Limits:
 * - Max 50,000 characters of extracted text per file (to stay within AI token limits)
 * - Only files whose MIME type matches PARSEABLE_MIME_TYPES are parsed
 * - Extraction failures are non-fatal (the file is reported with null content)
 */

import { getStorageProvider } from '@/lib/storage'

/** Maximum number of characters (UTF-16 code units) of extracted text kept per file. */
const MAX_TEXT_PER_FILE = 50_000

/** Suffix appended to the content when it had to be cut at MAX_TEXT_PER_FILE. */
const TRUNCATION_MARKER = '\n[... content truncated ...]'

const PDF_MIME_TYPE = 'application/pdf'

const WORD_MIME_TYPES = [
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  'application/msword',
]

const PARSEABLE_MIME_TYPES = [
  'application/pdf',
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  'application/msword',
  'text/plain',
  'text/csv',
  'text/markdown',
  'text/html',
  'application/rtf',
]

export type ExtractedFileContent = {
  fileId: string
  fileName: string
  /** Extracted text, or null when the type is unsupported or extraction failed. */
  content: string | null
  /** Reason why `content` is null; absent on success. */
  error?: string
}

/**
 * Reduce a MIME type to its lowercased `type/subtype` part, dropping any
 * parameters (e.g. `Application/PDF; charset=binary` -> `application/pdf`).
 */
function baseMimeType(mimeType: string): string {
  const [base = ''] = mimeType.split(';')
  return base.trim().toLowerCase()
}

/**
 * Cut `text` to at most MAX_TEXT_PER_FILE characters and append the truncation
 * marker when something was removed. Text within the limit is returned as-is.
 */
function truncateText(text: string): string {
  if (text.length <= MAX_TEXT_PER_FILE) return text

  let end = MAX_TEXT_PER_FILE
  const lastUnit = text.charCodeAt(end - 1)
  // Do not cut between the two halves of a surrogate pair: a lone high
  // surrogate is not valid Unicode text.
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1

  return text.slice(0, end) + TRUNCATION_MARKER
}

/**
 * Check if a file's mime type supports content extraction.
 *
 * Matching is by prefix against PARSEABLE_MIME_TYPES, so values carrying
 * parameters (e.g. `text/plain; charset=utf-8`) are accepted. Comparison is
 * case-insensitive and ignores surrounding whitespace.
 *
 * @param mimeType - MIME type as stored with the file
 * @returns true when the file can be passed to extractFileContent
 */
export function isParseableMimeType(mimeType: string): boolean {
  // Defensive: stored metadata may be missing at runtime despite the static type.
  if (typeof mimeType !== 'string') return false

  const normalized = mimeType.trim().toLowerCase()
  return PARSEABLE_MIME_TYPES.some((t) => normalized.startsWith(t))
}

/**
 * Extract text content from a single file held by the storage provider.
 *
 * Never rejects for extraction problems: unsupported types and failures are
 * reported through `content: null` plus an `error` message.
 *
 * @param objectKey - key of the object in storage
 * @param mimeType - MIME type of the file; decides which extractor is used
 * @param fileName - file name, echoed in the result and used in log output
 * @param fileId - file id, echoed in the result
 * @returns the extracted (NUL-stripped, possibly truncated) text or an error entry
 */
export async function extractFileContent(
  objectKey: string,
  mimeType: string,
  fileName: string,
  fileId: string,
): Promise<ExtractedFileContent> {
  if (!isParseableMimeType(mimeType)) {
    return { fileId, fileName, content: null, error: 'Unsupported mime type' }
  }

  try {
    const storage = await getStorageProvider()
    const buffer = await storage.getObject(objectKey)

    // Dispatch on the parameter-free type so that e.g. "application/pdf; ..."
    // is parsed as a PDF instead of being decoded as UTF-8 text.
    const baseType = baseMimeType(mimeType)
    let text: string

    if (baseType === PDF_MIME_TYPE) {
      // Loaded lazily so the PDF parser is only pulled in when needed.
      const { extractText, getDocumentProxy } = await import('unpdf')
      const pdf = await getDocumentProxy(new Uint8Array(buffer))
      const result = await extractText(pdf, { mergePages: true })
      // mergePages should yield one string; join defensively if pages come back as an array.
      const merged: string | string[] = result.text
      text = Array.isArray(merged) ? merged.join('\n') : merged
    } else if (WORD_MIME_TYPES.includes(baseType)) {
      // NOTE(review): mammoth is documented for .docx; legacy application/msword
      // files may fail here and end up in the catch below - confirm.
      const mammoth = await import('mammoth')
      const result = await mammoth.extractRawText({ buffer })
      text = result.value
    } else {
      // Text-based files
      text = buffer.toString('utf-8')
    }

    // NUL characters are removed from every kind of extracted text.
    text = text.replace(/\0/g, '')

    return { fileId, fileName, content: truncateText(text) }
  } catch (error) {
    console.warn(`[FileExtractor] Failed to extract content from ${fileName}:`, error)
    return {
      fileId,
      fileName,
      content: null,
      error: error instanceof Error ? error.message : 'Extraction failed',
    }
  }
}

/**
 * Extract content from multiple files in parallel.
 *
 * Files whose MIME type is not parseable are left out of the result entirely.
 * Non-fatal: files that fail extraction are returned with null content.
 *
 * @param files - file descriptors to process
 * @returns one entry per parseable file, in input order
 */
export async function extractMultipleFileContents(
  files: Array<{
    id: string
    fileName: string
    mimeType: string
    objectKey: string
  }>,
): Promise<ExtractedFileContent[]> {
  const parseableFiles = files.filter((f) => isParseableMimeType(f.mimeType))
  if (parseableFiles.length === 0) return []

  return Promise.all(
    parseableFiles.map(async (f): Promise<ExtractedFileContent> => {
      try {
        return await extractFileContent(f.objectKey, f.mimeType, f.fileName, f.id)
      } catch {
        // Safety net: an unexpected rejection for one file must not fail the batch.
        return { fileId: f.id, fileName: f.fileName, content: null, error: 'Promise rejected' }
      }
    }),
  )
}