113 lines
3.1 KiB
TypeScript
113 lines
3.1 KiB
TypeScript
|
|
/**
 * File Content Extractor
 *
 * Downloads files from storage and extracts text content for AI analysis.
 * Supports PDF and text-based files. Used when round config has aiParseFiles=true.
 *
 * Limits:
 * - Max 50,000 characters (~50KB) of extracted text per file (to stay within AI token limits)
 * - Only PDF and text-based files are parsed
 * - Extraction failures are non-fatal (the file is returned with null content and an error message)
 */
|
||
|
|
|
||
|
|
import { getStorageProvider } from '@/lib/storage'
|
||
|
|
|
||
|
|
/**
 * Upper bound on the extracted text kept per file, measured in string length
 * (UTF-16 code units) — roughly 50KB for mostly-ASCII content.
 */
const MAX_TEXT_PER_FILE = 50_000

/**
 * Mime types whose content can be extracted: PDF (parsed with pdf-parse) and
 * text-based formats (decoded as UTF-8 without any markup stripping).
 */
const PARSEABLE_MIME_TYPES: readonly string[] = [
  'application/pdf',
  'text/plain',
  'text/csv',
  'text/markdown',
  'text/html',
  'application/rtf',
]
|
||
|
|
|
||
|
|
/**
 * Outcome of attempting to extract text from one stored file.
 */
export type ExtractedFileContent = {
  /** Identifier of the source file, echoed back from the caller's input. */
  fileId: string
  /** Name of the source file, echoed back from the caller's input. */
  fileName: string
  /** Extracted (possibly truncated) text, or null when the type is unsupported or extraction failed. */
  content: string | null
  /** Why `content` is null; absent when extraction succeeded. */
  error?: string
}
|
||
|
|
|
||
|
|
/**
 * Reduce a mime type / Content-Type value to its bare, lower-case `type/subtype`.
 *
 * Media types are case-insensitive and may carry parameters
 * (e.g. `text/plain; charset=utf-8`); both are normalised away so the
 * support check and the PDF/text branch below always agree with each other.
 */
function normalizeMimeType(mimeType: string): string {
  // Defensive: values read from a database may be null at runtime despite the type.
  if (typeof mimeType !== 'string') return ''
  const [bareType = ''] = mimeType.split(';', 1)
  return bareType.trim().toLowerCase()
}

/**
 * Check if a file's mime type supports content extraction.
 *
 * The comparison is exact on the normalised `type/subtype`, so parameters and
 * letter case are ignored while look-alikes such as `text/plainfoo` are rejected.
 *
 * @param mimeType - Mime type or full Content-Type value of the file.
 * @returns true when the bare media type is listed in PARSEABLE_MIME_TYPES.
 */
export function isParseableMimeType(mimeType: string): boolean {
  return PARSEABLE_MIME_TYPES.includes(normalizeMimeType(mimeType))
}

/**
 * Extract text content from a single file held by the configured storage provider.
 *
 * Never rejects for download or parsing problems: any failure is logged with
 * `console.warn` and reported through the `error` field with `content: null`.
 *
 * @param objectKey - Key handed to the storage provider's `getObject`.
 * @param mimeType - Mime type of the file; decides between PDF parsing and UTF-8 decoding.
 * @param fileName - Echoed in the result and used in the warning log.
 * @param fileId - Echoed in the result.
 * @returns The extracted text, truncated to MAX_TEXT_PER_FILE characters plus a
 *   truncation marker when longer, or null content with an error message.
 */
export async function extractFileContent(
  objectKey: string,
  mimeType: string,
  fileName: string,
  fileId: string,
): Promise<ExtractedFileContent> {
  if (!isParseableMimeType(mimeType)) {
    return { fileId, fileName, content: null, error: 'Unsupported mime type' }
  }

  try {
    const storage = await getStorageProvider()
    const buffer = await storage.getObject(objectKey)

    let text: string

    // Branch on the normalised type: a strict comparison against the raw value
    // would send `application/pdf; charset=binary` down the UTF-8 path and
    // yield binary garbage instead of the document text.
    if (normalizeMimeType(mimeType) === 'application/pdf') {
      // Dynamic import to avoid loading pdf-parse when not needed
      const pdfParseModule: unknown = await import('pdf-parse')
      // Depending on module interop the parser is either the module itself
      // or its `default` export.
      const candidate =
        typeof pdfParseModule === 'function'
          ? pdfParseModule
          : (pdfParseModule as { default?: unknown } | null)?.default ?? pdfParseModule
      if (typeof candidate !== 'function') {
        throw new Error('pdf-parse did not export a parser function')
      }
      const pdfParse = candidate as (data: unknown) => Promise<{ text: string }>
      const pdf = await pdfParse(buffer)
      text = pdf.text
    } else {
      // Text-based files are decoded as-is; HTML/RTF/CSV markup is not stripped.
      text = buffer.toString('utf-8')
    }

    // Truncate to limit
    if (text.length > MAX_TEXT_PER_FILE) {
      let end = MAX_TEXT_PER_FILE
      // Avoid cutting a surrogate pair in half, which would leave a lone
      // high surrogate (invalid Unicode) at the end of the kept text.
      const lastUnit = text.charCodeAt(end - 1)
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1
      text = text.slice(0, end) + '\n[... content truncated ...]'
    }

    return { fileId, fileName, content: text }
  } catch (error) {
    console.warn(`[FileExtractor] Failed to extract content from ${fileName}:`, error)
    return {
      fileId,
      fileName,
      content: null,
      error: error instanceof Error ? error.message : 'Extraction failed',
    }
  }
}
|
||
|
|
|
||
|
|
/**
 * Extract content from multiple files in parallel.
 *
 * Files whose mime type is not parseable are left out of the result entirely.
 * Every remaining file yields exactly one entry, in the same order as the
 * filtered input. Non-fatal: a file that fails extraction is returned with
 * null content and an error message instead of rejecting the whole batch.
 *
 * @param files - Descriptors of the stored files to process.
 * @returns One result per parseable file; an empty array when none are parseable.
 */
export async function extractMultipleFileContents(
  files: Array<{
    id: string
    fileName: string
    mimeType: string
    objectKey: string
  }>,
): Promise<ExtractedFileContent[]> {
  const parseableFiles = files.filter((f) => isParseableMimeType(f.mimeType))

  if (parseableFiles.length === 0) return []

  return Promise.all(
    parseableFiles.map(async (file): Promise<ExtractedFileContent> => {
      try {
        return await extractFileContent(file.objectKey, file.mimeType, file.fileName, file.id)
      } catch (error) {
        // Safety net: keep one unexpected rejection from failing the whole
        // batch, and surface its reason rather than a generic placeholder.
        return {
          fileId: file.id,
          fileName: file.fileName,
          content: null,
          error: error instanceof Error ? error.message : 'Promise rejected',
        }
      }
    }),
  )
}
|