Files
MOPC-Portal/src/server/services/file-content-extractor.ts
Matt d80043c4aa
Some checks failed
Build and Push Docker Image / build (push) Has been cancelled
Strip null bytes from extracted text to fix PostgreSQL UTF-8 errors
Some PDFs contain \x00 null bytes in their text which PostgreSQL rejects
with "invalid byte sequence for encoding UTF8: 0x00". Sanitize extracted
text in both document-analyzer and file-content-extractor services.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 11:34:05 +01:00

121 lines
3.5 KiB
TypeScript

/**
 * File Content Extractor
 *
 * Downloads files from storage and extracts text content for AI analysis.
 * Supports PDF, Word (.docx/.doc), and text-based files. Used when round
 * config has aiParseFiles=true.
 *
 * Limits:
 * - Max 50KB of extracted text per file (to stay within AI token limits)
 * - Only PDF, Word, and text-based files are parsed
 * - Extraction failures are non-fatal (file is skipped)
 */
import { getStorageProvider } from '@/lib/storage'
// Cap on extracted text per file so prompts stay within AI token limits.
const MAX_TEXT_PER_FILE = 50_000 // ~50KB of text per file

// Mime-type prefixes we know how to turn into plain text. Matching is by
// prefix (see isParseableMimeType), so parameterized types such as
// 'text/plain; charset=utf-8' still qualify.
const PARSEABLE_MIME_TYPES = [
  'application/pdf',
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  'application/msword',
  'text/plain',
  'text/csv',
  'text/markdown',
  'text/html',
  'application/rtf',
]

/** Result of attempting text extraction on one stored file. */
export type ExtractedFileContent = {
  fileId: string
  fileName: string
  /** Extracted (possibly truncated) text, or null when skipped/failed. */
  content: string | null
  error?: string
}

/**
 * Check if a file's mime type supports content extraction.
 * Prefix matching is deliberate so mime types carrying parameters
 * (e.g. 'text/csv; charset=utf-8') are still accepted.
 */
export function isParseableMimeType(mimeType: string): boolean {
  for (const prefix of PARSEABLE_MIME_TYPES) {
    if (mimeType.startsWith(prefix)) {
      return true
    }
  }
  return false
}
/**
 * Extract text content from a single file stored in MinIO/S3.
 *
 * Supported formats: PDF (via unpdf), Word .docx/.doc (via mammoth), and
 * text-based files (decoded as UTF-8). Extracted text is sanitized — null
 * bytes are stripped because PostgreSQL rejects \x00 with "invalid byte
 * sequence for encoding UTF8" — and truncated to MAX_TEXT_PER_FILE chars.
 *
 * Returns null content if the file type is unsupported or extraction fails;
 * failures are non-fatal and reported via the `error` field.
 *
 * @param objectKey - Storage key of the object to download
 * @param mimeType  - Mime type used to pick the extraction strategy
 * @param fileName  - Original file name (for logging / result metadata)
 * @param fileId    - Database id of the file (echoed in the result)
 */
export async function extractFileContent(
  objectKey: string,
  mimeType: string,
  fileName: string,
  fileId: string,
): Promise<ExtractedFileContent> {
  if (!isParseableMimeType(mimeType)) {
    return { fileId, fileName, content: null, error: 'Unsupported mime type' }
  }
  try {
    const storage = await getStorageProvider()
    const buffer = await storage.getObject(objectKey)
    let text: string
    if (mimeType === 'application/pdf') {
      // Lazy-load the PDF parser; it is only needed on this code path.
      const { extractText, getDocumentProxy } = await import('unpdf')
      const pdf = await getDocumentProxy(new Uint8Array(buffer))
      const result = await extractText(pdf, { mergePages: true })
      text = result.text as string
    } else if (
      mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
      mimeType === 'application/msword'
    ) {
      const mammoth = await import('mammoth')
      const result = await mammoth.extractRawText({ buffer })
      text = result.value
    } else {
      // Text-based files
      text = buffer.toString('utf-8')
    }
    // Sanitize once for all branches: PostgreSQL rejects \x00 in text columns.
    text = text.replace(/\0/g, '')
    // Truncate to limit
    if (text.length > MAX_TEXT_PER_FILE) {
      let cut = MAX_TEXT_PER_FILE
      // Don't slice through a surrogate pair: a lone high surrogate is also
      // invalid UTF-8 and would trigger the same PostgreSQL encoding error.
      const lastUnit = text.charCodeAt(cut - 1)
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        cut -= 1
      }
      text = text.slice(0, cut) + '\n[... content truncated ...]'
    }
    return { fileId, fileName, content: text }
  } catch (error) {
    console.warn(`[FileExtractor] Failed to extract content from ${fileName}:`, error)
    return {
      fileId,
      fileName,
      content: null,
      error: error instanceof Error ? error.message : 'Extraction failed',
    }
  }
}
/**
 * Extract content from multiple files in parallel.
 * Non-fatal: files that fail extraction are returned with null content.
 */
export async function extractMultipleFileContents(
  files: Array<{
    id: string
    fileName: string
    mimeType: string
    objectKey: string
  }>,
): Promise<ExtractedFileContent[]> {
  // Drop anything we can't parse up front so indexes line up with results.
  const candidates = files.filter((file) => isParseableMimeType(file.mimeType))
  if (candidates.length === 0) {
    return []
  }
  const settled = await Promise.allSettled(
    candidates.map((file) =>
      extractFileContent(file.objectKey, file.mimeType, file.fileName, file.id),
    ),
  )
  return settled.map((outcome, index) => {
    if (outcome.status === 'fulfilled') {
      return outcome.value
    }
    // extractFileContent catches internally, so this branch is defensive.
    const { id, fileName } = candidates[index]
    return { fileId: id, fileName, content: null, error: 'Promise rejected' }
  })
}