AI category-aware evaluation: per-round config, file parsing, shortlist, advance flow
Some checks failed
Build and Push Docker Image / build (push) Has been cancelled
Some checks failed
Build and Push Docker Image / build (push) Has been cancelled
- Per-juror cap mode (HARD/SOFT/NONE) in add-member dialog and members table
- Jury invite flow: create user + add to group + send invitation from dialog
- Per-round config: notifyOnAdvance, aiParseFiles, startupAdvanceCount, conceptAdvanceCount
- Moved notify-on-advance from competition-level to per-round setting
- AI filtering: round-tagged files with newest-first sorting, optional file content extraction
- File content extractor service (pdf-parse for PDF, utf-8 for text files)
- AI shortlist runs independently per category (STARTUP / BUSINESS_CONCEPT)
- generateAIRecommendations tRPC endpoint with per-round config integration
- AI recommendations UI: trigger button, confirmation dialog, per-category results display
- Category-aware advance dialog: select/deselect projects by category with target caps
- STAGE_ACTIVE bug fix in assignment router

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
112
src/server/services/file-content-extractor.ts
Normal file
112
src/server/services/file-content-extractor.ts
Normal file
@@ -0,0 +1,112 @@
|
||||
/**
|
||||
* File Content Extractor
|
||||
*
|
||||
* Downloads files from storage and extracts text content for AI analysis.
|
||||
* Supports PDF and plain text files. Used when round config has aiParseFiles=true.
|
||||
*
|
||||
* Limits:
|
||||
* - Max 50,000 characters of extracted text per file (roughly 50KB; keeps prompts within AI token limits)
|
||||
* - Only PDF and text-based files are parsed
|
||||
* - Extraction failures are non-fatal (the file is returned with null content and an error message)
|
||||
*/
|
||||
|
||||
import { getStorageProvider } from '@/lib/storage'
|
||||
|
||||
/**
 * Upper bound on the extracted text kept per file, measured in string
 * length (UTF-16 code units). Longer text is cut and marked as truncated.
 */
const MAX_TEXT_PER_FILE = 50 * 1_000

/**
 * MIME types for which content extraction is attempted. PDFs go through
 * pdf-parse; every other entry is decoded as UTF-8 text.
 */
const PARSEABLE_MIME_TYPES: readonly string[] = [
  'application/pdf',
  'text/plain',
  'text/csv',
  'text/markdown',
  'text/html',
  'application/rtf',
]
|
||||
|
||||
/**
 * Outcome of attempting to extract text from one stored file.
 */
export type ExtractedFileContent = {
  /** Identifier of the file the result belongs to. */
  fileId: string
  /** File name, echoed back from the request. */
  fileName: string
  /** Extracted (possibly truncated) text, or null when nothing was extracted. */
  content: string | null
  /** Reason extraction produced no content; absent on success. */
  error?: string
}
|
||||
|
||||
/**
 * Reports whether content extraction is attempted for a MIME type.
 *
 * The check is a case-sensitive prefix match against PARSEABLE_MIME_TYPES,
 * so values carrying parameters (e.g. "text/plain; charset=utf-8") are
 * accepted as well.
 *
 * @param mimeType - MIME type string recorded for the file
 * @returns true when the value starts with one of the supported types
 */
export function isParseableMimeType(mimeType: string): boolean {
  for (const supportedType of PARSEABLE_MIME_TYPES) {
    if (mimeType.startsWith(supportedType)) {
      return true
    }
  }
  return false
}
|
||||
|
||||
/**
 * Tells whether a MIME type denotes a PDF, ignoring any parameters
 * (e.g. "; charset=binary"), surrounding whitespace and letter case.
 */
function isPdfMimeType(mimeType: string): boolean {
  const essence = mimeType.split(';', 1)[0] ?? ''
  return essence.trim().toLowerCase() === 'application/pdf'
}

/**
 * Cuts text to at most `limit` UTF-16 code units and appends a truncation
 * marker. Text within the limit is returned unchanged.
 */
function truncateExtractedText(text: string, limit: number): string {
  if (text.length <= limit) return text

  let end = limit
  // Do not end on a high surrogate: slicing between the two halves of a
  // surrogate pair would leave an invalid lone code unit in the output.
  const lastUnit = text.charCodeAt(end - 1)
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1

  return text.slice(0, end) + '\n[... content truncated ...]'
}

/**
 * Extract text content from a single stored file.
 *
 * PDFs are parsed with pdf-parse; every other supported type is decoded as
 * UTF-8. The result is truncated to MAX_TEXT_PER_FILE. This function does
 * not throw for download or parse failures: they are logged and reported
 * through the returned `error` field with `content: null`.
 *
 * @param objectKey - storage key passed to the storage provider's getObject
 * @param mimeType - MIME type recorded for the file; may carry parameters
 * @param fileName - file name, used for logging and echoed in the result
 * @param fileId - file identifier, echoed in the result
 * @returns the extracted text, or null content plus an error description
 */
export async function extractFileContent(
  objectKey: string,
  mimeType: string,
  fileName: string,
  fileId: string,
): Promise<ExtractedFileContent> {
  if (!isParseableMimeType(mimeType)) {
    return { fileId, fileName, content: null, error: 'Unsupported mime type' }
  }

  try {
    const storage = await getStorageProvider()
    const buffer = await storage.getObject(objectKey)

    let text: string

    // The gate above matches by prefix, so a PDF may arrive with parameters
    // attached; a strict equality check would send it down the UTF-8 branch
    // and produce binary garbage.
    if (isPdfMimeType(mimeType)) {
      // Dynamic import to avoid loading pdf-parse when not needed
      const pdfParseModule = await import('pdf-parse')
      // The parser may be the module itself or its default export depending
      // on module interop, so both shapes are accepted.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- module shape varies with interop
      const pdfParse = typeof pdfParseModule === 'function' ? pdfParseModule : ((pdfParseModule as any).default ?? pdfParseModule)
      const pdf = await pdfParse(buffer)
      text = pdf.text
    } else {
      // Text-based files
      text = buffer.toString('utf-8')
    }

    return { fileId, fileName, content: truncateExtractedText(text, MAX_TEXT_PER_FILE) }
  } catch (error) {
    console.warn(`[FileExtractor] Failed to extract content from ${fileName}:`, error)
    return {
      fileId,
      fileName,
      content: null,
      error: error instanceof Error ? error.message : 'Extraction failed',
    }
  }
}
|
||||
|
||||
/**
 * Extract content from multiple files in parallel.
 *
 * Files whose MIME type is not parseable are left out of the result.
 * Results keep the order of the remaining input files. Non-fatal: a file
 * whose extraction fails is returned with null content and an `error`
 * carrying the failure message.
 *
 * @param files - stored files described by id, name, MIME type and object key
 * @returns one entry per parseable file; an empty array when there are none
 */
export async function extractMultipleFileContents(
  files: Array<{
    id: string
    fileName: string
    mimeType: string
    objectKey: string
  }>,
): Promise<ExtractedFileContent[]> {
  const parseableFiles = files.filter((f) => isParseableMimeType(f.mimeType))

  if (parseableFiles.length === 0) return []

  // Each task handles its own failure, so the combined promise never rejects
  // and the failing file is known without indexing back into the input.
  return Promise.all(
    parseableFiles.map(async (file): Promise<ExtractedFileContent> => {
      try {
        return await extractFileContent(file.objectKey, file.mimeType, file.fileName, file.id)
      } catch (error) {
        return {
          fileId: file.id,
          fileName: file.fileName,
          content: null,
          // Surface the real reason when there is one instead of a fixed label.
          error: error instanceof Error ? error.message : 'Promise rejected',
        }
      }
    }),
  )
}
|
||||
Reference in New Issue
Block a user