AI category-aware evaluation: per-round config, file parsing, shortlist, advance flow

- Per-juror cap mode (HARD/SOFT/NONE) in add-member dialog and members table
- Jury invite flow: create user + add to group + send invitation from dialog
- Per-round config: notifyOnAdvance, aiParseFiles, startupAdvanceCount, conceptAdvanceCount
- Moved notify-on-advance from competition-level to per-round setting
- AI filtering: round-tagged files with newest-first sorting, optional file content extraction
- File content extractor service (pdf-parse for PDF, utf-8 for text files)
- AI shortlist runs independently per category (STARTUP / BUSINESS_CONCEPT)
- generateAIRecommendations tRPC endpoint with per-round config integration
- AI recommendations UI: trigger button, confirmation dialog, per-category results display
- Category-aware advance dialog: select/deselect projects by category with target caps
- STAGE_ACTIVE bug fix in assignment router

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 10:09:52 +01:00
parent 93f4ad4b31
commit 80c9e35971
21 changed files with 1886 additions and 1381 deletions
--- a/src/server/services/file-content-extractor.ts
+++ b/src/server/services/file-content-extractor.ts
@@ -0,0 +1,112 @@
+/**
+ * File Content Extractor
+ *
+ * Downloads files from storage and extracts text content for AI analysis.
+ * Supports PDF and plain text files. Used when round config has aiParseFiles=true.
+ *
+ * Limits:
+ * - Max 50,000 characters of extracted text per file (to stay within AI token limits)
+ * - Only PDF and text-based files are parsed
+ * - Extraction failures are non-fatal (file is skipped)
+ */
+
+import { getStorageProvider } from '@/lib/storage'
+
+// Cap on the extracted text kept per file, measured in string length
+// (UTF-16 code units; roughly 50KB for ASCII content).
+const MAX_TEXT_PER_FILE = 50_000
+
+// Media types accepted for extraction. PDFs are parsed with pdf-parse;
+// every other entry is decoded as UTF-8 text.
+const PARSEABLE_MIME_TYPES: readonly string[] = [
+  'application/pdf',
+  'application/rtf',
+  'text/csv',
+  'text/html',
+  'text/markdown',
+  'text/plain',
+]
+
+/**
+ * Result of attempting to extract text from one stored file.
+ */
+export type ExtractedFileContent = {
+  /** Identifier of the file, passed through from the caller. */
+  fileId: string
+  /** Name of the file, passed through from the caller. */
+  fileName: string
+  /** Extracted text (possibly truncated), or null when the file was skipped or extraction failed. */
+  content: string | null
+  /** Reason the content is null; absent on success. */
+  error?: string
+}
+
+/**
+ * Report whether a file's media type is one this extractor can turn into text.
+ *
+ * Only the type/subtype part is compared: parameters after `;` (for example
+ * `text/plain; charset=utf-8`) are ignored, and the remainder must equal a
+ * PARSEABLE_MIME_TYPES entry exactly, so look-alike types that merely share a
+ * prefix (such as `text/csv-schema`) are not accepted.
+ *
+ * @param mimeType - Media type string recorded for the file.
+ * @returns true when the type/subtype is listed in PARSEABLE_MIME_TYPES; false
+ *          otherwise, including when the value is not a string at runtime.
+ */
+export function isParseableMimeType(mimeType: string): boolean {
+  // Defensive guard: a value that bypasses the static type (e.g. null from
+  // untyped data) is treated as "not parseable" instead of throwing.
+  if (typeof mimeType !== 'string') return false
+
+  const [essence = ''] = mimeType.split(';', 1)
+  return PARSEABLE_MIME_TYPES.includes(essence.trim())
+}
+
+/**
+ * Download one stored file and extract its text for AI analysis.
+ *
+ * PDFs are parsed with pdf-parse (loaded lazily); every other accepted type is
+ * decoded as UTF-8. Text longer than MAX_TEXT_PER_FILE is cut and suffixed with
+ * a truncation marker. Storage and parsing failures are not thrown: they are
+ * logged with console.warn and reported through the `error` field.
+ *
+ * @param objectKey - Key of the object to fetch from the storage provider.
+ * @param mimeType - Media type recorded for the file; may carry parameters
+ *                   such as `; charset=utf-8`.
+ * @param fileName - File name, used in the result and in the warning log.
+ * @param fileId - File identifier, passed through to the result.
+ * @returns The extracted text, or `content: null` with an `error` message when
+ *          the type is unsupported or extraction fails.
+ */
+export async function extractFileContent(
+  objectKey: string,
+  mimeType: string,
+  fileName: string,
+  fileId: string,
+): Promise<ExtractedFileContent> {
+  if (!isParseableMimeType(mimeType)) {
+    return { fileId, fileName, content: null, error: 'Unsupported mime type' }
+  }
+
+  try {
+    // Compare only the type/subtype so a parameterised value such as
+    // `application/pdf; charset=binary` is routed to the PDF parser instead of
+    // having its binary bytes decoded as UTF-8 text.
+    const [essence = ''] = mimeType.split(';', 1)
+    const isPdf = essence.trim() === 'application/pdf'
+
+    const storage = await getStorageProvider()
+    const buffer = await storage.getObject(objectKey)
+
+    let text: string
+
+    if (isPdf) {
+      // Dynamic import to avoid loading pdf-parse when not needed.
+      // The parser may be the module itself or its `default` export depending
+      // on how the module system resolves the package, so accept either.
+      const pdfParseModule = await import('pdf-parse')
+      const pdfParse = typeof pdfParseModule === 'function' ? pdfParseModule : (pdfParseModule as any).default ?? pdfParseModule
+      const pdf = await pdfParse(buffer)
+      text = pdf.text
+    } else {
+      // Text-based files
+      text = buffer.toString('utf-8')
+    }
+
+    // Truncate to limit
+    if (text.length > MAX_TEXT_PER_FILE) {
+      let end = MAX_TEXT_PER_FILE
+      // Do not cut between the two halves of a surrogate pair: a lone high
+      // surrogate at the end would make the string ill-formed Unicode.
+      const lastUnit = text.charCodeAt(end - 1)
+      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
+        end -= 1
+      }
+      text = text.slice(0, end) + '\n[... content truncated ...]'
+    }
+
+    return { fileId, fileName, content: text }
+  } catch (error) {
+    // Non-fatal by design: log and hand the failure back to the caller as data.
+    console.warn(`[FileExtractor] Failed to extract content from ${fileName}:`, error)
+    return {
+      fileId,
+      fileName,
+      content: null,
+      error: error instanceof Error ? error.message : 'Extraction failed',
+    }
+  }
+}
+
+/**
+ * Extract content from multiple files in parallel.
+ *
+ * Files whose media type is not parseable are dropped from the output. The
+ * remaining files are processed concurrently and their results are returned in
+ * the same relative order as the input. Non-fatal: a file whose extraction
+ * fails is returned with null content and an `error` message, and an
+ * unexpected rejection is reported with its own message rather than a generic
+ * placeholder.
+ *
+ * @param files - Stored-file descriptors (id, name, media type, object key).
+ * @returns One result per parseable input file; an empty array when none qualify.
+ */
+export async function extractMultipleFileContents(
+  files: Array<{
+    id: string
+    fileName: string
+    mimeType: string
+    objectKey: string
+  }>,
+): Promise<ExtractedFileContent[]> {
+  const parseableFiles = files.filter((f) => isParseableMimeType(f.mimeType))
+
+  // Each task catches its own failure, so Promise.all never rejects and every
+  // result stays paired with the file it came from without index lookups.
+  return Promise.all(
+    parseableFiles.map(async (file): Promise<ExtractedFileContent> => {
+      try {
+        return await extractFileContent(file.objectKey, file.mimeType, file.fileName, file.id)
+      } catch (error) {
+        return {
+          fileId: file.id,
+          fileName: file.fileName,
+          content: null,
+          error: error instanceof Error ? error.message : 'Promise rejected',
+        }
+      }
+    }),
+  )
+}