AI category-aware evaluation: per-round config, file parsing, shortlist, advance flow

- Per-juror cap mode (HARD/SOFT/NONE) in add-member dialog and members table
- Jury invite flow: create user + add to group + send invitation from dialog
- Per-round config: notifyOnAdvance, aiParseFiles, startupAdvanceCount, conceptAdvanceCount
- Moved notify-on-advance from competition-level to per-round setting
- AI filtering: round-tagged files with newest-first sorting, optional file content extraction
- File content extractor service (pdf-parse for PDF, utf-8 for text files)
- AI shortlist runs independently per category (STARTUP / BUSINESS_CONCEPT)
- generateAIRecommendations tRPC endpoint with per-round config integration
- AI recommendations UI: trigger button, confirmation dialog, per-category results display
- Category-aware advance dialog: select/deselect projects by category with target caps
- STAGE_ACTIVE bug fix in assignment router

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 10:09:52 +01:00
parent 93f4ad4b31
commit 80c9e35971
21 changed files with 1886 additions and 1381 deletions
--- a/src/server/services/ai-filtering.ts
+++ b/src/server/services/ai-filtering.ts
@@ -112,7 +112,16 @@ interface ProjectForFiltering {
  institution?: string | null
  submissionSource?: SubmissionSource
  submittedAt?: Date | null
-  files: Array<{ id: string; fileName: string; fileType?: FileType | null; size?: number; pageCount?: number | null }>
+  files: Array<{
+    id: string
+    fileName: string
+    fileType?: FileType | null
+    size?: number
+    pageCount?: number | null
+    roundName?: string | null
+    isCurrentRound?: boolean
+    textContent?: string
+  }>
  _count?: {
    teamMembers?: number
    files?: number
@@ -170,9 +179,10 @@ Return a JSON object with this exact structure:
 - founded_year: when the company/initiative was founded (use for age checks)
 - ocean_issue: the ocean conservation area
 - file_count, file_types: uploaded documents summary
- files[]: per-file details with file_type, page_count (if known), and size_kb
+- files[]: per-file details with file_type, page_count (if known), size_kb, round_name (which round the file was submitted for), and is_current_round flag
 - description: project summary text
 - tags: topic tags
+- If document content is provided (text_content field in files), use it for deeper analysis. Pay SPECIAL ATTENTION to files from the current round (is_current_round=true) as they are the most recent and relevant submissions.

 ## Guidelines
 - Evaluate ONLY against the provided criteria, not your own standards
--- a/src/server/services/ai-shortlist.ts
+++ b/src/server/services/ai-shortlist.ts
@@ -2,7 +2,8 @@
 * AI Shortlist Service
 *
 * Generates ranked recommendations at end of evaluation rounds.
- * Follows patterns from ai-filtering.ts and ai-evaluation-summary.ts.
+ * Runs SEPARATELY for each category (STARTUP / BUSINESS_CONCEPT)
+ * to produce independent rankings per the competition's advancement rules.
 *
 * GDPR Compliance:
 * - All project data is anonymized before AI processing
@@ -12,124 +13,43 @@
 import { getOpenAI, getConfiguredModel, buildCompletionParams } from '@/lib/openai'
 import { logAIUsage, extractTokenUsage } from '@/server/utils/ai-usage'
 import { classifyAIError, logAIError } from './ai-errors'
+import { extractMultipleFileContents } from './file-content-extractor'
 import type { PrismaClient } from '@prisma/client'

 // ─── Types ──────────────────────────────────────────────────────────────────

 export type ShortlistResult = {
  success: boolean
-  recommendations: ShortlistRecommendation[]
+  recommendations: CategoryRecommendations
  errors?: string[]
  tokensUsed?: number
 }

+export type CategoryRecommendations = {
+  STARTUP: ShortlistRecommendation[]
+  BUSINESS_CONCEPT: ShortlistRecommendation[]
+}
+
 export type ShortlistRecommendation = {
  projectId: string
  rank: number
  score: number
+  category: string
  strengths: string[]
  concerns: string[]
  recommendation: string
 }

-// ─── Main Function ──────────────────────────────────────────────────────────
+// ─── Prompt Building ────────────────────────────────────────────────────────

-/**
- * Generate an AI shortlist for projects in a round.
- * Only runs if EvaluationConfig.generateAiShortlist is true.
- */
-export async function generateShortlist(
-  params: {
-    roundId: string
-    competitionId: string
-    category?: string
-    topN?: number
-    rubric?: string
-  },
-  prisma: PrismaClient | any,
-): Promise<ShortlistResult> {
-  const { roundId, competitionId, category, topN = 10, rubric } = params
+function buildShortlistPrompt(category: string, topN: number, rubric?: string): string {
+  const categoryLabel = category === 'STARTUP' ? 'Startup' : 'Business Concept'

-  try {
-    // Load projects with evaluations
-    const where: Record<string, unknown> = {
-      assignments: { some: { roundId } },
-    }
-    if (category) {
-      where.competitionCategory = category
-    }
-
-    const projects = await prisma.project.findMany({
-      where,
-      include: {
-        assignments: {
-          where: { roundId },
-          include: {
-            evaluation: true,
-          },
-        },
-        projectTags: { include: { tag: true } },
-        files: { select: { id: true, type: true } },
-        teamMembers: { select: { user: { select: { name: true } } } },
-      },
-    })
-
-    if (projects.length === 0) {
-      return {
-        success: true,
-        recommendations: [],
-        errors: ['No projects found for this round'],
-      }
-    }
-
-    // Aggregate scores per project
-    const projectSummaries = projects.map((project: any) => {
-      const evaluations = project.assignments
-        .map((a: any) => a.evaluation)
-        .filter(Boolean)
-        .filter((e: any) => e.status === 'SUBMITTED')
-
-      const scores = evaluations.map((e: any) => e.globalScore ?? 0)
-      const avgScore = scores.length > 0
-        ? scores.reduce((sum: number, s: number) => sum + s, 0) / scores.length
-        : 0
-
-      const feedbacks = evaluations
-        .map((e: any) => e.feedbackGeneral)
-        .filter(Boolean)
-
-      return {
-        id: project.id,
-        title: project.title,
-        description: project.description,
-        category: project.competitionCategory,
-        tags: project.projectTags.map((pt: any) => pt.tag.name),
-        avgScore,
-        evaluationCount: evaluations.length,
-        feedbackSamples: feedbacks.slice(0, 3), // Max 3 feedback samples
-      }
-    })
-
-    // Anonymize for AI
-    const anonymized = projectSummaries.map((p: any, index: number) => ({
-      anonymousId: `PROJECT_${String(index + 1).padStart(3, '0')}`,
-      ...p,
-      // Strip identifying info
-      title: undefined,
-      id: undefined,
-    }))
-
-    // Build idMap for de-anonymization
-    const idMap = new Map<string, string>()
-    projectSummaries.forEach((p: any, index: number) => {
-      idMap.set(`PROJECT_${String(index + 1).padStart(3, '0')}`, p.id)
-    })
-
-    // Build prompt
-    const systemPrompt = `You are a senior jury advisor for the Monaco Ocean Protection Challenge.
+  return `You are a senior jury advisor for the Monaco Ocean Protection Challenge.

 ## Your Role
-Analyze aggregated evaluation data to produce a ranked shortlist of top projects.
+Analyze aggregated evaluation data to produce a ranked shortlist of the top ${topN} ${categoryLabel} projects.
+You are evaluating ONLY ${categoryLabel} projects in this batch — rank them against each other within this category.

 ## Ranking Criteria (Weighted)
 - Evaluation Scores (40%): Average scores across all jury evaluations
@@ -137,6 +57,12 @@ Analyze aggregated evaluation data to produce a ranked shortlist of top projects
 - Feasibility (20%): Likelihood of successful implementation
 - Alignment (15%): Fit with ocean protection mission and competition goals

+## Document Analysis
+If document content is provided (text_content field in files), use it for deeper qualitative analysis.
+Pay SPECIAL ATTENTION to files marked with is_current_round=true — these are the most recent submissions.
+Older documents provide context, but recent ones should carry more weight in your assessment.
+
+${rubric ? `## Custom Evaluation Rubric\n${rubric}\n` : ''}
 ## Output Format
 Return a JSON array:
 [
@@ -146,129 +72,305 @@ Return a JSON array:
    "score": 0-100,
    "strengths": ["strength 1", "strength 2"],
    "concerns": ["concern 1"],
-    "recommendation": "1-2 sentence recommendation",
-    "criterionBreakdown": {
-      "evaluationScores": 38,
-      "innovationImpact": 22,
-      "feasibility": 18,
-      "alignment": 14
-    }
+    "recommendation": "1-2 sentence recommendation"
  }
 ]

 ## Guidelines
- Only include the requested number of top projects
+- Only include the top ${topN} projects in your ranking
 - Score should reflect weighted combination of all criteria
 - Be specific in strengths and concerns — avoid generic statements
 - Consider feedback themes and evaluator consensus
- Higher evaluator consensus should boost confidence in ranking`
+- Higher evaluator consensus should boost confidence in ranking
+- Do not include any personal identifiers`
+}

-    const userPrompt = `Analyze these anonymized project evaluations and produce a ranked shortlist of the top ${topN} projects.
+// ─── Single Category Processing ─────────────────────────────────────────────

-${rubric ? `Evaluation rubric:\n${rubric}\n\n` : ''}Projects:
+async function generateCategoryShortlist(
+  params: {
+    roundId: string
+    category: string
+    topN: number
+    rubric?: string
+    aiParseFiles: boolean
+  },
+  prisma: PrismaClient | any,
+): Promise<{ recommendations: ShortlistRecommendation[]; tokensUsed: number; errors: string[] }> {
+  const { roundId, category, topN, rubric, aiParseFiles } = params
+
+  // Load projects with evaluations for this category
+  const projects = await prisma.project.findMany({
+    where: {
+      competitionCategory: category,
+      assignments: { some: { roundId } },
+    },
+    include: {
+      assignments: {
+        where: { roundId },
+        include: { evaluation: true },
+      },
+      projectTags: { include: { tag: true } },
+      files: {
+        select: {
+          id: true,
+          fileName: true,
+          fileType: true,
+          mimeType: true,
+          size: true,
+          pageCount: true,
+          objectKey: true,
+          roundId: true,
+          createdAt: true,
+        },
+        orderBy: { createdAt: 'desc' as const },
+      },
+      teamMembers: { select: { user: { select: { name: true } } } },
+    },
+  })
+
+  if (projects.length === 0) {
+    return {
+      recommendations: [],
+      tokensUsed: 0,
+      errors: [`No ${category} projects found for this round`],
+    }
+  }
+
+  // Get round names for file tagging
+  const roundIds = new Set<string>()
+  for (const p of projects) {
+    for (const f of (p as any).files || []) {
+      if (f.roundId) roundIds.add(f.roundId)
+    }
+  }
+  const roundNames = new Map<string, string>()
+  if (roundIds.size > 0) {
+    const rounds = await prisma.round.findMany({
+      where: { id: { in: [...roundIds] } },
+      select: { id: true, name: true },
+    })
+    for (const r of rounds) roundNames.set(r.id, r.name)
+  }
+
+  // Optionally extract file contents
+  let fileContents: Map<string, string> | undefined
+  if (aiParseFiles) {
+    const allFiles = projects.flatMap((p: any) =>
+      ((p.files || []) as Array<{ id: string; fileName: string; mimeType: string; objectKey: string }>)
+    )
+    const extractions = await extractMultipleFileContents(allFiles)
+    fileContents = new Map()
+    for (const e of extractions) {
+      if (e.content) fileContents.set(e.fileId, e.content)
+    }
+  }
+
+  // Aggregate scores per project
+  const projectSummaries = projects.map((project: any) => {
+    const evaluations = project.assignments
+      .map((a: any) => a.evaluation)
+      .filter(Boolean)
+      .filter((e: any) => e.status === 'SUBMITTED')
+
+    const scores = evaluations.map((e: any) => e.globalScore ?? 0)
+    const avgScore = scores.length > 0
+      ? scores.reduce((sum: number, s: number) => sum + s, 0) / scores.length
+      : 0
+
+    const feedbacks = evaluations
+      .map((e: any) => e.feedbackGeneral || e.feedbackText)
+      .filter(Boolean)
+
+    return {
+      id: project.id,
+      description: project.description,
+      category: project.competitionCategory,
+      tags: project.projectTags.map((pt: any) => pt.tag.name),
+      avgScore,
+      evaluationCount: evaluations.length,
+      feedbackSamples: feedbacks.slice(0, 3),
+      files: (project.files || []).map((f: any) => ({
+        file_type: f.fileType ?? 'OTHER',
+        page_count: f.pageCount ?? null,
+        size_kb: Math.round((f.size ?? 0) / 1024),
+        round_name: f.roundId ? (roundNames.get(f.roundId) || null) : null,
+        is_current_round: f.roundId === roundId,
+        ...(fileContents?.get(f.id) ? { text_content: fileContents.get(f.id) } : {}),
+      })),
+    }
+  })
+
+  // Anonymize for AI
+  const anonymized = projectSummaries.map((p: any, index: number) => ({
+    anonymousId: `PROJECT_${String(index + 1).padStart(3, '0')}`,
+    ...p,
+    id: undefined,
+  }))
+
+  // Build idMap for de-anonymization
+  const idMap = new Map<string, string>()
+  projectSummaries.forEach((p: any, index: number) => {
+    idMap.set(`PROJECT_${String(index + 1).padStart(3, '0')}`, p.id)
+  })
+
+  // Call AI
+  const openai = await getOpenAI()
+  const model = await getConfiguredModel()
+
+  if (!openai) {
+    return { recommendations: [], tokensUsed: 0, errors: ['OpenAI client not configured'] }
+  }
+
+  const systemPrompt = buildShortlistPrompt(category, topN, rubric)
+  const userPrompt = `Analyze these anonymized ${category} project evaluations and produce a ranked shortlist of the top ${topN}.
+
+Projects (${anonymized.length} total):
 ${JSON.stringify(anonymized, null, 2)}

-Return a JSON array following the format specified in your instructions. Only include the top ${topN} projects. Rank by overall quality considering scores and feedback.`
+Return a JSON array following the format specified. Only include the top ${topN} projects. Rank by overall quality within this category.`

-    const openai = await getOpenAI()
-    const model = await getConfiguredModel()
+  const MAX_PARSE_RETRIES = 2
+  let parseAttempts = 0
+  let response = await openai.chat.completions.create(
+    buildCompletionParams(model, {
+      messages: [
+        { role: 'system', content: systemPrompt },
+        { role: 'user', content: userPrompt },
+      ],
+      temperature: 0.1,
+      jsonMode: true,
+    }),
+  )

-    if (!openai) {
-      return {
-        success: false,
-        recommendations: [],
-        errors: ['OpenAI client not configured'],
+  let tokenUsage = extractTokenUsage(response)
+
+  await logAIUsage({
+    action: 'SHORTLIST',
+    model,
+    promptTokens: tokenUsage.promptTokens,
+    completionTokens: tokenUsage.completionTokens,
+    totalTokens: tokenUsage.totalTokens,
+    status: 'SUCCESS',
+  })
+
+  // Parse response
+  let parsed: any[]
+  while (true) {
+    try {
+      const content = response.choices[0]?.message?.content
+      if (!content) {
+        return { recommendations: [], tokensUsed: tokenUsage.totalTokens, errors: ['Empty AI response'] }
      }
-    }
-
-    const MAX_PARSE_RETRIES = 2
-    let parseAttempts = 0
-    let response = await openai.chat.completions.create(
-      buildCompletionParams(model, {
-        messages: [
-          { role: 'system', content: systemPrompt },
-          { role: 'user', content: userPrompt },
-        ],
-        temperature: 0.1,
-        jsonMode: true,
-      }),
-    )
-
-    let tokenUsage = extractTokenUsage(response)
-
-    await logAIUsage({
-      action: 'FILTERING',
-      model,
-      promptTokens: tokenUsage.promptTokens,
-      completionTokens: tokenUsage.completionTokens,
-      totalTokens: tokenUsage.totalTokens,
-      status: 'SUCCESS',
-    })
-
-    // Parse response with retry logic
-    let parsed: any[]
-    while (true) {
-      try {
-        const content = response.choices[0]?.message?.content
-        if (!content) {
-          return {
-            success: false,
-            recommendations: [],
-            errors: ['Empty AI response'],
-            tokensUsed: tokenUsage.totalTokens,
-          }
-        }
-
-        const json = JSON.parse(content)
-        parsed = Array.isArray(json) ? json : json.rankings ?? json.projects ?? json.shortlist ?? []
-        break
-      } catch (parseError) {
-        if (parseError instanceof SyntaxError && parseAttempts < MAX_PARSE_RETRIES) {
-          parseAttempts++
-          console.warn(`[AI Shortlist] JSON parse failed, retrying (${parseAttempts}/${MAX_PARSE_RETRIES})`)
-
-          // Retry the API call with hint
-          response = await openai.chat.completions.create(
-            buildCompletionParams(model, {
-              messages: [
-                { role: 'system', content: systemPrompt },
-                { role: 'user', content: userPrompt + '\n\nIMPORTANT: Please ensure valid JSON output.' },
-              ],
-              temperature: 0.1,
-              jsonMode: true,
-            }),
-          )
-          const retryUsage = extractTokenUsage(response)
-          tokenUsage.totalTokens += retryUsage.totalTokens
-          continue
-        }
-
-        return {
-          success: false,
-          recommendations: [],
-          errors: ['Failed to parse AI response as JSON'],
-          tokensUsed: tokenUsage.totalTokens,
-        }
+      const json = JSON.parse(content)
+      parsed = Array.isArray(json) ? json : json.rankings ?? json.projects ?? json.shortlist ?? []
+      break
+    } catch (parseError) {
+      if (parseError instanceof SyntaxError && parseAttempts < MAX_PARSE_RETRIES) {
+        parseAttempts++
+        response = await openai.chat.completions.create(
+          buildCompletionParams(model, {
+            messages: [
+              { role: 'system', content: systemPrompt },
+              { role: 'user', content: userPrompt + '\n\nIMPORTANT: Please ensure valid JSON output.' },
+            ],
+            temperature: 0.1,
+            jsonMode: true,
+          }),
+        )
+        const retryUsage = extractTokenUsage(response)
+        tokenUsage.totalTokens += retryUsage.totalTokens
+        continue
      }
+      return { recommendations: [], tokensUsed: tokenUsage.totalTokens, errors: ['Failed to parse AI response'] }
    }
+  }

-    // De-anonymize and build recommendations
-    const recommendations: ShortlistRecommendation[] = parsed
-      .filter((item: any) => item.anonymousId && idMap.has(item.anonymousId))
-      .map((item: any) => ({
-        projectId: idMap.get(item.anonymousId)!,
-        rank: item.rank ?? 0,
-        score: item.score ?? 0,
-        strengths: item.strengths ?? [],
-        concerns: item.concerns ?? [],
-        recommendation: item.recommendation ?? '',
-      }))
-      .sort((a: ShortlistRecommendation, b: ShortlistRecommendation) => a.rank - b.rank)
+  // De-anonymize
+  const recommendations: ShortlistRecommendation[] = parsed
+    .filter((item: any) => item.anonymousId && idMap.has(item.anonymousId))
+    .map((item: any) => ({
+      projectId: idMap.get(item.anonymousId)!,
+      rank: item.rank ?? 0,
+      score: item.score ?? 0,
+      category,
+      strengths: item.strengths ?? [],
+      concerns: item.concerns ?? [],
+      recommendation: item.recommendation ?? '',
+    }))
+    .sort((a: ShortlistRecommendation, b: ShortlistRecommendation) => a.rank - b.rank)
+
+  return { recommendations, tokensUsed: tokenUsage.totalTokens, errors: [] }
+}
+
+// ─── Main Function ──────────────────────────────────────────────────────────
+
+/**
+ * Generate an AI shortlist for projects in a round, split by category.
+ * Runs independently for STARTUP and BUSINESS_CONCEPT.
+ */
+export async function generateShortlist(
+  params: {
+    roundId: string
+    competitionId: string
+    category?: string // If provided, only run for this category
+    topN?: number // Global fallback
+    startupTopN?: number // Per-category override
+    conceptTopN?: number // Per-category override
+    rubric?: string
+    aiParseFiles?: boolean
+  },
+  prisma: PrismaClient | any,
+): Promise<ShortlistResult> {
+  const {
+    roundId,
+    category,
+    topN = 10,
+    startupTopN,
+    conceptTopN,
+    rubric,
+    aiParseFiles = false,
+  } = params
+
+  try {
+    const categories = category
+      ? [category]
+      : ['STARTUP', 'BUSINESS_CONCEPT']
+
+    const allRecommendations: CategoryRecommendations = {
+      STARTUP: [],
+      BUSINESS_CONCEPT: [],
+    }
+    let totalTokens = 0
+    const allErrors: string[] = []
+
+    // Run each category independently
+    for (const cat of categories) {
+      const catTopN = cat === 'STARTUP'
+        ? (startupTopN ?? topN)
+        : (conceptTopN ?? topN)
+
+      console.log(`[AI Shortlist] Generating top-${catTopN} for ${cat}`)
+
+      const result = await generateCategoryShortlist(
+        { roundId, category: cat, topN: catTopN, rubric, aiParseFiles },
+        prisma,
+      )
+
+      if (cat === 'STARTUP') {
+        allRecommendations.STARTUP = result.recommendations
+      } else {
+        allRecommendations.BUSINESS_CONCEPT = result.recommendations
+      }
+      totalTokens += result.tokensUsed
+      allErrors.push(...result.errors)
+    }

    return {
      success: true,
-      recommendations,
-      tokensUsed: tokenUsage.totalTokens,
+      recommendations: allRecommendations,
+      tokensUsed: totalTokens,
+      errors: allErrors.length > 0 ? allErrors : undefined,
    }
  } catch (error) {
    const classification = classifyAIError(error)
@@ -277,7 +379,7 @@ Return a JSON array following the format specified in your instructions. Only in

    return {
      success: false,
-      recommendations: [],
+      recommendations: { STARTUP: [], BUSINESS_CONCEPT: [] },
      errors: [error instanceof Error ? error.message : 'AI shortlist generation failed'],
    }
  }
--- a/src/server/services/anonymization.ts
+++ b/src/server/services/anonymization.ts
@@ -83,6 +83,9 @@ export interface AnonymizedFileInfo {
  file_type: string // FileType enum value
  page_count: number | null // Number of pages if known
  size_kb: number // File size in KB
+  round_name?: string | null // Which round the file was submitted for
+  is_current_round?: boolean // Whether this file belongs to the current filtering/evaluation round
+  text_content?: string // Extracted text content (when aiParseFiles is enabled)
 }

 export interface AnonymizedProjectForAI {
@@ -299,10 +302,13 @@ export function anonymizeProjectForAI(
    file_types: project.files
      ?.map((f) => f.fileType)
      .filter((ft): ft is FileType => ft !== null) ?? [],
-    files: project.files?.map((f) => ({
+    files: project.files?.map((f: any) => ({
      file_type: f.fileType ?? 'OTHER',
      page_count: f.pageCount ?? null,
      size_kb: Math.round((f.size ?? 0) / 1024),
+      ...(f.roundName ? { round_name: f.roundName } : {}),
+      ...(f.isCurrentRound !== undefined ? { is_current_round: f.isCurrentRound } : {}),
+      ...(f.textContent ? { text_content: f.textContent } : {}),
    })) ?? [],
    wants_mentorship: project.wantsMentorship ?? false,
    submission_source: project.submissionSource,
--- a/src/server/services/file-content-extractor.ts
+++ b/src/server/services/file-content-extractor.ts
@@ -0,0 +1,112 @@
+/**
+ * File Content Extractor
+ *
+ * Downloads files from storage and extracts text content for AI analysis.
+ * Supports PDF and plain text files. Used when round config has aiParseFiles=true.
+ *
+ * Limits:
+ * - Max 50KB of extracted text per file (to stay within AI token limits)
+ * - Only PDF and text-based files are parsed
+ * - Extraction failures are non-fatal (file is skipped)
+ */
+
+import { getStorageProvider } from '@/lib/storage'
+
+const MAX_TEXT_PER_FILE = 50_000 // extracted text is cut to this many characters (string length, not bytes)
+const PARSEABLE_MIME_TYPES = [
+  'application/pdf', // parsed with pdf-parse
+  'text/plain', // this entry and every one below is decoded as UTF-8 text
+  'text/csv',
+  'text/markdown',
+  'text/html',
+  'application/rtf',
+]
+
+export type ExtractedFileContent = {
+  fileId: string // id of the source file, echoed back so callers can match results to files
+  fileName: string // echoed back from the input
+  content: string | null // extracted text; null when the type is unsupported or extraction failed
+  error?: string // set together with content: null to say why nothing was extracted
+}
+
+/** Reports whether `mimeType` starts with any PARSEABLE_MIME_TYPES entry (prefix match, case-sensitive). */
+export function isParseableMimeType(mimeType: string): boolean {
+  // Prefix comparison lets values that carry parameters, e.g. "text/plain; charset=utf-8", pass.
+  const matchesPrefix = (supported: string): boolean => mimeType.startsWith(supported)
+  return PARSEABLE_MIME_TYPES.some(matchesPrefix)
+}
+
+/**
+ * Downloads one stored object and extracts its text for AI analysis.
+ *
+ * @param objectKey - key handed to the storage provider's getObject
+ * @param mimeType - declared media type; may carry parameters ("application/pdf; x=y")
+ * @param fileName - echoed in the result and used in the warning log
+ * @param fileId - echoed in the result so callers can match it to the file record
+ * @returns extracted text cut to MAX_TEXT_PER_FILE characters (plus a truncation marker),
+ *   or `content: null` with `error` set when the type is unsupported or when the
+ *   download/parse step throws (those failures are caught, logged and reported).
+ */
+export async function extractFileContent(
+  objectKey: string,
+  mimeType: string,
+  fileName: string,
+  fileId: string,
+): Promise<ExtractedFileContent> {
+  if (!isParseableMimeType(mimeType)) {
+    return { fileId, fileName, content: null, error: 'Unsupported mime type' }
+  }
+
+  try {
+    const storage = await getStorageProvider()
+    const buffer = await storage.getObject(objectKey)
+    // The gate above accepts a prefix match, so compare the bare type here; an exact
+    // string comparison would send "application/pdf; ..." down the UTF-8 branch.
+    const isPdf = mimeType.split(';')[0].trim().toLowerCase() === 'application/pdf'
+    let text: string
+    if (isPdf) {
+      // Dynamic import to avoid loading pdf-parse when not needed
+      const pdfParseModule = await import('pdf-parse')
+      const pdfParse = typeof pdfParseModule === 'function' ? pdfParseModule : (pdfParseModule as any).default ?? pdfParseModule
+      const pdf = await pdfParse(buffer)
+      text = pdf.text
+    } else {
+      text = buffer.toString('utf-8') // text-based files
+    }
+    if (text.length > MAX_TEXT_PER_FILE) {
+      text = text.slice(0, MAX_TEXT_PER_FILE) + '\n[... content truncated ...]'
+    }
+    return { fileId, fileName, content: text }
+  } catch (error) {
+    console.warn(`[FileExtractor] Failed to extract content from ${fileName}:`, error)
+    const message = error instanceof Error ? error.message : 'Extraction failed'
+    return { fileId, fileName, content: null, error: message }
+  }
+}
+
+/**
+ * Extract content from multiple files, at most BATCH_SIZE at a time so a large round does
+ * not download and parse every document simultaneously. Files with an unparseable mime type
+ * are omitted; files whose extraction fails are returned with null content (non-fatal).
+ * Results keep the order of the parseable input files.
+ */
+export async function extractMultipleFileContents(
+  files: Array<{
+    id: string
+    fileName: string
+    mimeType: string
+    objectKey: string
+  }>,
+): Promise<ExtractedFileContent[]> {
+  const BATCH_SIZE = 5
+  const parseableFiles = files.filter((f) => isParseableMimeType(f.mimeType))
+  const results: ExtractedFileContent[] = []
+  for (let start = 0; start < parseableFiles.length; start += BATCH_SIZE) {
+    const batch = parseableFiles.slice(start, start + BATCH_SIZE)
+    const settled = await Promise.allSettled(batch.map((f) => extractFileContent(f.objectKey, f.mimeType, f.fileName, f.id)))
+    settled.forEach((r, i) => {
+      results.push(r.status === 'fulfilled' ? r.value : { fileId: batch[i].id, fileName: batch[i].fileName, content: null, error: 'Promise rejected' })
+    })
+  }
+  return results
+}