// File: MOPC-Portal/src/server/services/ai-shortlist.ts (TypeScript, 452 lines, 15 KiB)

/**
* AI Shortlist Service
*
* Generates ranked recommendations at end of evaluation rounds.
* Runs SEPARATELY for each category (STARTUP / BUSINESS_CONCEPT)
* to produce independent rankings per the competition's advancement rules.
*
* GDPR Compliance:
* - All project data is anonymized before AI processing
* - No personal identifiers in prompts or responses
*/
import { getOpenAI, getConfiguredModel, buildCompletionParams } from '@/lib/openai'
import { logAIUsage, extractTokenUsage } from '@/server/utils/ai-usage'
import { classifyAIError, logAIError } from './ai-errors'
import { extractMultipleFileContents } from './file-content-extractor'
// Change note (2026-04-29 04:14:58 +02:00):
// fix(security): route ai-shortlist through canonical anonymization pipeline
//
// ai-shortlist was sending raw project.description, raw juror feedback text
// (feedbackGeneral / feedbackText), and full extracted file text content
// directly to OpenAI as part of the user prompt. Its only "anonymization" was
// renaming `id` to `anonymousId`. This bypassed the GDPR contract documented in
// the file's own header comment ("All project data is anonymized before AI
// processing — No personal identifiers in prompts") and in CLAUDE.md ("All AI
// calls anonymize data before sending to OpenAI").
//
// A juror writing "Contact applicant Jane Doe at jane@example.com" in feedback
// would ship that PII to OpenAI verbatim every time an admin generated a
// shortlist. The same applied to any names / emails / phone numbers embedded
// in extracted PDF text.
//
// generateCategoryShortlist now mirrors the pattern used by ai-filtering /
// ai-tagging / ai-award-eligibility:
// - toProjectWithRelations + anonymizeProjectsForAI(_, 'FILTERING')
// - validateAnonymizedProjects gate that aborts on detected PII
// - Aggregates (avgScore, evaluationCount, feedbackSamples) computed separately
//   and merged onto the anonymized projects; each feedback sample passes
//   through sanitizeText (strips email/phone/url/ssn) and is truncated to
//   1000 chars.
//
// Defense-in-depth fix in the shared helper: anonymizeProjectForAI now also
// runs sanitizeText over each file's text_content before emitting it to AI
// services. Previously the helper passed extracted file text through
// unchanged, which would have leaked PII from PDF body text via ai-filtering /
// ai-tagging / ai-award-eligibility too if those services turn on aiParseFiles.
//
// Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
import {
toProjectWithRelations,
anonymizeProjectsForAI,
validateAnonymizedProjects,
sanitizeText,
} from './anonymization'
import type { PrismaClient, CompetitionCategory } from '@prisma/client'
// ─── Types ──────────────────────────────────────────────────────────────────
/**
 * Outcome of a shortlist generation run, as returned by `generateShortlist`.
 */
export type ShortlistResult = {
  /**
   * `false` when an error was thrown during generation. Problems that a
   * category reports without throwing (see `errors`) leave it `true`.
   */
  success: boolean
  /** Ranked recommendations, grouped by competition category. */
  recommendations: CategoryRecommendations
  /** Messages collected while generating; omitted when there were none. */
  errors?: string[]
  /** Total AI tokens reported across the processed categories, when available. */
  tokensUsed?: number
}
/**
 * Shortlist recommendations keyed by competition category. A category that
 * was not processed, or produced no ranking, holds an empty array.
 */
export type CategoryRecommendations = {
  /** Recommendations for projects in the STARTUP category. */
  STARTUP: ShortlistRecommendation[]
  /** Recommendations for projects in the BUSINESS_CONCEPT category. */
  BUSINESS_CONCEPT: ShortlistRecommendation[]
}
/**
 * One AI-produced shortlist entry, already mapped back from the anonymous id
 * used in the prompt to the real project.
 */
export type ShortlistRecommendation = {
  /** Real project id, restored from the anonymous id the AI returned. */
  projectId: string
  /** Position reported by the AI; entries are sorted ascending by it. 0 when the AI omitted it. */
  rank: number
  /** Score reported by the AI (the prompt asks for a 0-100 value). 0 when the AI omitted it. */
  score: number
  /** Category the entry was generated for, e.g. `STARTUP` or `BUSINESS_CONCEPT`. */
  category: string
  /** Strengths listed by the AI; empty when none were returned. */
  strengths: string[]
  /** Concerns listed by the AI; empty when none were returned. */
  concerns: string[]
  /** Short recommendation text from the AI; empty string when missing. */
  recommendation: string
}
// ─── Prompt Building ────────────────────────────────────────────────────────
/**
 * Assembles the system prompt that instructs the model how to rank one
 * category's projects.
 *
 * @param category Competition category; `STARTUP` is labelled "Startup", every
 *                 other value is labelled "Business Concept".
 * @param topN     Number of projects the model is asked to shortlist.
 * @param rubric   Optional custom rubric, inserted as its own section when non-empty.
 * @returns The prompt text, one instruction per line.
 */
function buildShortlistPrompt(category: string, topN: number, rubric?: string): string {
  const label = category === 'STARTUP' ? 'Startup' : 'Business Concept'
  // Without a rubric this slot stays an empty line between the two sections.
  const rubricSection = rubric ? `## Custom Evaluation Rubric\n${rubric}\n` : ''

  const promptLines = [
    'You are a senior jury advisor for the Monaco Ocean Protection Challenge.',
    '## Your Role',
    `Analyze aggregated evaluation data to produce a ranked shortlist of the top ${topN} ${label} projects.`,
    `You are evaluating ONLY ${label} projects in this batch rank them against each other within this category.`,
    '## Ranking Criteria (Weighted)',
    '- Evaluation Scores (40%): Average scores across all jury evaluations',
    '- Innovation & Impact (25%): Novelty of approach and potential environmental impact',
    '- Feasibility (20%): Likelihood of successful implementation',
    '- Alignment (15%): Fit with ocean protection mission and competition goals',
    '## Document Analysis',
    'If document content is provided (text_content field in files), use it for deeper qualitative analysis.',
    'Pay SPECIAL ATTENTION to files marked with is_current_round=true these are the most recent submissions.',
    'Older documents provide context, but recent ones should carry more weight in your assessment.',
    rubricSection,
    '## Output Format',
    'Return a JSON array:',
    '[',
    '{',
    '"anonymousId": "PROJECT_001",',
    '"rank": 1,',
    '"score": 0-100,',
    '"strengths": ["strength 1", "strength 2"],',
    '"concerns": ["concern 1"],',
    '"recommendation": "1-2 sentence recommendation"',
    '}',
    ']',
    '## Guidelines',
    `- Only include the top ${topN} projects in your ranking`,
    '- Score should reflect weighted combination of all criteria',
    '- Be specific in strengths and concerns avoid generic statements',
    '- Consider feedback themes and evaluator consensus',
    '- Higher evaluator consensus should boost confidence in ranking',
    '- Do not include any personal identifiers',
  ]

  return promptLines.join('\n')
}
// ─── Single Category Processing ─────────────────────────────────────────────
/** Extra AI calls allowed when a response is not valid JSON. */
const SHORTLIST_MAX_PARSE_RETRIES = 2
/** Number of juror feedback excerpts forwarded per project. */
const SHORTLIST_FEEDBACK_SAMPLE_LIMIT = 3
/** Maximum length, in characters, of each sanitized feedback excerpt. */
const SHORTLIST_FEEDBACK_MAX_CHARS = 1000

/** Per-project evaluation figures merged onto the anonymized project payload. */
type ShortlistAggregate = {
  avgScore: number
  evaluationCount: number
  feedbackSamples: string[]
}

/** Anonymous id used for the project at `index` in the prompt payload (PROJECT_001, PROJECT_002, ...). */
function formatShortlistAnonymousId(index: number): string {
  return `PROJECT_${String(index + 1).padStart(3, '0')}`
}

/**
 * Reduces a project's assignments to the figures shown to the AI, using only
 * evaluations whose status is SUBMITTED.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- rows come from an untyped Prisma include
function summarizeSubmittedEvaluations(assignments: any[]): ShortlistAggregate {
  const submitted = assignments
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    .map((assignment: any) => assignment.evaluation)
    .filter(Boolean)
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    .filter((evaluation: any) => evaluation.status === 'SUBMITTED')

  // NOTE(review): a submitted evaluation without a globalScore counts as 0 and
  // therefore lowers the average — confirm this is intended.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const scores: number[] = submitted.map((evaluation: any) => evaluation.globalScore ?? 0)
  const avgScore =
    scores.length > 0 ? scores.reduce((sum, score) => sum + score, 0) / scores.length : 0

  // Juror free text is sanitized before it can reach the prompt: sanitizeText
  // strips email/phone/url/ssn patterns, then the excerpt is length-capped.
  const feedbackSamples = submitted
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    .map((evaluation: any) => evaluation.feedbackGeneral || evaluation.feedbackText)
    .filter((text: unknown): text is string => typeof text === 'string' && text.length > 0)
    .slice(0, SHORTLIST_FEEDBACK_SAMPLE_LIMIT)
    .map((text: string) => sanitizeText(text).slice(0, SHORTLIST_FEEDBACK_MAX_CHARS))

  return { avgScore, evaluationCount: submitted.length, feedbackSamples }
}

/**
 * Parses the model output and locates the ranking array.
 *
 * Accepts a bare array, or an object that carries the array under `rankings`,
 * `projects`, `shortlist` or — failing those — any other array-valued key.
 *
 * @returns The array, or `null` when the JSON holds no array at all.
 * @throws SyntaxError when `content` is not valid JSON.
 */
function extractShortlistItems(content: string): unknown[] | null {
  const payload: unknown = JSON.parse(content)
  if (Array.isArray(payload)) return payload
  if (payload === null || typeof payload !== 'object') return null

  const record = payload as Record<string, unknown>
  for (const key of ['rankings', 'projects', 'shortlist']) {
    const candidate = record[key]
    if (Array.isArray(candidate)) return candidate
  }
  const fallback = Object.values(record).find((value): value is unknown[] => Array.isArray(value))
  return fallback ?? null
}

/** Coerces an AI-supplied value to a finite number, falling back to 0. */
function toShortlistNumber(value: unknown): number {
  const numeric = typeof value === 'string' ? Number(value) : value
  return typeof numeric === 'number' && Number.isFinite(numeric) ? numeric : 0
}

/** Coerces an AI-supplied value to a list of strings (a lone string becomes a one-item list). */
function toShortlistStringList(value: unknown): string[] {
  if (typeof value === 'string') return value.length > 0 ? [value] : []
  if (!Array.isArray(value)) return []
  return value.filter((entry): entry is string => typeof entry === 'string')
}

/**
 * Maps raw AI items back to real projects.
 *
 * Items that are not objects, carry an unknown anonymous id, or repeat a
 * project already seen are dropped. The result is sorted by rank and, when
 * `topN` is positive, cut to `topN` entries.
 */
function toShortlistRecommendations(
  items: unknown[],
  idMap: Map<string, string>,
  category: string,
  topN: number,
): ShortlistRecommendation[] {
  const seenProjectIds = new Set<string>()
  const recommendations: ShortlistRecommendation[] = []

  for (const raw of items) {
    if (raw === null || typeof raw !== 'object') continue
    const item = raw as Record<string, unknown>
    if (typeof item.anonymousId !== 'string') continue

    const projectId = idMap.get(item.anonymousId)
    if (projectId === undefined || seenProjectIds.has(projectId)) continue
    seenProjectIds.add(projectId)

    recommendations.push({
      projectId,
      rank: toShortlistNumber(item.rank),
      score: toShortlistNumber(item.score),
      category,
      strengths: toShortlistStringList(item.strengths),
      concerns: toShortlistStringList(item.concerns),
      recommendation: typeof item.recommendation === 'string' ? item.recommendation : '',
    })
  }

  recommendations.sort((a, b) => a.rank - b.rank)
  // The prompt asks for topN entries; enforce it in case the model returns more.
  return topN > 0 ? recommendations.slice(0, topN) : recommendations
}

/**
 * Produces the AI-ranked shortlist for one competition category of a round.
 *
 * Steps: load the category's projects assigned in the round, optionally
 * extract file text, aggregate submitted evaluations, run everything through
 * the anonymization pipeline, ask the model for a ranking (retrying when the
 * answer is not valid JSON) and map the answer back to real project ids.
 *
 * @param params.roundId      Round whose assignments select the projects.
 * @param params.category     Competition category to rank.
 * @param params.topN         Maximum number of recommendations to return.
 * @param params.rubric       Optional custom rubric added to the system prompt.
 * @param params.aiParseFiles When true, extracted file text is attached to the payload.
 * @param prisma              Prisma client used for all queries.
 * @returns Recommendations sorted by rank, the tokens reported over all AI
 *          calls made, and any non-fatal error messages. Errors thrown by the
 *          database, file extraction or the AI call propagate to the caller.
 */
async function generateCategoryShortlist(
  params: {
    roundId: string
    category: string
    topN: number
    rubric?: string
    aiParseFiles: boolean
  },
  prisma: PrismaClient,
): Promise<{ recommendations: ShortlistRecommendation[]; tokensUsed: number; errors: string[] }> {
  const { roundId, category, topN, rubric, aiParseFiles } = params

  // Load projects with evaluations for this category
  const projects = await prisma.project.findMany({
    where: {
      competitionCategory: category as CompetitionCategory,
      assignments: { some: { roundId } },
    },
    include: {
      assignments: {
        where: { roundId },
        include: { evaluation: true },
      },
      projectTags: { include: { tag: true } },
      files: {
        select: {
          id: true,
          fileName: true,
          fileType: true,
          mimeType: true,
          size: true,
          pageCount: true,
          objectKey: true,
          roundId: true,
          createdAt: true,
        },
        orderBy: { createdAt: 'desc' as const },
      },
      teamMembers: { select: { user: { select: { name: true } } } },
    },
  })

  if (projects.length === 0) {
    return {
      recommendations: [],
      tokensUsed: 0,
      errors: [`No ${category} projects found for this round`],
    }
  }

  // Single cast at the Prisma boundary; everything below reads `rows`.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const rows = projects as any[]

  // Resolve round names so each file can be tagged with the round it belongs to.
  const fileRoundIds = new Set<string>()
  for (const row of rows) {
    for (const file of row.files || []) {
      if (file.roundId) fileRoundIds.add(file.roundId)
    }
  }
  const roundNames = new Map<string, string>()
  if (fileRoundIds.size > 0) {
    const rounds = await prisma.round.findMany({
      where: { id: { in: [...fileRoundIds] } },
      select: { id: true, name: true },
    })
    for (const round of rounds) roundNames.set(round.id, round.name)
  }

  // Optionally extract file contents
  let fileContents: Map<string, string> | undefined
  if (aiParseFiles) {
    const allFiles = rows.flatMap(
      (row) =>
        (row.files || []) as Array<{ id: string; fileName: string; mimeType: string; objectKey: string }>,
    )
    const extractions = await extractMultipleFileContents(allFiles)
    fileContents = new Map()
    for (const extraction of extractions) {
      if (extraction.content) fileContents.set(extraction.fileId, extraction.content)
    }
  }

  // Aggregate per-project stats and sanitized feedback excerpts.
  const aggregatesByProjectId = new Map<string, ShortlistAggregate>()
  for (const row of rows) {
    aggregatesByProjectId.set(row.id, summarizeSubmittedEvaluations(row.assignments))
  }

  // Route every project through the canonical anonymization pipeline so
  // description/title/institution are PII-stripped, free-text is truncated,
  // and file text_content is sanitized (handled in anonymizeProjectForAI).
  const projectsWithRelations = rows.map((p) =>
    toProjectWithRelations({
      id: p.id,
      title: p.title,
      description: p.description,
      competitionCategory: p.competitionCategory,
      oceanIssue: p.oceanIssue ?? null,
      country: p.country ?? null,
      geographicZone: p.geographicZone ?? null,
      institution: p.institution ?? null,
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      tags: (p.projectTags ?? []).map((pt: any) => pt.tag.name),
      foundedAt: p.foundedAt ?? null,
      wantsMentorship: p.wantsMentorship ?? false,
      submissionSource: p.submissionSource ?? 'MANUAL',
      submittedAt: p.submittedAt ?? null,
      _count: { teamMembers: p.teamMembers?.length ?? 0, files: p.files?.length ?? 0 },
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      files: (p.files ?? []).map((f: any) => ({
        fileType: f.fileType ?? null,
        size: f.size,
        pageCount: f.pageCount,
        roundName: f.roundId ? roundNames.get(f.roundId) : undefined,
        isCurrentRound: f.roundId === roundId,
        textContent: fileContents?.get(f.id),
      })),
    }),
  )

  const { anonymized: anonymizedBase, mappings } = anonymizeProjectsForAI(
    projectsWithRelations,
    'FILTERING',
  )

  // Abort before any data leaves the service if the validation gate rejects it.
  if (!validateAnonymizedProjects(anonymizedBase)) {
    console.error('[AI Shortlist] Anonymization validation failed')
    return {
      recommendations: [],
      tokensUsed: 0,
      errors: ['GDPR compliance check failed: PII detected in anonymized data'],
    }
  }

  // Merge anonymized base with per-project aggregates, keyed by mapping order.
  // The pipeline's own project_id is replaced by the anonymousId scheme the
  // prompt documents (JSON.stringify drops the undefined project_id).
  const anonymized = anonymizedBase.map((p, index) => {
    const realId = mappings[index].realId
    const agg = aggregatesByProjectId.get(realId) ?? {
      avgScore: 0,
      evaluationCount: 0,
      feedbackSamples: [],
    }
    return {
      anonymousId: formatShortlistAnonymousId(index),
      ...p,
      project_id: undefined,
      avgScore: agg.avgScore,
      evaluationCount: agg.evaluationCount,
      feedbackSamples: agg.feedbackSamples,
    }
  })

  // Build idMap for de-anonymization
  const idMap = new Map<string, string>()
  mappings.forEach((m, index) => {
    idMap.set(formatShortlistAnonymousId(index), m.realId)
  })

  // Call AI
  const openai = await getOpenAI()
  if (!openai) {
    return { recommendations: [], tokensUsed: 0, errors: ['OpenAI client not configured'] }
  }
  const model = await getConfiguredModel()

  const systemPrompt = buildShortlistPrompt(category, topN, rubric)
  const userPrompt = [
    `Analyze these anonymized ${category} project evaluations and produce a ranked shortlist of the top ${topN}.`,
    `Projects (${anonymized.length} total):`,
    JSON.stringify(anonymized, null, 2),
    `Return a JSON array following the format specified. Only include the top ${topN} projects. Rank by overall quality within this category.`,
  ].join('\n')

  let tokensUsed = 0

  // One AI round-trip. Every attempt — including retries — is added to the
  // token total and written to the usage log, so retry cost is not lost.
  const requestRanking = async (userContent: string) => {
    const response = await openai.chat.completions.create(
      buildCompletionParams(model, {
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: userContent },
        ],
        temperature: 0.1,
        jsonMode: true,
      }),
    )
    const usage = extractTokenUsage(response)
    tokensUsed += usage.totalTokens
    await logAIUsage({
      action: 'SHORTLIST',
      model,
      promptTokens: usage.promptTokens,
      completionTokens: usage.completionTokens,
      totalTokens: usage.totalTokens,
      status: 'SUCCESS',
    })
    return response.choices[0]?.message?.content
  }

  // Parse response, re-asking when the model returned malformed JSON.
  let rankedItems: unknown[] | null = null
  for (let attempt = 0; ; attempt++) {
    const content = await requestRanking(
      attempt === 0 ? userPrompt : userPrompt + '\n\nIMPORTANT: Please ensure valid JSON output.',
    )
    if (!content) {
      return { recommendations: [], tokensUsed, errors: ['Empty AI response'] }
    }
    try {
      rankedItems = extractShortlistItems(content)
      break
    } catch (parseError) {
      const retryable = parseError instanceof SyntaxError && attempt < SHORTLIST_MAX_PARSE_RETRIES
      if (!retryable) {
        return { recommendations: [], tokensUsed, errors: ['Failed to parse AI response'] }
      }
    }
  }

  if (rankedItems === null) {
    // Valid JSON without any array: report it rather than returning an empty
    // shortlist with no explanation.
    return {
      recommendations: [],
      tokensUsed,
      errors: ['AI response did not contain a ranking array'],
    }
  }

  // De-anonymize
  const recommendations = toShortlistRecommendations(rankedItems, idMap, category, topN)
  return { recommendations, tokensUsed, errors: [] }
}
// ─── Main Function ──────────────────────────────────────────────────────────
/**
 * Generate an AI shortlist for the projects of a round, split by category.
 *
 * STARTUP and BUSINESS_CONCEPT are ranked by separate, concurrent runs. Each
 * run is settled on its own, so an error thrown while ranking one category no
 * longer discards the ranking (and token count) already obtained for the other.
 *
 * @param params.roundId       Round whose assigned projects are ranked.
 * @param params.competitionId Part of the accepted parameters; not read by this function.
 * @param params.category      When provided, only this category is processed.
 * @param params.topN          Shortlist size used when no per-category size is given (default 10).
 * @param params.startupTopN   Shortlist size for STARTUP.
 * @param params.conceptTopN   Shortlist size for any category other than STARTUP.
 * @param params.rubric        Optional custom rubric forwarded to the prompt.
 * @param params.aiParseFiles  Whether extracted file text is sent along (default false).
 * @param prisma               Prisma client handed to the per-category runs.
 * @returns `success` is `false` when any category threw; `recommendations`
 *          still holds whatever the other category produced. `errors` collects
 *          thrown messages and the non-fatal messages of each run, and is
 *          omitted when empty. Never rejects.
 */
export async function generateShortlist(
  params: {
    roundId: string
    competitionId: string
    category?: string // If provided, only run for this category
    topN?: number // Global fallback
    startupTopN?: number // Per-category override
    conceptTopN?: number // Per-category override
    rubric?: string
    aiParseFiles?: boolean
  },
  prisma: PrismaClient,
): Promise<ShortlistResult> {
  const {
    roundId,
    category,
    topN = 10,
    startupTopN,
    conceptTopN,
    rubric,
    aiParseFiles = false,
  } = params

  try {
    const categories = category ? [category] : ['STARTUP', 'BUSINESS_CONCEPT']

    // Run categories in parallel; capture a throw per category instead of
    // letting it reject the whole batch.
    const outcomes = await Promise.all(
      categories.map(async (cat) => {
        const catTopN = cat === 'STARTUP' ? (startupTopN ?? topN) : (conceptTopN ?? topN)
        console.log(`[AI Shortlist] Generating top-${catTopN} for ${cat}`)
        try {
          const result = await generateCategoryShortlist(
            { roundId, category: cat, topN: catTopN, rubric, aiParseFiles },
            prisma,
          )
          return { cat, ok: true as const, result }
        } catch (error) {
          return { cat, ok: false as const, error }
        }
      }),
    )

    const recommendations: CategoryRecommendations = { STARTUP: [], BUSINESS_CONCEPT: [] }
    const errors: string[] = []
    let tokensUsed = 0
    let anyCategoryThrew = false

    for (const outcome of outcomes) {
      if (!outcome.ok) {
        anyCategoryThrew = true
        const classification = classifyAIError(outcome.error)
        logAIError('ai-shortlist', 'generateShortlist', classification)
        console.error('[AIShortlist] generateShortlist failed:', outcome.error, { category: outcome.cat })
        errors.push(
          outcome.error instanceof Error ? outcome.error.message : 'AI shortlist generation failed',
        )
        continue
      }

      // Anything that is not STARTUP is filed under BUSINESS_CONCEPT.
      if (outcome.cat === 'STARTUP') {
        recommendations.STARTUP = outcome.result.recommendations
      } else {
        recommendations.BUSINESS_CONCEPT = outcome.result.recommendations
      }
      tokensUsed += outcome.result.tokensUsed
      errors.push(...outcome.result.errors)
    }

    return {
      // Stays false whenever something threw, so callers that gate on
      // `success` keep treating a thrown error as a failed run.
      success: !anyCategoryThrew,
      recommendations,
      tokensUsed,
      errors: errors.length > 0 ? errors : undefined,
    }
  } catch (error) {
    // Safety net for failures outside the per-category runs (e.g. while logging).
    const classification = classifyAIError(error)
    logAIError('ai-shortlist', 'generateShortlist', classification)
    console.error('[AIShortlist] generateShortlist failed:', error)
    return {
      success: false,
      recommendations: { STARTUP: [], BUSINESS_CONCEPT: [] },
      errors: [error instanceof Error ? error.message : 'AI shortlist generation failed'],
    }
  }
}