ai-shortlist was sending raw project.description, raw juror feedback
text (feedbackGeneral / feedbackText), and full extracted file text
content directly to OpenAI as part of the user prompt. Its only
"anonymization" was renaming `id` to `anonymousId`. This bypassed the
GDPR contract documented in the file's own header comment ("All project
data is anonymized before AI processing — No personal identifiers in
prompts") and in CLAUDE.md ("All AI calls anonymize data before sending
to OpenAI").
A juror writing "Contact applicant Jane Doe at jane@example.com" in
feedback would ship that PII to OpenAI verbatim every time an admin
generated a shortlist. Same for any names / emails / phone numbers
embedded in extracted PDF text.
generateCategoryShortlist now mirrors the pattern used by ai-filtering /
ai-tagging / ai-award-eligibility:
- toProjectWithRelations + anonymizeProjectsForAI(_, 'FILTERING')
- validateAnonymizedProjects gate that aborts on detected PII
- Aggregates (avgScore, evaluationCount, feedbackSamples) computed
separately and merged onto the anonymized projects; each feedback
sample passes through sanitizeText (strips email/phone/url/ssn) and
is truncated to 1000 chars.
Defense-in-depth fix in the shared helper: anonymizeProjectForAI now
also runs sanitizeText over each file's text_content before emitting it
to AI services. Previously the helper passed extracted file text
through unchanged, which would have leaked PII from PDF body text via
ai-filtering / ai-tagging / ai-award-eligibility too if those services
turn on aiParseFiles.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
452 lines
15 KiB
TypeScript
452 lines
15 KiB
TypeScript
/**
|
|
* AI Shortlist Service
|
|
*
|
|
* Generates ranked recommendations at end of evaluation rounds.
|
|
* Runs SEPARATELY for each category (STARTUP / BUSINESS_CONCEPT)
|
|
* to produce independent rankings per the competition's advancement rules.
|
|
*
|
|
* GDPR Compliance:
|
|
* - All project data is anonymized before AI processing
|
|
* - No personal identifiers in prompts or responses
|
|
*/
|
|
|
|
import { getOpenAI, getConfiguredModel, buildCompletionParams } from '@/lib/openai'
|
|
import { logAIUsage, extractTokenUsage } from '@/server/utils/ai-usage'
|
|
import { classifyAIError, logAIError } from './ai-errors'
|
|
import { extractMultipleFileContents } from './file-content-extractor'
|
|
import {
|
|
toProjectWithRelations,
|
|
anonymizeProjectsForAI,
|
|
validateAnonymizedProjects,
|
|
sanitizeText,
|
|
} from './anonymization'
|
|
import type { PrismaClient, CompetitionCategory } from '@prisma/client'
|
|
|
|
// ─── Types ──────────────────────────────────────────────────────────────────
|
|
|
|
/** One ranked project in a category shortlist, already mapped back to its real project id. */
export type ShortlistRecommendation = {
  /** Real (de-anonymized) project id. */
  projectId: string
  /** Position assigned by the AI; 0 when the AI omitted it. */
  rank: number
  /** Score assigned by the AI (the prompt asks for 0-100); 0 when omitted. */
  score: number
  /** Category the shortlist was generated for. */
  category: string
  /** Strengths listed by the AI. */
  strengths: string[]
  /** Concerns listed by the AI. */
  concerns: string[]
  /** Short free-text recommendation from the AI. */
  recommendation: string
}

/** Shortlists keyed by competition category; each category is ranked independently. */
export type CategoryRecommendations = Record<
  'STARTUP' | 'BUSINESS_CONCEPT',
  ShortlistRecommendation[]
>

/** Overall outcome of a shortlist run across one or both categories. */
export type ShortlistResult = {
  /** False when the run failed as a whole. */
  success: boolean
  /** Per-category recommendations (empty arrays when nothing was produced). */
  recommendations: CategoryRecommendations
  /** Error messages collected from the categories, if any. */
  errors?: string[]
  /** Total tokens consumed across all categories. */
  tokensUsed?: number
}
|
|
|
|
// ─── Prompt Building ────────────────────────────────────────────────────────
|
|
|
|
/**
 * Builds the system prompt for ranking one category's projects.
 *
 * The output format asks for a JSON *object* with a `rankings` array rather
 * than a bare array: the completion is requested in JSON mode, which yields a
 * top-level object, so naming the wrapper key keeps the model from inventing
 * one the response parser does not recognise.
 *
 * @param category - Competition category; `'STARTUP'` is labelled "Startup",
 *   anything else "Business Concept".
 * @param topN - Number of projects the model should include in its ranking.
 * @param rubric - Optional custom rubric appended as its own section; blank or
 *   whitespace-only rubrics are ignored.
 * @returns The complete system prompt.
 */
function buildShortlistPrompt(category: string, topN: number, rubric?: string): string {
  const categoryLabel = category === 'STARTUP' ? 'Startup' : 'Business Concept'

  // A whitespace-only rubric would otherwise emit an empty rubric section.
  const trimmedRubric = rubric?.trim() ?? ''
  const rubricSection = trimmedRubric ? `## Custom Evaluation Rubric\n${trimmedRubric}\n` : ''

  return `You are a senior jury advisor for the Monaco Ocean Protection Challenge.

## Your Role
Analyze aggregated evaluation data to produce a ranked shortlist of the top ${topN} ${categoryLabel} projects.
You are evaluating ONLY ${categoryLabel} projects in this batch — rank them against each other within this category.

## Ranking Criteria (Weighted)
- Evaluation Scores (40%): Average scores across all jury evaluations
- Innovation & Impact (25%): Novelty of approach and potential environmental impact
- Feasibility (20%): Likelihood of successful implementation
- Alignment (15%): Fit with ocean protection mission and competition goals

## Document Analysis
If document content is provided (text_content field in files), use it for deeper qualitative analysis.
Pay SPECIAL ATTENTION to files marked with is_current_round=true — these are the most recent submissions.
Older documents provide context, but recent ones should carry more weight in your assessment.

${rubricSection}
## Output Format
Return a JSON object with a single "rankings" key containing an array:
{
  "rankings": [
    {
      "anonymousId": "PROJECT_001",
      "rank": 1,
      "score": 0-100,
      "strengths": ["strength 1", "strength 2"],
      "concerns": ["concern 1"],
      "recommendation": "1-2 sentence recommendation"
    }
  ]
}

## Guidelines
- Only include the top ${topN} projects in your ranking
- Score should reflect weighted combination of all criteria
- Be specific in strengths and concerns — avoid generic statements
- Consider feedback themes and evaluator consensus
- Higher evaluator consensus should boost confidence in ranking
- Do not include any personal identifiers`
}
|
|
|
|
// ─── Single Category Processing ─────────────────────────────────────────────
|
|
|
|
/** Returns the ranking array from a parsed AI response, or null when it contains no array. */
function extractRankingItems(json: unknown): unknown[] | null {
  if (Array.isArray(json)) return json
  if (json === null || typeof json !== 'object') return null

  const record = json as Record<string, unknown>
  // Preferred wrapper keys first, in the order the parser has always honoured them.
  for (const key of ['rankings', 'projects', 'shortlist']) {
    const candidate = record[key]
    if (Array.isArray(candidate)) return candidate
  }
  // Otherwise accept whichever array the model wrapped its answer in.
  for (const value of Object.values(record)) {
    if (Array.isArray(value)) return value
  }
  return null
}

/** Coerces an AI-supplied number (or numeric string) to a finite number, defaulting to 0. */
function toFiniteNumber(value: unknown): number {
  const candidate = typeof value === 'string' ? Number(value) : value
  return typeof candidate === 'number' && Number.isFinite(candidate) ? candidate : 0
}

/** Keeps only the string entries of an AI-supplied list; anything else becomes an empty list. */
function toStringList(value: unknown): string[] {
  return Array.isArray(value)
    ? value.filter((entry): entry is string => typeof entry === 'string')
    : []
}

/**
 * De-anonymizes raw AI ranking items into recommendations: drops unknown or
 * duplicate ids, normalizes field types, sorts by rank and caps at `topN`.
 */
function normalizeRecommendations(
  items: unknown[],
  idMap: Map<string, string>,
  category: string,
  topN: number,
): ShortlistRecommendation[] {
  const seenProjectIds = new Set<string>()
  const recommendations: ShortlistRecommendation[] = []

  for (const item of items) {
    if (item === null || typeof item !== 'object') continue
    const record = item as Record<string, unknown>
    if (typeof record.anonymousId !== 'string') continue

    const projectId = idMap.get(record.anonymousId)
    // Ignore ids the model invented and repeats of a project it already ranked.
    if (projectId === undefined || seenProjectIds.has(projectId)) continue
    seenProjectIds.add(projectId)

    recommendations.push({
      projectId,
      rank: toFiniteNumber(record.rank),
      score: toFiniteNumber(record.score),
      category,
      strengths: toStringList(record.strengths),
      concerns: toStringList(record.concerns),
      recommendation: typeof record.recommendation === 'string' ? record.recommendation : '',
    })
  }

  recommendations.sort((a, b) => a.rank - b.rank)
  // The model is told to return only the top N; enforce it in case it returns more.
  return topN > 0 ? recommendations.slice(0, topN) : recommendations
}

/**
 * Generates the ranked shortlist for a single competition category.
 *
 * Loads the category's projects assigned in the round, aggregates submitted
 * evaluations, anonymizes everything, asks the model for a ranking and maps
 * the answer back to real project ids.
 *
 * @param params.roundId - Round whose assignments/evaluations are considered.
 * @param params.category - Competition category to rank.
 * @param params.topN - Maximum number of recommendations to return.
 * @param params.rubric - Optional custom rubric forwarded to the system prompt.
 * @param params.aiParseFiles - When true, extracted file text is included in the prompt.
 * @param prisma - Prisma client used for all queries.
 * @returns Recommendations sorted by rank, total tokens used across all model
 *   calls (including retries), and any non-fatal error messages.
 */
async function generateCategoryShortlist(
  params: {
    roundId: string
    category: string
    topN: number
    rubric?: string
    aiParseFiles: boolean
  },
  prisma: PrismaClient,
): Promise<{ recommendations: ShortlistRecommendation[]; tokensUsed: number; errors: string[] }> {
  const { roundId, category, topN, rubric, aiParseFiles } = params

  // Load projects with evaluations for this category
  const projects = await prisma.project.findMany({
    where: {
      competitionCategory: category as CompetitionCategory,
      assignments: { some: { roundId } },
    },
    include: {
      assignments: {
        where: { roundId },
        include: { evaluation: true },
      },
      projectTags: { include: { tag: true } },
      files: {
        select: {
          id: true,
          fileName: true,
          fileType: true,
          mimeType: true,
          size: true,
          pageCount: true,
          objectKey: true,
          roundId: true,
          createdAt: true,
        },
        orderBy: { createdAt: 'desc' as const },
      },
      teamMembers: { select: { user: { select: { name: true } } } },
    },
  })

  if (projects.length === 0) {
    return {
      recommendations: [],
      tokensUsed: 0,
      errors: [`No ${category} projects found for this round`],
    }
  }

  // Resolve the client before the expensive steps (round lookup, file
  // extraction, anonymization) so a missing configuration fails fast.
  const openai = await getOpenAI()
  if (!openai) {
    return { recommendations: [], tokensUsed: 0, errors: ['OpenAI client not configured'] }
  }
  const model = await getConfiguredModel()

  // Get round names for file tagging
  const roundIds = new Set<string>()
  for (const p of projects) {
    for (const f of (p as any).files || []) {
      if (f.roundId) roundIds.add(f.roundId)
    }
  }
  const roundNames = new Map<string, string>()
  if (roundIds.size > 0) {
    const rounds = await prisma.round.findMany({
      where: { id: { in: [...roundIds] } },
      select: { id: true, name: true },
    })
    for (const r of rounds) roundNames.set(r.id, r.name)
  }

  // Optionally extract file contents
  let fileContents: Map<string, string> | undefined
  if (aiParseFiles) {
    const allFiles = projects.flatMap((p: any) =>
      ((p.files || []) as Array<{ id: string; fileName: string; mimeType: string; objectKey: string }>)
    )
    const extractions = await extractMultipleFileContents(allFiles)
    fileContents = new Map()
    for (const e of extractions) {
      if (e.content) fileContents.set(e.fileId, e.content)
    }
  }

  // Aggregate per-project stats and free-text feedback. Sanitize feedback
  // before it enters the prompt — sanitizeText strips email/phone/url/ssn
  // patterns embedded in juror free-text. Without this, juror feedback like
  // "Contact applicant Jane at jane@example.com" leaks PII to OpenAI.
  const aggregatesByProjectId = new Map<
    string,
    { avgScore: number; evaluationCount: number; feedbackSamples: string[] }
  >()
  for (const project of projects as any[]) {
    const evaluations = project.assignments
      .map((a: any) => a.evaluation)
      .filter(Boolean)
      .filter((e: any) => e.status === 'SUBMITTED')

    // NOTE(review): a submitted evaluation without a globalScore counts as 0
    // in the average — confirm that is intended rather than excluding it.
    const scores = evaluations.map((e: any) => e.globalScore ?? 0)
    const avgScore = scores.length > 0
      ? scores.reduce((sum: number, s: number) => sum + s, 0) / scores.length
      : 0

    // At most three feedback samples per project, each sanitized and capped at 1000 chars.
    const feedbackSamples = evaluations
      .map((e: any) => e.feedbackGeneral || e.feedbackText)
      .filter((t: unknown): t is string => typeof t === 'string' && t.length > 0)
      .slice(0, 3)
      .map((t: string) => sanitizeText(t).slice(0, 1000))

    aggregatesByProjectId.set(project.id, {
      avgScore,
      evaluationCount: evaluations.length,
      feedbackSamples,
    })
  }

  // Route every project through the canonical anonymization pipeline so
  // description/title/institution are PII-stripped, free-text is truncated,
  // and file text_content is sanitized (handled in anonymizeProjectForAI).
  const projectsWithRelations = (projects as any[]).map((p) =>
    toProjectWithRelations({
      id: p.id,
      title: p.title,
      description: p.description,
      competitionCategory: p.competitionCategory,
      oceanIssue: p.oceanIssue ?? null,
      country: p.country ?? null,
      geographicZone: p.geographicZone ?? null,
      institution: p.institution ?? null,
      tags: (p.projectTags ?? []).map((pt: any) => pt.tag.name),
      foundedAt: p.foundedAt ?? null,
      wantsMentorship: p.wantsMentorship ?? false,
      submissionSource: p.submissionSource ?? 'MANUAL',
      submittedAt: p.submittedAt ?? null,
      _count: { teamMembers: p.teamMembers?.length ?? 0, files: p.files?.length ?? 0 },
      files: (p.files ?? []).map((f: any) => ({
        fileType: f.fileType ?? null,
        size: f.size,
        pageCount: f.pageCount,
        roundName: f.roundId ? roundNames.get(f.roundId) : undefined,
        isCurrentRound: f.roundId === roundId,
        textContent: fileContents?.get(f.id),
      })),
    }),
  )

  const { anonymized: anonymizedBase, mappings } = anonymizeProjectsForAI(
    projectsWithRelations,
    'FILTERING',
  )

  if (!validateAnonymizedProjects(anonymizedBase)) {
    console.error('[AI Shortlist] Anonymization validation failed')
    return {
      recommendations: [],
      tokensUsed: 0,
      errors: ['GDPR compliance check failed: PII detected in anonymized data'],
    }
  }

  // Merge anonymized base with per-project aggregates, keyed by mapping order,
  // and record the anonymousId -> real id mapping used for de-anonymization.
  const idMap = new Map<string, string>()
  const anonymized = anonymizedBase.map((p, index) => {
    const anonymousId = `PROJECT_${String(index + 1).padStart(3, '0')}`
    const realId = mappings[index].realId
    idMap.set(anonymousId, realId)

    const agg = aggregatesByProjectId.get(realId) ?? {
      avgScore: 0,
      evaluationCount: 0,
      feedbackSamples: [],
    }
    return {
      anonymousId,
      ...p,
      // Dropped from the JSON payload so only anonymousId identifies the project.
      project_id: undefined,
      avgScore: agg.avgScore,
      evaluationCount: agg.evaluationCount,
      feedbackSamples: agg.feedbackSamples,
    }
  })

  const systemPrompt = buildShortlistPrompt(category, topN, rubric)
  const userPrompt = `Analyze these anonymized ${category} project evaluations and produce a ranked shortlist of the top ${topN}.

Projects (${anonymized.length} total):
${JSON.stringify(anonymized, null, 2)}

Return JSON following the format specified. Only include the top ${topN} projects. Rank by overall quality within this category.`

  const MAX_PARSE_RETRIES = 2
  let totalTokens = 0

  // Issues one completion, then accounts for and logs its token usage so that
  // retries are tracked the same way as the first call.
  const requestCompletion = async (userContent: string) => {
    const response = await openai.chat.completions.create(
      buildCompletionParams(model, {
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: userContent },
        ],
        temperature: 0.1,
        jsonMode: true,
      }),
    )

    const usage = extractTokenUsage(response)
    totalTokens += usage.totalTokens
    await logAIUsage({
      action: 'SHORTLIST',
      model,
      promptTokens: usage.promptTokens,
      completionTokens: usage.completionTokens,
      totalTokens: usage.totalTokens,
      status: 'SUCCESS',
    })
    return response
  }

  // Call the model, retrying with a nudge when the reply is not valid JSON.
  let items: unknown[] | null = null
  for (let attempt = 0; ; attempt++) {
    const response = await requestCompletion(
      attempt === 0 ? userPrompt : userPrompt + '\n\nIMPORTANT: Please ensure valid JSON output.',
    )

    const content = response.choices[0]?.message?.content
    if (!content) {
      return { recommendations: [], tokensUsed: totalTokens, errors: ['Empty AI response'] }
    }

    try {
      items = extractRankingItems(JSON.parse(content))
      break
    } catch {
      // JSON.parse failed; give up once the retry budget is spent.
      if (attempt >= MAX_PARSE_RETRIES) {
        return { recommendations: [], tokensUsed: totalTokens, errors: ['Failed to parse AI response'] }
      }
    }
  }

  if (items === null) {
    // Valid JSON without any array: report it instead of returning an empty
    // shortlist that looks like a successful run.
    return {
      recommendations: [],
      tokensUsed: totalTokens,
      errors: ['AI response did not contain a ranking array'],
    }
  }

  // De-anonymize
  const recommendations = normalizeRecommendations(items, idMap, category, topN)

  return { recommendations, tokensUsed: totalTokens, errors: [] }
}
|
|
|
|
// ─── Main Function ──────────────────────────────────────────────────────────
|
|
|
|
/**
 * Generate an AI shortlist for projects in a round, split by category.
 * Runs independently for STARTUP and BUSINESS_CONCEPT.
 *
 * Categories run in parallel and are isolated from each other: when one
 * category throws (e.g. an API failure), its error is recorded and the other
 * category's recommendations and token usage are still returned.
 *
 * @param params.roundId - Round to generate the shortlist for.
 * @param params.competitionId - Accepted for the caller's interface; not read here.
 * @param params.category - If provided, only this category is processed.
 * @param params.topN - Global shortlist size (default 10).
 * @param params.startupTopN - Overrides `topN` for STARTUP.
 * @param params.conceptTopN - Overrides `topN` for the other category.
 * @param params.rubric - Optional custom evaluation rubric.
 * @param params.aiParseFiles - Include extracted file text in the prompt (default false).
 * @param prisma - Prisma client passed through to the per-category run.
 * @returns Per-category recommendations, total tokens used and collected
 *   errors. `success` is false only when every requested category failed.
 */
export async function generateShortlist(
  params: {
    roundId: string
    competitionId: string
    category?: string // If provided, only run for this category
    topN?: number // Global fallback
    startupTopN?: number // Per-category override
    conceptTopN?: number // Per-category override
    rubric?: string
    aiParseFiles?: boolean
  },
  prisma: PrismaClient,
): Promise<ShortlistResult> {
  const {
    roundId,
    category,
    topN = 10,
    startupTopN,
    conceptTopN,
    rubric,
    aiParseFiles = false,
  } = params

  try {
    const categories = category ? [category] : ['STARTUP', 'BUSINESS_CONCEPT']

    // Run categories in parallel for efficiency. Each one catches its own
    // failure so a throw in one cannot discard the other's completed result.
    const outcomes = await Promise.all(
      categories.map(async (cat) => {
        const catTopN = cat === 'STARTUP' ? (startupTopN ?? topN) : (conceptTopN ?? topN)

        console.log(`[AI Shortlist] Generating top-${catTopN} for ${cat}`)

        try {
          const result = await generateCategoryShortlist(
            { roundId, category: cat, topN: catTopN, rubric, aiParseFiles },
            prisma,
          )
          return { cat, failed: false, result }
        } catch (error) {
          const classification = classifyAIError(error)
          logAIError('ai-shortlist', 'generateShortlist', classification)
          console.error('[AIShortlist] generateShortlist failed:', error)

          const recommendations: ShortlistRecommendation[] = []
          return {
            cat,
            failed: true,
            result: {
              recommendations,
              tokensUsed: 0,
              errors: [error instanceof Error ? error.message : 'AI shortlist generation failed'],
            },
          }
        }
      }),
    )

    const allRecommendations: CategoryRecommendations = {
      STARTUP: [],
      BUSINESS_CONCEPT: [],
    }
    let totalTokens = 0
    const allErrors: string[] = []

    for (const { cat, result } of outcomes) {
      if (cat === 'STARTUP') {
        allRecommendations.STARTUP = result.recommendations
      } else {
        allRecommendations.BUSINESS_CONCEPT = result.recommendations
      }
      totalTokens += result.tokensUsed
      allErrors.push(...result.errors)
    }

    return {
      // Only a run in which every category threw counts as a failure.
      success: outcomes.some((outcome) => !outcome.failed),
      recommendations: allRecommendations,
      tokensUsed: totalTokens,
      errors: allErrors.length > 0 ? allErrors : undefined,
    }
  } catch (error) {
    // Safety net for anything unexpected outside the per-category runs.
    const classification = classifyAIError(error)
    logAIError('ai-shortlist', 'generateShortlist', classification)
    console.error('[AIShortlist] generateShortlist failed:', error)

    return {
      success: false,
      recommendations: { STARTUP: [], BUSINESS_CONCEPT: [] },
      errors: [error instanceof Error ? error.message : 'AI shortlist generation failed'],
    }
  }
}
|