fix(security): route ai-shortlist through canonical anonymization pipeline
ai-shortlist was sending raw project.description, raw juror feedback
text (feedbackGeneral / feedbackText), and full extracted file text
content directly to OpenAI as part of the user prompt. Its only
"anonymization" was renaming `id` to `anonymousId`. This bypassed the
GDPR contract documented in the file's own header comment ("All project
data is anonymized before AI processing — No personal identifiers in
prompts") and in CLAUDE.md ("All AI calls anonymize data before sending
to OpenAI").
If a juror wrote "Contact applicant Jane Doe at jane@example.com" in
feedback, that PII was shipped to OpenAI verbatim every time an admin
generated a shortlist. The same applied to any names / emails / phone
numbers embedded in extracted PDF text.
generateCategoryShortlist now mirrors the pattern used by ai-filtering /
ai-tagging / ai-award-eligibility:
- toProjectWithRelations + anonymizeProjectsForAI(_, 'FILTERING')
- validateAnonymizedProjects gate that aborts on detected PII
- Aggregates (avgScore, evaluationCount, feedbackSamples) are computed
  separately and merged onto the anonymized projects; each feedback
  sample passes through sanitizeText (strips email/phone/url/ssn) and
  is truncated to 1000 chars.
Defense-in-depth fix in the shared helper: anonymizeProjectForAI now
also runs sanitizeText over each file's text_content before emitting it
to AI services. Previously the helper passed extracted file text
through unchanged, which would also have leaked PII from PDF body text
via ai-filtering / ai-tagging / ai-award-eligibility whenever those
services turned on aiParseFiles.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -14,6 +14,12 @@ import { getOpenAI, getConfiguredModel, buildCompletionParams } from '@/lib/open
|
|||||||
import { logAIUsage, extractTokenUsage } from '@/server/utils/ai-usage'
|
import { logAIUsage, extractTokenUsage } from '@/server/utils/ai-usage'
|
||||||
import { classifyAIError, logAIError } from './ai-errors'
|
import { classifyAIError, logAIError } from './ai-errors'
|
||||||
import { extractMultipleFileContents } from './file-content-extractor'
|
import { extractMultipleFileContents } from './file-content-extractor'
|
||||||
|
import {
|
||||||
|
toProjectWithRelations,
|
||||||
|
anonymizeProjectsForAI,
|
||||||
|
validateAnonymizedProjects,
|
||||||
|
sanitizeText,
|
||||||
|
} from './anonymization'
|
||||||
import type { PrismaClient, CompetitionCategory } from '@prisma/client'
|
import type { PrismaClient, CompetitionCategory } from '@prisma/client'
|
||||||
|
|
||||||
// ─── Types ──────────────────────────────────────────────────────────────────
|
// ─── Types ──────────────────────────────────────────────────────────────────
|
||||||
@@ -166,8 +172,15 @@ async function generateCategoryShortlist(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Aggregate scores per project
|
// Aggregate per-project stats and free-text feedback. Sanitize feedback
|
||||||
const projectSummaries = projects.map((project: any) => {
|
// before it enters the prompt — sanitizeText strips email/phone/url/ssn
|
||||||
|
// patterns embedded in juror free-text. Without this, juror feedback like
|
||||||
|
// "Contact applicant Jane at jane@example.com" leaks PII to OpenAI.
|
||||||
|
const aggregatesByProjectId = new Map<
|
||||||
|
string,
|
||||||
|
{ avgScore: number; evaluationCount: number; feedbackSamples: string[] }
|
||||||
|
>()
|
||||||
|
for (const project of projects as any[]) {
|
||||||
const evaluations = project.assignments
|
const evaluations = project.assignments
|
||||||
.map((a: any) => a.evaluation)
|
.map((a: any) => a.evaluation)
|
||||||
.filter(Boolean)
|
.filter(Boolean)
|
||||||
@@ -178,40 +191,86 @@ async function generateCategoryShortlist(
|
|||||||
? scores.reduce((sum: number, s: number) => sum + s, 0) / scores.length
|
? scores.reduce((sum: number, s: number) => sum + s, 0) / scores.length
|
||||||
: 0
|
: 0
|
||||||
|
|
||||||
const feedbacks = evaluations
|
const feedbackSamples = evaluations
|
||||||
.map((e: any) => e.feedbackGeneral || e.feedbackText)
|
.map((e: any) => e.feedbackGeneral || e.feedbackText)
|
||||||
.filter(Boolean)
|
.filter((t: unknown): t is string => typeof t === 'string' && t.length > 0)
|
||||||
|
.slice(0, 3)
|
||||||
|
.map((t: string) => sanitizeText(t).slice(0, 1000))
|
||||||
|
|
||||||
return {
|
aggregatesByProjectId.set(project.id, {
|
||||||
id: project.id,
|
|
||||||
description: project.description,
|
|
||||||
category: project.competitionCategory,
|
|
||||||
tags: project.projectTags.map((pt: any) => pt.tag.name),
|
|
||||||
avgScore,
|
avgScore,
|
||||||
evaluationCount: evaluations.length,
|
evaluationCount: evaluations.length,
|
||||||
feedbackSamples: feedbacks.slice(0, 3),
|
feedbackSamples,
|
||||||
files: (project.files || []).map((f: any) => ({
|
})
|
||||||
file_type: f.fileType ?? 'OTHER',
|
}
|
||||||
page_count: f.pageCount ?? null,
|
|
||||||
size_kb: Math.round((f.size ?? 0) / 1024),
|
// Route every project through the canonical anonymization pipeline so
|
||||||
round_name: f.roundId ? (roundNames.get(f.roundId) || null) : null,
|
// description/title/institution are PII-stripped, free-text is truncated,
|
||||||
is_current_round: f.roundId === roundId,
|
// and file text_content is sanitized (handled in anonymizeProjectForAI).
|
||||||
...(fileContents?.get(f.id) ? { text_content: fileContents.get(f.id) } : {}),
|
const projectsWithRelations = (projects as any[]).map((p) =>
|
||||||
|
toProjectWithRelations({
|
||||||
|
id: p.id,
|
||||||
|
title: p.title,
|
||||||
|
description: p.description,
|
||||||
|
competitionCategory: p.competitionCategory,
|
||||||
|
oceanIssue: p.oceanIssue ?? null,
|
||||||
|
country: p.country ?? null,
|
||||||
|
geographicZone: p.geographicZone ?? null,
|
||||||
|
institution: p.institution ?? null,
|
||||||
|
tags: (p.projectTags ?? []).map((pt: any) => pt.tag.name),
|
||||||
|
foundedAt: p.foundedAt ?? null,
|
||||||
|
wantsMentorship: p.wantsMentorship ?? false,
|
||||||
|
submissionSource: p.submissionSource ?? 'MANUAL',
|
||||||
|
submittedAt: p.submittedAt ?? null,
|
||||||
|
_count: { teamMembers: p.teamMembers?.length ?? 0, files: p.files?.length ?? 0 },
|
||||||
|
files: (p.files ?? []).map((f: any) => ({
|
||||||
|
fileType: f.fileType ?? null,
|
||||||
|
size: f.size,
|
||||||
|
pageCount: f.pageCount,
|
||||||
|
roundName: f.roundId ? roundNames.get(f.roundId) : undefined,
|
||||||
|
isCurrentRound: f.roundId === roundId,
|
||||||
|
textContent: fileContents?.get(f.id),
|
||||||
})),
|
})),
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
|
||||||
|
const { anonymized: anonymizedBase, mappings } = anonymizeProjectsForAI(
|
||||||
|
projectsWithRelations,
|
||||||
|
'FILTERING',
|
||||||
|
)
|
||||||
|
|
||||||
|
if (!validateAnonymizedProjects(anonymizedBase)) {
|
||||||
|
console.error('[AI Shortlist] Anonymization validation failed')
|
||||||
|
return {
|
||||||
|
recommendations: [],
|
||||||
|
tokensUsed: 0,
|
||||||
|
errors: ['GDPR compliance check failed: PII detected in anonymized data'],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Merge anonymized base with per-project aggregates, keyed by mapping order.
|
||||||
|
// Use the same anonymousId scheme the AI prompt expects.
|
||||||
|
const anonymized = anonymizedBase.map((p, index) => {
|
||||||
|
const realId = mappings[index].realId
|
||||||
|
const agg = aggregatesByProjectId.get(realId) ?? {
|
||||||
|
avgScore: 0,
|
||||||
|
evaluationCount: 0,
|
||||||
|
feedbackSamples: [],
|
||||||
|
}
|
||||||
|
return {
|
||||||
|
anonymousId: `PROJECT_${String(index + 1).padStart(3, '0')}`,
|
||||||
|
...p,
|
||||||
|
project_id: undefined,
|
||||||
|
avgScore: agg.avgScore,
|
||||||
|
evaluationCount: agg.evaluationCount,
|
||||||
|
feedbackSamples: agg.feedbackSamples,
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
// Anonymize for AI
|
|
||||||
const anonymized = projectSummaries.map((p: any, index: number) => ({
|
|
||||||
anonymousId: `PROJECT_${String(index + 1).padStart(3, '0')}`,
|
|
||||||
...p,
|
|
||||||
id: undefined,
|
|
||||||
}))
|
|
||||||
|
|
||||||
// Build idMap for de-anonymization
|
// Build idMap for de-anonymization
|
||||||
const idMap = new Map<string, string>()
|
const idMap = new Map<string, string>()
|
||||||
projectSummaries.forEach((p: any, index: number) => {
|
mappings.forEach((m, index) => {
|
||||||
idMap.set(`PROJECT_${String(index + 1).padStart(3, '0')}`, p.id)
|
idMap.set(`PROJECT_${String(index + 1).padStart(3, '0')}`, m.realId)
|
||||||
})
|
})
|
||||||
|
|
||||||
// Call AI
|
// Call AI
|
||||||
|
|||||||
@@ -353,7 +353,9 @@ export function anonymizeProjectForAI(
|
|||||||
...(f.langConfidence != null ? { lang_confidence: f.langConfidence } : {}),
|
...(f.langConfidence != null ? { lang_confidence: f.langConfidence } : {}),
|
||||||
...(f.roundName ? { round_name: f.roundName } : {}),
|
...(f.roundName ? { round_name: f.roundName } : {}),
|
||||||
...(f.isCurrentRound !== undefined ? { is_current_round: f.isCurrentRound } : {}),
|
...(f.isCurrentRound !== undefined ? { is_current_round: f.isCurrentRound } : {}),
|
||||||
...(f.textContent ? { text_content: f.textContent } : {}),
|
// Strip PII patterns (email/phone/url/ssn) from extracted file text
|
||||||
|
// before it leaves the trust boundary to OpenAI.
|
||||||
|
...(f.textContent ? { text_content: sanitizeText(f.textContent) } : {}),
|
||||||
})) ?? [],
|
})) ?? [],
|
||||||
wants_mentorship: project.wantsMentorship ?? false,
|
wants_mentorship: project.wantsMentorship ?? false,
|
||||||
submission_source: project.submissionSource,
|
submission_source: project.submissionSource,
|
||||||
|
|||||||
Reference in New Issue
Block a user