From 7d72ee271f7f608f479709483bef2a17e7769a5d Mon Sep 17 00:00:00 2001
From: Matt
Date: Wed, 29 Apr 2026 04:14:58 +0200
Subject: [PATCH] fix(security): route ai-shortlist through canonical anonymization pipeline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ai-shortlist was sending raw project.description, raw juror feedback
text (feedbackGeneral / feedbackText), and full extracted file text
content directly to OpenAI as part of the user prompt. Its only
"anonymization" was renaming `id` to `anonymousId`.

This bypassed the GDPR contract documented in the file's own header
comment ("All project data is anonymized before AI processing — No
personal identifiers in prompts") and in CLAUDE.md ("All AI calls
anonymize data before sending to OpenAI"). A juror writing "Contact
applicant Jane Doe at jane@example.com" in feedback would ship that PII
to OpenAI verbatim every time an admin generated a shortlist. The same
applied to any names / emails / phone numbers embedded in extracted PDF
text.

generateCategoryShortlist now mirrors the pattern used by ai-filtering /
ai-tagging / ai-award-eligibility:

- toProjectWithRelations + anonymizeProjectsForAI(_, 'FILTERING')
- validateAnonymizedProjects gate that aborts on detected PII
- Aggregates (avgScore, evaluationCount, feedbackSamples) computed
  separately and merged onto the anonymized projects; each feedback
  sample passes through sanitizeText (strips email/phone/url/ssn) and
  is truncated to 1000 chars.

Defense-in-depth fix in the shared helper: anonymizeProjectForAI now
also runs sanitizeText over each file's text_content before emitting it
to AI services. Previously the helper passed extracted file text through
unchanged, which would also have leaked PII from PDF body text via
ai-filtering / ai-tagging / ai-award-eligibility if those services
turned on aiParseFiles.

Co-Authored-By: Claude Opus 4.7 (1M context) --- src/server/services/ai-shortlist.ts | 111 ++++++++++++++++++++------- src/server/services/anonymization.ts | 4 +- 2 files changed, 88 insertions(+), 27 deletions(-) diff --git a/src/server/services/ai-shortlist.ts b/src/server/services/ai-shortlist.ts index 7ba390f..981d7e5 100644 --- a/src/server/services/ai-shortlist.ts +++ b/src/server/services/ai-shortlist.ts @@ -14,6 +14,12 @@ import { getOpenAI, getConfiguredModel, buildCompletionParams } from '@/lib/open import { logAIUsage, extractTokenUsage } from '@/server/utils/ai-usage' import { classifyAIError, logAIError } from './ai-errors' import { extractMultipleFileContents } from './file-content-extractor' +import { + toProjectWithRelations, + anonymizeProjectsForAI, + validateAnonymizedProjects, + sanitizeText, +} from './anonymization' import type { PrismaClient, CompetitionCategory } from '@prisma/client' // ─── Types ────────────────────────────────────────────────────────────────── @@ -166,8 +172,15 @@ async function generateCategoryShortlist( } } - // Aggregate scores per project - const projectSummaries = projects.map((project: any) => { + // Aggregate per-project stats and free-text feedback. Sanitize feedback + // before it enters the prompt — sanitizeText strips email/phone/url/ssn + // patterns embedded in juror free-text. Without this, juror feedback like + // "Contact applicant Jane at jane@example.com" leaks PII to OpenAI. + const aggregatesByProjectId = new Map< + string, + { avgScore: number; evaluationCount: number; feedbackSamples: string[] } + >() + for (const project of projects as any[]) { const evaluations = project.assignments .map((a: any) => a.evaluation) .filter(Boolean) @@ -178,40 +191,86 @@ async function generateCategoryShortlist( ? 
scores.reduce((sum: number, s: number) => sum + s, 0) / scores.length : 0 - const feedbacks = evaluations + const feedbackSamples = evaluations .map((e: any) => e.feedbackGeneral || e.feedbackText) - .filter(Boolean) + .filter((t: unknown): t is string => typeof t === 'string' && t.length > 0) + .slice(0, 3) + .map((t: string) => sanitizeText(t).slice(0, 1000)) - return { - id: project.id, - description: project.description, - category: project.competitionCategory, - tags: project.projectTags.map((pt: any) => pt.tag.name), + aggregatesByProjectId.set(project.id, { avgScore, evaluationCount: evaluations.length, - feedbackSamples: feedbacks.slice(0, 3), - files: (project.files || []).map((f: any) => ({ - file_type: f.fileType ?? 'OTHER', - page_count: f.pageCount ?? null, - size_kb: Math.round((f.size ?? 0) / 1024), - round_name: f.roundId ? (roundNames.get(f.roundId) || null) : null, - is_current_round: f.roundId === roundId, - ...(fileContents?.get(f.id) ? { text_content: fileContents.get(f.id) } : {}), + feedbackSamples, + }) + } + + // Route every project through the canonical anonymization pipeline so + // description/title/institution are PII-stripped, free-text is truncated, + // and file text_content is sanitized (handled in anonymizeProjectForAI). + const projectsWithRelations = (projects as any[]).map((p) => + toProjectWithRelations({ + id: p.id, + title: p.title, + description: p.description, + competitionCategory: p.competitionCategory, + oceanIssue: p.oceanIssue ?? null, + country: p.country ?? null, + geographicZone: p.geographicZone ?? null, + institution: p.institution ?? null, + tags: (p.projectTags ?? []).map((pt: any) => pt.tag.name), + foundedAt: p.foundedAt ?? null, + wantsMentorship: p.wantsMentorship ?? false, + submissionSource: p.submissionSource ?? 'MANUAL', + submittedAt: p.submittedAt ?? null, + _count: { teamMembers: p.teamMembers?.length ?? 0, files: p.files?.length ?? 0 }, + files: (p.files ?? 
[]).map((f: any) => ({ + fileType: f.fileType ?? null, + size: f.size, + pageCount: f.pageCount, + roundName: f.roundId ? roundNames.get(f.roundId) : undefined, + isCurrentRound: f.roundId === roundId, + textContent: fileContents?.get(f.id), })), + }), + ) + + const { anonymized: anonymizedBase, mappings } = anonymizeProjectsForAI( + projectsWithRelations, + 'FILTERING', + ) + + if (!validateAnonymizedProjects(anonymizedBase)) { + console.error('[AI Shortlist] Anonymization validation failed') + return { + recommendations: [], + tokensUsed: 0, + errors: ['GDPR compliance check failed: PII detected in anonymized data'], + } + } + + // Merge anonymized base with per-project aggregates, keyed by mapping order. + // Use the same anonymousId scheme the AI prompt expects. + const anonymized = anonymizedBase.map((p, index) => { + const realId = mappings[index].realId + const agg = aggregatesByProjectId.get(realId) ?? { + avgScore: 0, + evaluationCount: 0, + feedbackSamples: [], + } + return { + anonymousId: `PROJECT_${String(index + 1).padStart(3, '0')}`, + ...p, + project_id: undefined, + avgScore: agg.avgScore, + evaluationCount: agg.evaluationCount, + feedbackSamples: agg.feedbackSamples, } }) - // Anonymize for AI - const anonymized = projectSummaries.map((p: any, index: number) => ({ - anonymousId: `PROJECT_${String(index + 1).padStart(3, '0')}`, - ...p, - id: undefined, - })) - // Build idMap for de-anonymization const idMap = new Map() - projectSummaries.forEach((p: any, index: number) => { - idMap.set(`PROJECT_${String(index + 1).padStart(3, '0')}`, p.id) + mappings.forEach((m, index) => { + idMap.set(`PROJECT_${String(index + 1).padStart(3, '0')}`, m.realId) }) // Call AI diff --git a/src/server/services/anonymization.ts b/src/server/services/anonymization.ts index 58570fb..883dd8a 100644 --- a/src/server/services/anonymization.ts +++ b/src/server/services/anonymization.ts @@ -353,7 +353,9 @@ export function anonymizeProjectForAI( ...(f.langConfidence != null ? 
{ lang_confidence: f.langConfidence } : {}), ...(f.roundName ? { round_name: f.roundName } : {}), ...(f.isCurrentRound !== undefined ? { is_current_round: f.isCurrentRound } : {}), - ...(f.textContent ? { text_content: f.textContent } : {}), + // Strip PII patterns (email/phone/url/ssn) from extracted file text + // before it leaves the trust boundary to OpenAI. + ...(f.textContent ? { text_content: sanitizeText(f.textContent) } : {}), })) ?? [], wants_mentorship: project.wantsMentorship ?? false, submission_source: project.submissionSource,