fix(security): route ai-shortlist through canonical anonymization pipeline

ai-shortlist was sending raw project.description, raw juror feedback text (feedbackGeneral / feedbackText), and full extracted file text content directly to OpenAI as part of the user prompt. Its only "anonymization" was renaming `id` to `anonymousId`.

This bypassed the GDPR contract documented in the file's own header comment ("All project data is anonymized before AI processing — No personal identifiers in prompts") and in CLAUDE.md ("All AI calls anonymize data before sending to OpenAI"). A juror writing "Contact applicant Jane Doe at jane@example.com" in feedback would ship that PII to OpenAI verbatim every time an admin generated a shortlist. The same applies to any names / emails / phone numbers embedded in extracted PDF text.

generateCategoryShortlist now mirrors the pattern used by ai-filtering / ai-tagging / ai-award-eligibility:

- toProjectWithRelations + anonymizeProjectsForAI(_, 'FILTERING')
- validateAnonymizedProjects gate that aborts on detected PII
- Aggregates (avgScore, evaluationCount, feedbackSamples) computed separately and merged onto the anonymized projects; each feedback sample passes through sanitizeText (strips email/phone/url/ssn) and is truncated to 1000 chars.

Defense-in-depth fix in the shared helper: anonymizeProjectForAI now also runs sanitizeText over each file's text_content before emitting it to AI services. Previously the helper passed extracted file text through unchanged, which would have leaked PII from PDF body text via ai-filtering / ai-tagging / ai-award-eligibility too if those services turn on aiParseFiles.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 04:14:58 +02:00
parent fbc42f11fd
commit 7d72ee271f
2 changed files with 88 additions and 27 deletions
--- a/src/server/services/ai-shortlist.ts
+++ b/src/server/services/ai-shortlist.ts
@@ -14,6 +14,12 @@ import { getOpenAI, getConfiguredModel, buildCompletionParams } from '@/lib/open
 import { logAIUsage, extractTokenUsage } from '@/server/utils/ai-usage'
 import { classifyAIError, logAIError } from './ai-errors'
 import { extractMultipleFileContents } from './file-content-extractor'
+import {
+  toProjectWithRelations,
+  anonymizeProjectsForAI,
+  validateAnonymizedProjects,
+  sanitizeText,
+} from './anonymization'
 import type { PrismaClient, CompetitionCategory } from '@prisma/client'

 // ─── Types ──────────────────────────────────────────────────────────────────
@@ -166,8 +172,15 @@ async function generateCategoryShortlist(
    }
  }

-  // Aggregate scores per project
-  const projectSummaries = projects.map((project: any) => {
+  // Aggregate per-project stats and free-text feedback. Sanitize feedback
+  // before it enters the prompt — sanitizeText strips email/phone/url/ssn
+  // patterns embedded in juror free-text. Without this, juror feedback like
+  // "Contact applicant Jane at jane@example.com" leaks PII to OpenAI.
+  const aggregatesByProjectId = new Map<
+    string,
+    { avgScore: number; evaluationCount: number; feedbackSamples: string[] }
+  >()
+  for (const project of projects as any[]) {
    const evaluations = project.assignments
      .map((a: any) => a.evaluation)
      .filter(Boolean)
@@ -178,40 +191,86 @@ async function generateCategoryShortlist(
      ? scores.reduce((sum: number, s: number) => sum + s, 0) / scores.length
      : 0

-    const feedbacks = evaluations
+    const feedbackSamples = evaluations
      .map((e: any) => e.feedbackGeneral || e.feedbackText)
-      .filter(Boolean)
+      .filter((t: unknown): t is string => typeof t === 'string' && t.length > 0)
+      .slice(0, 3)
+      .map((t: string) => sanitizeText(t).slice(0, 1000))

-    return {
-      id: project.id,
-      description: project.description,
-      category: project.competitionCategory,
-      tags: project.projectTags.map((pt: any) => pt.tag.name),
+    aggregatesByProjectId.set(project.id, {
      avgScore,
      evaluationCount: evaluations.length,
-      feedbackSamples: feedbacks.slice(0, 3),
-      files: (project.files || []).map((f: any) => ({
-        file_type: f.fileType ?? 'OTHER',
-        page_count: f.pageCount ?? null,
-        size_kb: Math.round((f.size ?? 0) / 1024),
-        round_name: f.roundId ? (roundNames.get(f.roundId) || null) : null,
-        is_current_round: f.roundId === roundId,
-        ...(fileContents?.get(f.id) ? { text_content: fileContents.get(f.id) } : {}),
+      feedbackSamples,
+    })
+  }
+
+  // Route every project through the canonical anonymization pipeline so
+  // description/title/institution are PII-stripped, free-text is truncated,
+  // and file text_content is sanitized (handled in anonymizeProjectForAI).
+  const projectsWithRelations = (projects as any[]).map((p) =>
+    toProjectWithRelations({
+      id: p.id,
+      title: p.title,
+      description: p.description,
+      competitionCategory: p.competitionCategory,
+      oceanIssue: p.oceanIssue ?? null,
+      country: p.country ?? null,
+      geographicZone: p.geographicZone ?? null,
+      institution: p.institution ?? null,
+      tags: (p.projectTags ?? []).map((pt: any) => pt.tag.name),
+      foundedAt: p.foundedAt ?? null,
+      wantsMentorship: p.wantsMentorship ?? false,
+      submissionSource: p.submissionSource ?? 'MANUAL',
+      submittedAt: p.submittedAt ?? null,
+      _count: { teamMembers: p.teamMembers?.length ?? 0, files: p.files?.length ?? 0 },
+      files: (p.files ?? []).map((f: any) => ({
+        fileType: f.fileType ?? null,
+        size: f.size,
+        pageCount: f.pageCount,
+        roundName: f.roundId ? roundNames.get(f.roundId) : undefined,
+        isCurrentRound: f.roundId === roundId,
+        textContent: fileContents?.get(f.id),
      })),
+    }),
+  )
+
+  const { anonymized: anonymizedBase, mappings } = anonymizeProjectsForAI(
+    projectsWithRelations,
+    'FILTERING',
+  )
+
+  if (!validateAnonymizedProjects(anonymizedBase)) {
+    console.error('[AI Shortlist] Anonymization validation failed')
+    return {
+      recommendations: [],
+      tokensUsed: 0,
+      errors: ['GDPR compliance check failed: PII detected in anonymized data'],
+    }
+  }
+
+  // Merge anonymized base with per-project aggregates, keyed by mapping order.
+  // Use the same anonymousId scheme the AI prompt expects.
+  const anonymized = anonymizedBase.map((p, index) => {
+    const realId = mappings[index].realId
+    const agg = aggregatesByProjectId.get(realId) ?? {
+      avgScore: 0,
+      evaluationCount: 0,
+      feedbackSamples: [],
+    }
+    return {
+      anonymousId: `PROJECT_${String(index + 1).padStart(3, '0')}`,
+      ...p,
+      project_id: undefined,
+      avgScore: agg.avgScore,
+      evaluationCount: agg.evaluationCount,
+      feedbackSamples: agg.feedbackSamples,
    }
  })

-  // Anonymize for AI
-  const anonymized = projectSummaries.map((p: any, index: number) => ({
-    anonymousId: `PROJECT_${String(index + 1).padStart(3, '0')}`,
-    ...p,
-    id: undefined,
-  }))
-
  // Build idMap for de-anonymization
  const idMap = new Map<string, string>()
-  projectSummaries.forEach((p: any, index: number) => {
-    idMap.set(`PROJECT_${String(index + 1).padStart(3, '0')}`, p.id)
+  mappings.forEach((m, index) => {
+    idMap.set(`PROJECT_${String(index + 1).padStart(3, '0')}`, m.realId)
  })

  // Call AI
--- a/src/server/services/anonymization.ts
+++ b/src/server/services/anonymization.ts
@@ -353,7 +353,9 @@ export function anonymizeProjectForAI(
      ...(f.langConfidence != null ? { lang_confidence: f.langConfidence } : {}),
      ...(f.roundName ? { round_name: f.roundName } : {}),
      ...(f.isCurrentRound !== undefined ? { is_current_round: f.isCurrentRound } : {}),
-      ...(f.textContent ? { text_content: f.textContent } : {}),
+      // Strip PII patterns (email/phone/url/ssn) from extracted file text
+      // before it leaves the trust boundary to OpenAI.
+      ...(f.textContent ? { text_content: sanitizeText(f.textContent) } : {}),
    })) ?? [],
    wants_mentorship: project.wantsMentorship ?? false,
    submission_source: project.submissionSource,