Optimize AI system with batching, token tracking, and GDPR compliance

- Add AIUsageLog model for persistent token/cost tracking
- Implement batched processing for all AI services:
  - Assignment: 15 projects/batch
  - Filtering: 20 projects/batch
  - Award eligibility: 20 projects/batch
  - Mentor matching: 15 projects/batch
- Create unified error classification (ai-errors.ts)
- Enhance anonymization with comprehensive project data
- Add AI usage dashboard to Settings page
- Add usage stats endpoints to settings router
- Create AI system documentation (5 files)
- Create GDPR compliance documentation (2 files)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 11:58:12 +01:00
parent a72e815d3a
commit 928b1c65dc
19 changed files with 4103 additions and 601 deletions
--- a/src/server/services/anonymization.ts
+++ b/src/server/services/anonymization.ts
@@ -3,8 +3,44 @@
 *
 * Strips PII (names, emails, etc.) from data before sending to AI services.
 * Returns ID mappings for de-anonymization of results.
+ *
+ * GDPR Compliance:
+ * - All personal identifiers are stripped before AI processing
+ * - Project/user IDs are replaced with sequential anonymous IDs
+ * - Text content is sanitized to remove emails, phones, URLs
+ * - Validation ensures no PII leakage before each AI call
 */

+import type {
+  CompetitionCategory,
+  OceanIssue,
+  FileType,
+  SubmissionSource,
+} from '@prisma/client'
+
+// ─── Description Limits ──────────────────────────────────────────────────────
+
+export const DESCRIPTION_LIMITS = { // max description length in characters, per AI context
+  ASSIGNMENT: 300,
+  FILTERING: 500,
+  ELIGIBILITY: 400,
+  MENTOR: 350,
+} as const
+
+export type DescriptionContext = keyof typeof DESCRIPTION_LIMITS // 'ASSIGNMENT' | 'FILTERING' | 'ELIGIBILITY' | 'MENTOR'
+
+// ─── PII Patterns ────────────────────────────────────────────────────────────
+
+const PII_PATTERNS = { // all global (/g): lastIndex is stateful, so reset it before calling .test()
+  email: /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g,
+  phone: /(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g, // 3-3-4 digit groups, optional country code
+  url: /https?:\/\/[^\s]+/g, // http(s) only, up to the next whitespace
+  ssn: /\d{3}-\d{2}-\d{4}/g, // NNN-NN-NNNN
+  ipv4: /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/g, // dotted quad; octet range is not checked
+} as const
+
+// ─── Basic Anonymization Types (Assignment Service) ──────────────────────────
+
 export interface AnonymizedJuror {
  anonymousId: string
  expertiseTags: string[]
@@ -37,9 +73,67 @@ export interface AnonymizationResult {
  projectMappings: ProjectMapping[]
 }

+// ─── Enhanced Project Types (Filtering/Awards) ───────────────────────────────
+
 /**
- * Juror data from database
+ * Comprehensive anonymized project data for AI filtering
+ * Includes all fields needed for flexible filtering criteria
 */
+export interface AnonymizedProjectForAI {
+  project_id: string // Anonymous ID: "P1", "P2", ... (index + 1)
+  title: string // Passed through sanitizeText
+  description: string // Sanitized, then truncated to the context's limit; '' when missing
+  category: CompetitionCategory | null // STARTUP | BUSINESS_CONCEPT
+  ocean_issue: OceanIssue | null // Enum value
+  country: string | null // Copied from the project as-is
+  region: string | null // From geographicZone, copied as-is
+  institution: string | null
+  tags: string[]
+  founded_year: number | null // Year of foundedAt only
+  team_size: number // _count.teamMembers, 0 when absent
+  has_description: boolean // True when the original description is non-blank
+  file_count: number // _count.files, 0 when absent
+  file_types: string[] // Non-null FileType values of the project's files
+  wants_mentorship: boolean // false when not set on the project
+  submission_source: SubmissionSource
+  submitted_date: string | null // Date part of submittedAt.toISOString() (YYYY-MM-DD)
+}
+
+/**
+ * Project input (with optional relation data) accepted by anonymizeProjectForAI
+ */
+export interface ProjectWithRelations {
+  id: string // Real ID; recorded only in ProjectAIMapping, never in the anonymized output
+  title: string
+  description?: string | null
+  teamName?: string | null // Not read by anonymizeProjectForAI
+  competitionCategory?: CompetitionCategory | null
+  oceanIssue?: OceanIssue | null
+  country?: string | null
+  geographicZone?: string | null // Emitted as "region"
+  institution?: string | null
+  tags: string[]
+  foundedAt?: Date | null // Only the year is emitted
+  wantsMentorship?: boolean
+  submissionSource: SubmissionSource
+  submittedAt?: Date | null // Only the YYYY-MM-DD part is emitted
+  _count?: { // Relation counts; each falls back to 0 when absent
+    teamMembers?: number
+    files?: number
+  }
+  files?: Array<{ fileType: FileType | null }> // Only fileType is read; null entries are dropped
+}
+
+/**
+ * Pairs an anonymous project ID with the real one, for de-anonymizing AI results
+ */
+export interface ProjectAIMapping {
+  anonymousId: string // "P1", "P2", ...
+  realId: string // Original project.id
+}
+
+// ─── Basic Anonymization (Assignment Service) ────────────────────────────────
+
 interface JurorInput {
  id: string
  name?: string | null
@@ -51,9 +145,6 @@ interface JurorInput {
  }
 }

-/**
- * Project data from database
- */
 interface ProjectInput {
  id: string
  title: string
@@ -63,13 +154,7 @@ interface ProjectInput {
 }

 /**
- * Anonymize juror and project data for AI processing
- *
- * This function:
- * 1. Strips all PII (names, emails) from juror data
- * 2. Replaces real IDs with sequential anonymous IDs
- * 3. Keeps only expertise tags and assignment counts
- * 4. Returns mappings for de-anonymization
+ * Anonymize juror and project data for AI processing (Assignment service)
 */
 export function anonymizeForAI(
  jurors: JurorInput[],
@@ -78,7 +163,6 @@ export function anonymizeForAI(
  const jurorMappings: JurorMapping[] = []
  const projectMappings: ProjectMapping[] = []

-  // Anonymize jurors
  const anonymizedJurors: AnonymizedJuror[] = jurors.map((juror, index) => {
    const anonymousId = `juror_${(index + 1).toString().padStart(3, '0')}`

@@ -95,7 +179,6 @@ export function anonymizeForAI(
    }
  })

-  // Anonymize projects (keep content but replace IDs)
  const anonymizedProjects: AnonymizedProject[] = projects.map(
    (project, index) => {
      const anonymousId = `project_${(index + 1).toString().padStart(3, '0')}`
@@ -109,10 +192,9 @@ export function anonymizeForAI(
        anonymousId,
        title: sanitizeText(project.title),
        description: project.description
-          ? sanitizeText(project.description)
+          ? truncateAndSanitize(project.description, DESCRIPTION_LIMITS.ASSIGNMENT)
          : null,
        tags: project.tags,
-        // Replace specific team names with generic identifier
        teamName: project.teamName ? `Team ${index + 1}` : null,
      }
    }
@@ -126,10 +208,77 @@ export function anonymizeForAI(
  }
 }

+// ─── Enhanced Anonymization (Filtering/Awards) ───────────────────────────────
+
+/**
+ * Anonymize a single project with comprehensive data for AI filtering
+ *
+ * GDPR Compliance:
+ * - Never emits the real id or team name; title, description, institution and tags go through sanitizeText
+ * - Replaces IDs with sequential anonymous IDs (P1, P2, ...)
+ * - Truncates descriptions to the limit for `context` to limit data exposure
+ * - Reduces dates to a UTC year / YYYY-MM-DD and keeps only fields needed for filtering criteria
+ */
+export function anonymizeProjectForAI(
+  project: ProjectWithRelations,
+  index: number,
+  context: DescriptionContext = 'FILTERING'
+): AnonymizedProjectForAI {
+  const descriptionLimit = DESCRIPTION_LIMITS[context]
+
+  return {
+    project_id: `P${index + 1}`,
+    title: sanitizeText(project.title),
+    description: truncateAndSanitize(project.description, descriptionLimit),
+    category: project.competitionCategory ?? null,
+    ocean_issue: project.oceanIssue ?? null,
+    country: project.country ?? null,
+    region: project.geographicZone ?? null,
+    institution: project.institution != null ? sanitizeText(project.institution) : null, // checked by validateAnonymizedProjects, so strip PII here
+    tags: project.tags.map((tag) => sanitizeText(tag)), // tags are validated too
+    founded_year: project.foundedAt?.getUTCFullYear() ?? null, // UTC, consistent with submitted_date
+    team_size: project._count?.teamMembers ?? 0,
+    has_description: !!project.description?.trim(),
+    file_count: project._count?.files ?? 0,
+    file_types: project.files
+      ?.map((f) => f.fileType)
+      .filter((ft): ft is FileType => ft !== null) ?? [],
+    wants_mentorship: project.wantsMentorship ?? false,
+    submission_source: project.submissionSource,
+    submitted_date: project.submittedAt?.toISOString().split('T')[0] ?? null,
+  }
+}
+
+/**
+ * Anonymize projects in input order; each mapping pairs `P${index + 1}` with the real project id
+ */
+export function anonymizeProjectsForAI(
+  projects: ProjectWithRelations[],
+  context: DescriptionContext = 'FILTERING'
+): {
+  anonymized: AnonymizedProjectForAI[]
+  mappings: ProjectAIMapping[]
+} {
+  const mappings: ProjectAIMapping[] = []
+  const anonymized = projects.map((project, index) => {
+    mappings.push({
+      anonymousId: `P${index + 1}`, // must match the project_id produced by anonymizeProjectForAI
+      realId: project.id,
+    })
+    return anonymizeProjectForAI(project, index, context)
+  })
+
+  return { anonymized, mappings }
+}
+
+// ─── De-anonymization ────────────────────────────────────────────────────────
+
 /**
 * De-anonymize AI results back to real IDs
 */
-export function deanonymizeResults<T extends { jurorId: string; projectId: string }>(
+export function deanonymizeResults<
+  T extends { jurorId: string; projectId: string }
+>(
  results: T[],
  jurorMappings: JurorMapping[],
  projectMappings: ProjectMapping[]
@@ -149,50 +298,155 @@ export function deanonymizeResults<T extends { jurorId: string; projectId: strin
 }

 /**
- * Sanitize text to remove potential PII patterns
- * Removes emails, phone numbers, and URLs from text
+ * De-anonymize project-only results (for filtering/awards)
 */
-function sanitizeText(text: string): string {
+export function deanonymizeProjectResults<T extends { project_id: string }>(
+  results: T[],
+  mappings: ProjectAIMapping[]
+): (T & { realProjectId: string })[] {
+  const projectMap = new Map(mappings.map((m) => [m.anonymousId, m.realId])) // anonymousId -> realId lookup
+
+  return results.map((result) => ({
+    ...result,
+    realProjectId: projectMap.get(result.project_id) || result.project_id, // unmapped (or empty) IDs fall back to the ID as received
+  }))
+}
+
+// ─── Text Sanitization ───────────────────────────────────────────────────────
+
+/**
+ * Sanitize text to remove potential PII patterns
+ * Replaces emails, phone numbers, URLs, SSN-like IDs and IPv4 addresses with "[... removed]" placeholders
+ */
+export function sanitizeText(text: string): string {
+  let sanitized = text
+
  // Remove email addresses
-  let sanitized = text.replace(
-    /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g,
-    '[email removed]'
-  )
+  sanitized = sanitized.replace(PII_PATTERNS.email, '[email removed]')

  // Remove phone numbers (various formats)
-  sanitized = sanitized.replace(
-    /(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g,
-    '[phone removed]'
-  )
+  sanitized = sanitized.replace(PII_PATTERNS.phone, '[phone removed]')

  // Remove URLs
-  sanitized = sanitized.replace(
-    /https?:\/\/[^\s]+/g,
-    '[url removed]'
-  )
+  sanitized = sanitized.replace(PII_PATTERNS.url, '[url removed]')
+
+  // Remove SSN-like patterns, then IPv4 addresses (the validators test every PII pattern, ipv4 included)
+  sanitized = sanitized.replace(PII_PATTERNS.ssn, '[id removed]').replace(PII_PATTERNS.ipv4, '[ip removed]')

  return sanitized
 }

+/**
+ * Sanitize text, then cap it at `maxLength` characters ('...' marks a cut); returns '' for empty input
+ */
+export function truncateAndSanitize(
+  text: string | null | undefined,
+  maxLength: number
+): string {
+  if (!text) return ''
+
+  const sanitized = sanitizeText(text)
+  if (sanitized.length <= maxLength) return sanitized
+
+  // No room for an ellipsis: hard-cut so the result never exceeds maxLength (a negative end index would)
+  if (maxLength < 3) return sanitized.slice(0, Math.max(0, maxLength))
+  // Reserve three characters for the ellipsis
+  return sanitized.slice(0, maxLength - 3) + '...'
+}
+
+// ─── GDPR Compliance Validation ──────────────────────────────────────────────
+
+export interface PIIValidationResult {
+  valid: boolean // True when violations is empty
+  violations: string[] // One human-readable message per finding
+}
+
+/**
+ * Scan `data` (serialized to JSON) for PII patterns and for sensitive top-level key names
+ * Used for GDPR compliance before sending data to AI; returns every violation found
+ */
+export function validateNoPersonalData(
+  data: Record<string, unknown>
+): PIIValidationResult {
+  const violations: string[] = []
+  const textContent = JSON.stringify(data)
+
+  // Check each PII pattern
+  for (const [type, pattern] of Object.entries(PII_PATTERNS)) {
+    // Reset regex state (global flag)
+    pattern.lastIndex = 0
+
+    if (pattern.test(textContent)) {
+      violations.push(`Potential ${type} detected in data`)
+    }
+  }
+
+  // Sensitive key names; compared case-insensitively so the camelCase entries can match
+  const sensitiveFields = [
+    'email',
+    'phone',
+    'password',
+    'ssn',
+    'socialSecurity',
+    'creditCard',
+    'bankAccount',
+    'drivingLicense',
+  ]
+
+  const keys = Object.keys(data).map((k) => k.toLowerCase())
+  for (const field of sensitiveFields) {
+    if (keys.includes(field.toLowerCase())) {
+      violations.push(`Sensitive field "${field}" present in data`)
+    }
+  }
+
+  return {
+    valid: violations.length === 0,
+    violations,
+  }
+}
+
+/**
+ * Call before EVERY AI request: validates each object item (non-object items are skipped)
+ * Logs the violations and throws an Error for the first item that fails
+ */
+export function enforceGDPRCompliance(data: unknown[]): void {
+  for (let i = 0; i < data.length; i++) {
+    const item = data[i]
+    if (typeof item === 'object' && item !== null) { // primitives and null are not inspected
+      const { valid, violations } = validateNoPersonalData(
+        item as Record<string, unknown>
+      )
+      if (!valid) {
+        console.error(
+          `[GDPR] PII validation failed for item ${i}:`,
+          violations
+        )
+        throw new Error(
+          `GDPR compliance check failed: ${violations.join(', ')}`
+        )
+      }
+    }
+  }
+}
+
 /**
 * Validate that data has been properly anonymized
 * Returns true if no PII patterns are detected
 */
 export function validateAnonymization(data: AnonymizationResult): boolean {
-  const piiPatterns = [
-    /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/, // Email
-    /(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/, // Phone
-  ]
-
  const checkText = (text: string | null | undefined): boolean => {
    if (!text) return true
-    return !piiPatterns.some((pattern) => pattern.test(text))
+    // Reset regex state for each check
+    for (const pattern of Object.values(PII_PATTERNS)) {
+      pattern.lastIndex = 0
+      if (pattern.test(text)) return false
+    }
+    return true
  }

-  // Check jurors (they should only have expertise tags)
+  // Check jurors
  for (const juror of data.jurors) {
-    // Jurors should not have any text fields that could contain PII
-    // Only check expertiseTags
    for (const tag of juror.expertiseTags) {
      if (!checkText(tag)) return false
    }
@@ -209,3 +463,30 @@ export function validateAnonymization(data: AnonymizationResult): boolean {

  return true
 }
+
+/**
+ * Return false if any project's title, description, institution or tag matches a PII pattern
+ */
+export function validateAnonymizedProjects(
+  projects: AnonymizedProjectForAI[]
+): boolean {
+  const checkText = (text: string | null | undefined): boolean => { // true when text is empty or PII-free
+    if (!text) return true
+    for (const pattern of Object.values(PII_PATTERNS)) {
+      pattern.lastIndex = 0 // global regexes are stateful; restart each test from index 0
+      if (pattern.test(text)) return false
+    }
+    return true
+  }
+
+  for (const project of projects) {
+    if (!checkText(project.title)) return false
+    if (!checkText(project.description)) return false
+    if (!checkText(project.institution)) return false
+    for (const tag of project.tags) {
+      if (!checkText(tag)) return false
+    }
+  }
+
+  return true
+}