Optimize AI system with batching, token tracking, and GDPR compliance
- Add AIUsageLog model for persistent token/cost tracking
- Implement batched processing for all AI services:
  - Assignment: 15 projects/batch
  - Filtering: 20 projects/batch
  - Award eligibility: 20 projects/batch
  - Mentor matching: 15 projects/batch
- Create unified error classification (ai-errors.ts)
- Enhance anonymization with comprehensive project data
- Add AI usage dashboard to Settings page
- Add usage stats endpoints to settings router
- Create AI system documentation (5 files)
- Create GDPR compliance documentation (2 files)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -3,8 +3,44 @@
|
||||
*
|
||||
* Strips PII (names, emails, etc.) from data before sending to AI services.
|
||||
* Returns ID mappings for de-anonymization of results.
|
||||
*
|
||||
* GDPR Compliance:
|
||||
* - All personal identifiers are stripped before AI processing
|
||||
* - Project/user IDs are replaced with sequential anonymous IDs
|
||||
* - Text content is sanitized to remove emails, phones, URLs
|
||||
* - Validation ensures no PII leakage before each AI call
|
||||
*/
|
||||
|
||||
import type {
|
||||
CompetitionCategory,
|
||||
OceanIssue,
|
||||
FileType,
|
||||
SubmissionSource,
|
||||
} from '@prisma/client'
|
||||
|
||||
// ─── Description Limits ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Maximum number of characters of a project description that is forwarded
 * to the AI, keyed by the use case the description is prepared for.
 */
export const DESCRIPTION_LIMITS = {
  ASSIGNMENT: 300,
  FILTERING: 500,
  ELIGIBILITY: 400,
  MENTOR: 350,
} as const

/** Name of a use case that has an entry in `DESCRIPTION_LIMITS`. */
export type DescriptionContext = keyof typeof DESCRIPTION_LIMITS
|
||||
|
||||
// ─── PII Patterns ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Regular expressions used to detect and strip personal data in free text.
 *
 * Every pattern carries the global flag, so `lastIndex` persists between
 * `test()` calls on the same pattern object; code in this file that uses
 * `test()` resets `lastIndex` to 0 before each check.
 */
const PII_PATTERNS = {
  // Email addresses (local-part@domain.tld)
  email: /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g,
  // 10-digit phone numbers with optional country prefix, parentheses and -, . or space separators
  phone: /(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g,
  // http:// and https:// URLs up to the next whitespace
  url: /https?:\/\/[^\s]+/g,
  // Digit groups shaped like ddd-dd-dddd
  ssn: /\d{3}-\d{2}-\d{4}/g,
  // Dotted-quad sequences shaped like an IPv4 address
  ipv4: /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/g,
} as const
|
||||
|
||||
// ─── Basic Anonymization Types (Assignment Service) ──────────────────────────
|
||||
|
||||
export interface AnonymizedJuror {
|
||||
anonymousId: string
|
||||
expertiseTags: string[]
|
||||
@@ -37,9 +73,67 @@ export interface AnonymizationResult {
|
||||
projectMappings: ProjectMapping[]
|
||||
}
|
||||
|
||||
// ─── Enhanced Project Types (Filtering/Awards) ───────────────────────────────
|
||||
|
||||
/**
 * Comprehensive anonymized project data for AI filtering.
 * Includes all fields needed for flexible filtering criteria.
 */
export interface AnonymizedProjectForAI {
  /** Sequential anonymous ID: P1, P2, etc. */
  project_id: string
  /** Sanitized title */
  title: string
  /** Description, truncated and with PII stripped */
  description: string
  /** STARTUP | BUSINESS_CONCEPT */
  category: CompetitionCategory | null
  /** Enum value */
  ocean_issue: OceanIssue | null
  country: string | null
  /** Taken from the project's geographicZone */
  region: string | null
  institution: string | null
  tags: string[]
  /** Just the year */
  founded_year: number | null
  team_size: number
  has_description: boolean
  file_count: number
  /** FileType values */
  file_types: string[]
  wants_mentorship: boolean
  submission_source: SubmissionSource
  /** YYYY-MM-DD only */
  submitted_date: string | null
}
|
||||
|
||||
/**
 * Input shape for comprehensive anonymization: a project together with the
 * relation counts and file list that the anonymizer summarises.
 */
export interface ProjectWithRelations {
  /** Real project ID; only ever used for the de-anonymization mapping */
  id: string
  title: string
  description?: string | null
  teamName?: string | null
  competitionCategory?: CompetitionCategory | null
  oceanIssue?: OceanIssue | null
  country?: string | null
  geographicZone?: string | null
  institution?: string | null
  tags: string[]
  foundedAt?: Date | null
  wantsMentorship?: boolean
  submissionSource: SubmissionSource
  submittedAt?: Date | null
  /** Relation counts; a missing count is treated as 0 by the anonymizer */
  _count?: {
    teamMembers?: number
    files?: number
  }
  /** Attached files; only the file type of each is read */
  files?: { fileType: FileType | null }[]
}
|
||||
|
||||
/**
 * One entry of the lookup table used for de-anonymization: pairs the
 * anonymous ID sent to the AI with the real project ID.
 */
export interface ProjectAIMapping {
  /** ID exposed to the AI */
  anonymousId: string
  /** Real project ID */
  realId: string
}
|
||||
|
||||
// ─── Basic Anonymization (Assignment Service) ────────────────────────────────
|
||||
|
||||
interface JurorInput {
|
||||
id: string
|
||||
name?: string | null
|
||||
@@ -51,9 +145,6 @@ interface JurorInput {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Project data from database
|
||||
*/
|
||||
interface ProjectInput {
|
||||
id: string
|
||||
title: string
|
||||
@@ -63,13 +154,7 @@ interface ProjectInput {
|
||||
}
|
||||
|
||||
/**
|
||||
* Anonymize juror and project data for AI processing
|
||||
*
|
||||
* This function:
|
||||
* 1. Strips all PII (names, emails) from juror data
|
||||
* 2. Replaces real IDs with sequential anonymous IDs
|
||||
* 3. Keeps only expertise tags and assignment counts
|
||||
* 4. Returns mappings for de-anonymization
|
||||
* Anonymize juror and project data for AI processing (Assignment service)
|
||||
*/
|
||||
export function anonymizeForAI(
|
||||
jurors: JurorInput[],
|
||||
@@ -78,7 +163,6 @@ export function anonymizeForAI(
|
||||
const jurorMappings: JurorMapping[] = []
|
||||
const projectMappings: ProjectMapping[] = []
|
||||
|
||||
// Anonymize jurors
|
||||
const anonymizedJurors: AnonymizedJuror[] = jurors.map((juror, index) => {
|
||||
const anonymousId = `juror_${(index + 1).toString().padStart(3, '0')}`
|
||||
|
||||
@@ -95,7 +179,6 @@ export function anonymizeForAI(
|
||||
}
|
||||
})
|
||||
|
||||
// Anonymize projects (keep content but replace IDs)
|
||||
const anonymizedProjects: AnonymizedProject[] = projects.map(
|
||||
(project, index) => {
|
||||
const anonymousId = `project_${(index + 1).toString().padStart(3, '0')}`
|
||||
@@ -109,10 +192,9 @@ export function anonymizeForAI(
|
||||
anonymousId,
|
||||
title: sanitizeText(project.title),
|
||||
description: project.description
|
||||
? sanitizeText(project.description)
|
||||
? truncateAndSanitize(project.description, DESCRIPTION_LIMITS.ASSIGNMENT)
|
||||
: null,
|
||||
tags: project.tags,
|
||||
// Replace specific team names with generic identifier
|
||||
teamName: project.teamName ? `Team ${index + 1}` : null,
|
||||
}
|
||||
}
|
||||
@@ -126,10 +208,77 @@ export function anonymizeForAI(
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Enhanced Anonymization (Filtering/Awards) ───────────────────────────────
|
||||
|
||||
/**
 * Anonymize a single project with comprehensive data for AI filtering.
 *
 * GDPR Compliance:
 * - Strips email references, phone numbers and URLs from every free-text
 *   field that is forwarded (title, description, institution, tags)
 * - Replaces the real ID with a sequential anonymous ID; the team name is
 *   not forwarded at all
 * - Truncates descriptions to limit data exposure
 * - Keeps only necessary fields for filtering criteria
 *
 * @param project - Project with the counts/files needed for the summary
 * @param index - Zero-based position of the project; becomes `P{index + 1}`
 * @param context - Selects the description length limit from `DESCRIPTION_LIMITS`
 * @returns The anonymized representation to send to the AI
 */
export function anonymizeProjectForAI(
  project: ProjectWithRelations,
  index: number,
  context: DescriptionContext = 'FILTERING'
): AnonymizedProjectForAI {
  const descriptionLimit = DESCRIPTION_LIMITS[context]

  // An Invalid Date would make toISOString() throw and getUTCFullYear()
  // return NaN, so such values are treated the same as a missing date.
  const asValidDate = (value: Date | null | undefined): Date | null =>
    value instanceof Date && !Number.isNaN(value.getTime()) ? value : null

  const foundedAt = asValidDate(project.foundedAt)
  const submittedAt = asValidDate(project.submittedAt)

  return {
    project_id: `P${index + 1}`,
    title: sanitizeText(project.title),
    description: truncateAndSanitize(project.description, descriptionLimit),
    category: project.competitionCategory ?? null,
    ocean_issue: project.oceanIssue ?? null,
    country: project.country ?? null,
    region: project.geographicZone ?? null,
    // Institution and tags are free text and are checked for PII by
    // validateAnonymizedProjects, so they are sanitized like the title.
    institution:
      project.institution != null ? sanitizeText(project.institution) : null,
    tags: project.tags.map((tag) => sanitizeText(tag)),
    // UTC, to agree with submitted_date (toISOString is always UTC) and to
    // keep the result independent of the server's timezone.
    founded_year: foundedAt?.getUTCFullYear() ?? null,
    team_size: project._count?.teamMembers ?? 0,
    has_description: !!project.description?.trim(),
    file_count: project._count?.files ?? 0,
    file_types:
      project.files
        ?.map((f) => f.fileType)
        .filter((ft): ft is FileType => ft !== null) ?? [],
    wants_mentorship: project.wantsMentorship ?? false,
    submission_source: project.submissionSource,
    // Date part only (YYYY-MM-DD); the time of day is not forwarded
    submitted_date: submittedAt?.toISOString().split('T')[0] ?? null,
  }
}
|
||||
|
||||
/**
 * Anonymize a list of projects and build the table needed to map the
 * anonymous IDs back to real project IDs afterwards.
 *
 * @param projects - Projects to anonymize; position decides the anonymous ID
 * @param context - Forwarded to `anonymizeProjectForAI` for every project
 * @returns `anonymized` (one entry per project, same order) and `mappings`
 *          (`P{n}` to real ID, same order)
 */
export function anonymizeProjectsForAI(
  projects: ProjectWithRelations[],
  context: DescriptionContext = 'FILTERING'
): {
  anonymized: AnonymizedProjectForAI[]
  mappings: ProjectAIMapping[]
} {
  const anonymized: AnonymizedProjectForAI[] = []
  const mappings: ProjectAIMapping[] = []

  projects.forEach((project, position) => {
    mappings.push({ anonymousId: `P${position + 1}`, realId: project.id })
    anonymized.push(anonymizeProjectForAI(project, position, context))
  })

  return { anonymized, mappings }
}
|
||||
|
||||
// ─── De-anonymization ────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* De-anonymize AI results back to real IDs
|
||||
*/
|
||||
export function deanonymizeResults<T extends { jurorId: string; projectId: string }>(
|
||||
export function deanonymizeResults<
|
||||
T extends { jurorId: string; projectId: string }
|
||||
>(
|
||||
results: T[],
|
||||
jurorMappings: JurorMapping[],
|
||||
projectMappings: ProjectMapping[]
|
||||
@@ -149,50 +298,155 @@ export function deanonymizeResults<T extends { jurorId: string; projectId: strin
|
||||
}
|
||||
|
||||
/**
 * De-anonymize project-only results (for filtering/awards).
 *
 * @param results - AI results keyed by anonymous `project_id`
 * @param mappings - Anonymous-to-real ID table produced during anonymization
 * @returns Each result with an added `realProjectId`; when no real ID is
 *          found for a `project_id`, the `project_id` itself is used
 */
export function deanonymizeProjectResults<T extends { project_id: string }>(
  results: T[],
  mappings: ProjectAIMapping[]
): (T & { realProjectId: string })[] {
  const realIdByAnonymousId = new Map<string, string>()
  for (const { anonymousId, realId } of mappings) {
    realIdByAnonymousId.set(anonymousId, realId)
  }

  return results.map((entry) => {
    const realProjectId =
      realIdByAnonymousId.get(entry.project_id) || entry.project_id
    return { ...entry, realProjectId }
  })
}
|
||||
|
||||
// ─── Text Sanitization ───────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Sanitize text to remove potential PII patterns.
 *
 * Replaces URLs, email addresses, phone numbers, SSN-like digit groups and
 * IPv4-like addresses with bracketed placeholders, so that the result does
 * not match any entry of `PII_PATTERNS`.
 *
 * @param text - Free text that may contain personal data
 * @returns The text with every match replaced by a placeholder
 */
export function sanitizeText(text: string): string {
  return (
    text
      // URLs go first: a URL may itself contain an email address or a digit
      // run. Replacing those first would put a space inside the URL and
      // leave a dangling " removed]" fragment after the URL replacement.
      .replace(PII_PATTERNS.url, '[url removed]')
      .replace(PII_PATTERNS.email, '[email removed]')
      // Phone numbers (various formats)
      .replace(PII_PATTERNS.phone, '[phone removed]')
      // SSN-like patterns
      .replace(PII_PATTERNS.ssn, '[id removed]')
      // IPv4-like addresses are rejected by the validators in this file, so
      // they have to be stripped here as well.
      .replace(PII_PATTERNS.ipv4, '[ip removed]')
  )
}
|
||||
|
||||
/**
 * Sanitize text and cut it down to a maximum length.
 *
 * The text is sanitized first and truncated afterwards, so the length limit
 * applies to the sanitized form. Truncated output ends in `...` and the
 * ellipsis counts towards the limit.
 *
 * @param text - Text to process; null, undefined and '' yield ''
 * @param maxLength - Maximum length of the returned string in UTF-16 code units
 * @returns The sanitized text, at most `maxLength` code units long
 */
export function truncateAndSanitize(
  text: string | null | undefined,
  maxLength: number
): string {
  if (!text) return ''

  const sanitized = sanitizeText(text)

  if (sanitized.length <= maxLength) {
    return sanitized
  }

  const ellipsis = '...'
  const limit = Number.isFinite(maxLength)
    ? Math.max(0, Math.floor(maxLength))
    : 0

  // With fewer than three characters available there is no room for the
  // ellipsis; a plain cut keeps the result within the limit.
  const roomForEllipsis = limit >= ellipsis.length
  let cut = roomForEllipsis ? limit - ellipsis.length : limit

  // Never end on the first half of a surrogate pair: that would leave an
  // unpaired surrogate (a broken character) at the end of the text.
  if (cut > 0) {
    const lastUnit = sanitized.charCodeAt(cut - 1)
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
      cut -= 1
    }
  }

  const head = sanitized.slice(0, cut)
  return roomForEllipsis ? head + ellipsis : head
}
|
||||
|
||||
// ─── GDPR Compliance Validation ──────────────────────────────────────────────
|
||||
|
||||
/** Outcome of a personal-data check. */
export interface PIIValidationResult {
  /** True when no violation was found */
  valid: boolean
  /** One human-readable message per detected problem; empty when valid */
  violations: string[]
}
|
||||
|
||||
/**
 * Validate that data contains no personal information.
 * Used for GDPR compliance before sending data to AI.
 *
 * Two checks are performed:
 * 1. The JSON serialization of `data` is tested against every pattern in
 *    `PII_PATTERNS`.
 * 2. The top-level keys of `data` are compared, case-insensitively, against
 *    a list of field names that should never be sent.
 *
 * @param data - Object about to be sent to the AI
 * @returns `valid: true` with no violations, or `valid: false` with one
 *          message per finding
 */
export function validateNoPersonalData(
  data: Record<string, unknown>
): PIIValidationResult {
  const violations: string[] = []
  const textContent = JSON.stringify(data)

  // Check each PII pattern
  for (const [type, pattern] of Object.entries(PII_PATTERNS)) {
    // The patterns are global, so test() continues from lastIndex; reset it
    // to make every check start at the beginning of the text.
    pattern.lastIndex = 0

    if (pattern.test(textContent)) {
      violations.push(`Potential ${type} detected in data`)
    }
  }

  // Additional checks for common PII fields
  const sensitiveFields = [
    'email',
    'phone',
    'password',
    'ssn',
    'socialSecurity',
    'creditCard',
    'bankAccount',
    'drivingLicense',
  ]

  // Both sides are lowercased. Comparing lowercased keys against the
  // camelCase names above could never match "creditCard" and friends.
  const presentKeys = new Set(Object.keys(data).map((k) => k.toLowerCase()))
  for (const field of sensitiveFields) {
    if (presentKeys.has(field.toLowerCase())) {
      violations.push(`Sensitive field "${field}" present in data`)
    }
  }

  return {
    valid: violations.length === 0,
    violations,
  }
}
|
||||
|
||||
/**
 * Enforce GDPR compliance before EVERY AI call.
 *
 * Every object item and every string item in `data` is passed through
 * `validateNoPersonalData`; the first item with a violation is logged and
 * aborts the call. Other primitives (numbers, booleans, null, undefined)
 * are not inspected.
 *
 * @param data - Items about to be sent to the AI
 * @throws Error listing the violations of the first offending item
 */
export function enforceGDPRCompliance(data: unknown[]): void {
  for (let i = 0; i < data.length; i++) {
    const item = data[i]

    let candidate: Record<string, unknown>
    if (typeof item === 'string') {
      // A bare string can carry an email or URL just as well as an object
      // field can, so it is wrapped and checked instead of being skipped.
      candidate = { value: item }
    } else if (typeof item === 'object' && item !== null) {
      candidate = item as Record<string, unknown>
    } else {
      continue
    }

    const { valid, violations } = validateNoPersonalData(candidate)
    if (!valid) {
      // Only the violation messages are logged, never the data itself
      console.error(
        `[GDPR] PII validation failed for item ${i}:`,
        violations
      )
      throw new Error(
        `GDPR compliance check failed: ${violations.join(', ')}`
      )
    }
  }
}
|
||||
|
||||
/**
|
||||
* Validate that data has been properly anonymized
|
||||
* Returns true if no PII patterns are detected
|
||||
*/
|
||||
export function validateAnonymization(data: AnonymizationResult): boolean {
|
||||
const piiPatterns = [
|
||||
/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/, // Email
|
||||
/(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/, // Phone
|
||||
]
|
||||
|
||||
const checkText = (text: string | null | undefined): boolean => {
|
||||
if (!text) return true
|
||||
return !piiPatterns.some((pattern) => pattern.test(text))
|
||||
// Reset regex state for each check
|
||||
for (const pattern of Object.values(PII_PATTERNS)) {
|
||||
pattern.lastIndex = 0
|
||||
if (pattern.test(text)) return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// Check jurors (they should only have expertise tags)
|
||||
// Check jurors
|
||||
for (const juror of data.jurors) {
|
||||
// Jurors should not have any text fields that could contain PII
|
||||
// Only check expertiseTags
|
||||
for (const tag of juror.expertiseTags) {
|
||||
if (!checkText(tag)) return false
|
||||
}
|
||||
@@ -209,3 +463,30 @@ export function validateAnonymization(data: AnonymizationResult): boolean {
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
/**
 * Validate anonymized projects for AI (enhanced version).
 *
 * Tests the title, description, institution and every tag of each project
 * against all entries of `PII_PATTERNS`.
 *
 * @param projects - Anonymized projects about to be sent to the AI
 * @returns true when none of the inspected fields matches a PII pattern
 */
export function validateAnonymizedProjects(
  projects: AnonymizedProjectForAI[]
): boolean {
  const containsPII = (text: string | null | undefined): boolean => {
    if (!text) return false
    return Object.values(PII_PATTERNS).some((pattern) => {
      // Global patterns keep their position between test() calls
      pattern.lastIndex = 0
      return pattern.test(text)
    })
  }

  return projects.every((project) => {
    const inspected = [
      project.title,
      project.description,
      project.institution,
      ...project.tags,
    ]
    return !inspected.some((value) => containsPII(value))
  })
}
|
||||
|
||||
Reference in New Issue
Block a user