MOPC-Portal/src/server/services/ai-shortlist.ts

/**
 * AI Shortlist Service
 *
 * Generates ranked recommendations at end of evaluation rounds.
 * Runs SEPARATELY for each category (STARTUP / BUSINESS_CONCEPT)
 * to produce independent rankings per the competition's advancement rules.
 *
 * GDPR Compliance:
 * - All project data is anonymized before AI processing
 * - No personal identifiers in prompts or responses
 */

import { getOpenAI, getConfiguredModel, buildCompletionParams } from '@/lib/openai'
import { logAIUsage, extractTokenUsage } from '@/server/utils/ai-usage'
import { classifyAIError, logAIError } from './ai-errors'
import { extractMultipleFileContents } from './file-content-extractor'
import {
  toProjectWithRelations,
  anonymizeProjectsForAI,
  validateAnonymizedProjects,
  sanitizeText,
} from './anonymization'
import type { PrismaClient, CompetitionCategory } from '@prisma/client'

// ─── Types ──────────────────────────────────────────────────────────────────

/**
 * Overall outcome of a shortlist run, as returned by `generateShortlist`.
 */
export type ShortlistResult = {
  /** `false` when generation failed and no recommendations are returned. */
  success: boolean
  /** Ranked recommendations, one list per competition category. */
  recommendations: CategoryRecommendations
  /** Error messages collected while processing the categories; omitted when there were none. */
  errors?: string[]
  /** Sum of the token counts reported by the category runs; not set on the failure path. */
  tokensUsed?: number
}

/**
 * Recommendations bucketed by competition category. A category that was not
 * processed (or produced nothing) holds an empty array.
 */
export type CategoryRecommendations = {
  /** Ranked shortlist for STARTUP projects. */
  STARTUP: ShortlistRecommendation[]
  /** Ranked shortlist for BUSINESS_CONCEPT projects. */
  BUSINESS_CONCEPT: ShortlistRecommendation[]
}

/**
 * One de-anonymized entry of an AI-produced ranking.
 */
export type ShortlistRecommendation = {
  /** Real project id, restored from the anonymous id the AI was shown. */
  projectId: string
  /** Position assigned by the AI; 0 when the response carried no rank. */
  rank: number
  /** Score assigned by the AI (the prompt asks for 0-100); 0 when missing. */
  score: number
  /** Category the ranking was generated for. */
  category: string
  /** Strengths listed by the AI for this project. */
  strengths: string[]
  /** Concerns listed by the AI for this project. */
  concerns: string[]
  /** Short free-text recommendation (the prompt asks for 1-2 sentences). */
  recommendation: string
}

// ─── Prompt Building ────────────────────────────────────────────────────────

/**
 * Builds the system prompt for ranking one category's projects.
 *
 * @param category - Category key; `'STARTUP'` is labelled "Startup", any other
 *   value is labelled "Business Concept".
 * @param topN - Number of projects the model is asked to keep in its ranking.
 * @param rubric - Optional custom rubric; when non-empty it is inserted as its
 *   own section just before the output-format section.
 * @returns The complete prompt text.
 */
function buildShortlistPrompt(category: string, topN: number, rubric?: string): string {
  let label = 'Business Concept'
  if (category === 'STARTUP') {
    label = 'Startup'
  }

  // Empty string when no rubric is supplied; the blank slot in the line list
  // below still contributes its newline, matching the fixed prompt layout.
  const rubricSection = rubric ? '## Custom Evaluation Rubric\n' + rubric + '\n' : ''

  const lines: string[] = [
    'You are a senior jury advisor for the Monaco Ocean Protection Challenge.',
    '',
    '## Your Role',
    `Analyze aggregated evaluation data to produce a ranked shortlist of the top ${topN} ${label} projects.`,
    `You are evaluating ONLY ${label} projects in this batch — rank them against each other within this category.`,
    '',
    '## Ranking Criteria (Weighted)',
    '- Evaluation Scores (40%): Average scores across all jury evaluations',
    '- Innovation & Impact (25%): Novelty of approach and potential environmental impact',
    '- Feasibility (20%): Likelihood of successful implementation',
    '- Alignment (15%): Fit with ocean protection mission and competition goals',
    '',
    '## Document Analysis',
    'If document content is provided (text_content field in files), use it for deeper qualitative analysis.',
    'Pay SPECIAL ATTENTION to files marked with is_current_round=true — these are the most recent submissions.',
    'Older documents provide context, but recent ones should carry more weight in your assessment.',
    '',
    rubricSection,
    '## Output Format',
    'Return a JSON array:',
    '[',
    '  {',
    '    "anonymousId": "PROJECT_001",',
    '    "rank": 1,',
    '    "score": 0-100,',
    '    "strengths": ["strength 1", "strength 2"],',
    '    "concerns": ["concern 1"],',
    '    "recommendation": "1-2 sentence recommendation"',
    '  }',
    ']',
    '',
    '## Guidelines',
    `- Only include the top ${topN} projects in your ranking`,
    '- Score should reflect weighted combination of all criteria',
    '- Be specific in strengths and concerns — avoid generic statements',
    '- Consider feedback themes and evaluator consensus',
    '- Higher evaluator consensus should boost confidence in ranking',
    '- Do not include any personal identifiers',
  ]

  return lines.join('\n')
}

// ─── Single Category Processing ─────────────────────────────────────────────

/** Wrapper keys checked, in order, when the model returns an object instead of a bare array. */
const SHORTLIST_WRAPPER_KEYS = ['rankings', 'projects', 'shortlist'] as const

/** Narrows an untrusted JSON value to a plain (non-array, non-null) object. */
function isShortlistRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value)
}

/**
 * Locates the ranking array inside a parsed AI payload.
 * Accepts a bare array, one of the known wrapper keys, or — as a last resort —
 * the first property holding an array with at least one object in it.
 * Returns null when no such array exists.
 */
function extractShortlistItems(payload: unknown): unknown[] | null {
  if (Array.isArray(payload)) return payload
  if (!isShortlistRecord(payload)) return null

  for (const key of SHORTLIST_WRAPPER_KEYS) {
    const candidate = payload[key]
    if (Array.isArray(candidate)) return candidate
  }

  const fallback = Object.values(payload).find(
    (value): value is unknown[] => Array.isArray(value) && value.some(isShortlistRecord),
  )
  return fallback ?? null
}

/** Coerces an untrusted JSON value to a finite number, or returns `fallback`. */
function coerceShortlistNumber(value: unknown, fallback: number): number {
  const numeric = typeof value === 'string' && value.trim() !== '' ? Number(value) : value
  return typeof numeric === 'number' && Number.isFinite(numeric) ? numeric : fallback
}

/** Coerces an untrusted JSON value to a list of strings (non-strings are dropped). */
function coerceShortlistStrings(value: unknown): string[] {
  return Array.isArray(value)
    ? value.filter((entry): entry is string => typeof entry === 'string')
    : []
}

/**
 * Produces the ranked shortlist for a single category of a round.
 *
 * Steps: load the category's projects assigned in the round, aggregate their
 * submitted evaluations, anonymize everything, ask the model for a ranking,
 * then validate and de-anonymize the answer.
 *
 * @param params.roundId - Round whose assignments/evaluations are considered.
 * @param params.category - Competition category to rank.
 * @param params.topN - Maximum number of recommendations to return.
 * @param params.rubric - Optional custom rubric forwarded to the system prompt.
 * @param params.aiParseFiles - When true, file contents are extracted and
 *   attached to the anonymized projects.
 * @param prisma - Prisma client used for all reads.
 * @returns Recommendations sorted by rank, the total tokens consumed by every
 *   completion request (retries included), and any non-fatal error messages.
 *   Errors thrown by Prisma or the OpenAI client propagate to the caller.
 */
async function generateCategoryShortlist(
  params: {
    roundId: string
    category: string
    topN: number
    rubric?: string
    aiParseFiles: boolean
  },
  prisma: PrismaClient,
): Promise<{ recommendations: ShortlistRecommendation[]; tokensUsed: number; errors: string[] }> {
  const { roundId, category, topN, rubric, aiParseFiles } = params

  // Load projects with evaluations for this category
  const projects = await prisma.project.findMany({
    where: {
      competitionCategory: category as CompetitionCategory,
      assignments: { some: { roundId } },
    },
    include: {
      assignments: {
        where: { roundId },
        include: { evaluation: true },
      },
      projectTags: { include: { tag: true } },
      files: {
        select: {
          id: true,
          fileName: true,
          fileType: true,
          mimeType: true,
          size: true,
          pageCount: true,
          objectKey: true,
          roundId: true,
          createdAt: true,
        },
        orderBy: { createdAt: 'desc' as const },
      },
      teamMembers: { select: { user: { select: { name: true } } } },
    },
  })

  if (projects.length === 0) {
    return {
      recommendations: [],
      tokensUsed: 0,
      errors: [`No ${category} projects found for this round`],
    }
  }

  // Get round names for file tagging
  const roundIds = new Set<string>()
  for (const p of projects) {
    for (const f of (p as any).files || []) {
      if (f.roundId) roundIds.add(f.roundId)
    }
  }
  const roundNames = new Map<string, string>()
  if (roundIds.size > 0) {
    const rounds = await prisma.round.findMany({
      where: { id: { in: [...roundIds] } },
      select: { id: true, name: true },
    })
    for (const r of rounds) roundNames.set(r.id, r.name)
  }

  // Optionally extract file contents
  let fileContents: Map<string, string> | undefined
  if (aiParseFiles) {
    const allFiles = projects.flatMap((p: any) =>
      ((p.files || []) as Array<{ id: string; fileName: string; mimeType: string; objectKey: string }>)
    )
    const extractions = await extractMultipleFileContents(allFiles)
    fileContents = new Map()
    for (const e of extractions) {
      if (e.content) fileContents.set(e.fileId, e.content)
    }
  }

  // Aggregate per-project stats and free-text feedback. Sanitize feedback
  // before it enters the prompt — sanitizeText strips email/phone/url/ssn
  // patterns embedded in juror free-text. Without this, juror feedback like
  // "Contact applicant Jane at jane@example.com" leaks PII to OpenAI.
  const aggregatesByProjectId = new Map<
    string,
    { avgScore: number; evaluationCount: number; feedbackSamples: string[] }
  >()
  for (const project of projects as any[]) {
    const evaluations = project.assignments
      .map((a: any) => a.evaluation)
      .filter(Boolean)
      .filter((e: any) => e.status === 'SUBMITTED')

    // Only evaluations that actually carry a global score take part in the
    // average; counting a missing score as 0 would drag the average down.
    const scores = evaluations
      .map((e: any) => e.globalScore)
      .filter((s: unknown) => s != null)
    const avgScore = scores.length > 0
      ? scores.reduce((sum: number, s: number) => sum + s, 0) / scores.length
      : 0

    const feedbackSamples = evaluations
      .map((e: any) => e.feedbackGeneral || e.feedbackText)
      .filter((t: unknown): t is string => typeof t === 'string' && t.length > 0)
      .slice(0, 3)
      .map((t: string) => sanitizeText(t).slice(0, 1000))

    aggregatesByProjectId.set(project.id, {
      avgScore,
      evaluationCount: evaluations.length,
      feedbackSamples,
    })
  }

  // Route every project through the shared anonymization helpers before any
  // project text reaches the prompt. (What exactly they strip is defined in
  // ./anonymization, not here.)
  const projectsWithRelations = (projects as any[]).map((p) =>
    toProjectWithRelations({
      id: p.id,
      title: p.title,
      description: p.description,
      competitionCategory: p.competitionCategory,
      oceanIssue: p.oceanIssue ?? null,
      country: p.country ?? null,
      geographicZone: p.geographicZone ?? null,
      institution: p.institution ?? null,
      tags: (p.projectTags ?? []).map((pt: any) => pt.tag.name),
      foundedAt: p.foundedAt ?? null,
      wantsMentorship: p.wantsMentorship ?? false,
      submissionSource: p.submissionSource ?? 'MANUAL',
      submittedAt: p.submittedAt ?? null,
      _count: { teamMembers: p.teamMembers?.length ?? 0, files: p.files?.length ?? 0 },
      files: (p.files ?? []).map((f: any) => ({
        fileType: f.fileType ?? null,
        size: f.size,
        pageCount: f.pageCount,
        roundName: f.roundId ? roundNames.get(f.roundId) : undefined,
        isCurrentRound: f.roundId === roundId,
        textContent: fileContents?.get(f.id),
      })),
    }),
  )

  const { anonymized: anonymizedBase, mappings } = anonymizeProjectsForAI(
    projectsWithRelations,
    'FILTERING',
  )

  if (!validateAnonymizedProjects(anonymizedBase)) {
    console.error('[AI Shortlist] Anonymization validation failed')
    return {
      recommendations: [],
      tokensUsed: 0,
      errors: ['GDPR compliance check failed: PII detected in anonymized data'],
    }
  }

  // Merge the anonymized base with per-project aggregates (matched by mapping
  // order) and record anonymousId -> real id for de-anonymization.
  const idMap = new Map<string, string>()
  const anonymized = anonymizedBase.map((p, index) => {
    const anonymousId = `PROJECT_${String(index + 1).padStart(3, '0')}`
    const realId = mappings[index].realId
    idMap.set(anonymousId, realId)

    const agg = aggregatesByProjectId.get(realId) ?? {
      avgScore: 0,
      evaluationCount: 0,
      feedbackSamples: [],
    }
    return {
      anonymousId,
      ...p,
      // Dropped by JSON.stringify, so only anonymousId identifies the project.
      project_id: undefined,
      avgScore: agg.avgScore,
      evaluationCount: agg.evaluationCount,
      feedbackSamples: agg.feedbackSamples,
    }
  })

  // Call AI — check the client first so the model lookup is skipped when
  // no client is configured.
  const openai = await getOpenAI()
  if (!openai) {
    return { recommendations: [], tokensUsed: 0, errors: ['OpenAI client not configured'] }
  }
  const model = await getConfiguredModel()

  const systemPrompt = buildShortlistPrompt(category, topN, rubric)
  const userPrompt = `Analyze these anonymized ${category} project evaluations and produce a ranked shortlist of the top ${topN}.

Projects (${anonymized.length} total):
${JSON.stringify(anonymized, null, 2)}

Return a JSON array following the format specified. Only include the top ${topN} projects. Rank by overall quality within this category.`
  const retryPrompt = userPrompt + '\n\nIMPORTANT: Please ensure valid JSON output.'

  // Every completion request — the initial one and each retry — is counted
  // and logged, so usage records match what was actually sent.
  let tokensUsed = 0
  const requestCompletion = async (userContent: string) => {
    const completion = await openai.chat.completions.create(
      buildCompletionParams(model, {
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: userContent },
        ],
        temperature: 0.1,
        jsonMode: true,
      }),
    )
    const usage = extractTokenUsage(completion)
    tokensUsed += usage.totalTokens
    await logAIUsage({
      action: 'SHORTLIST',
      model,
      promptTokens: usage.promptTokens,
      completionTokens: usage.completionTokens,
      totalTokens: usage.totalTokens,
      status: 'SUCCESS',
    })
    return completion
  }

  // Request + parse, re-asking up to MAX_PARSE_RETRIES times on invalid JSON.
  const MAX_PARSE_RETRIES = 2
  let payload: unknown = undefined
  for (let attempt = 0; ; attempt++) {
    const completion = await requestCompletion(attempt === 0 ? userPrompt : retryPrompt)
    const content = completion.choices[0]?.message?.content
    if (!content) {
      return { recommendations: [], tokensUsed, errors: ['Empty AI response'] }
    }
    try {
      payload = JSON.parse(content)
      break
    } catch {
      if (attempt >= MAX_PARSE_RETRIES) {
        return { recommendations: [], tokensUsed, errors: ['Failed to parse AI response'] }
      }
    }
  }

  // The payload is untrusted: make sure a ranking array is really there
  // instead of assuming its shape (a non-array here used to crash the run).
  const items = extractShortlistItems(payload)
  if (!items) {
    return {
      recommendations: [],
      tokensUsed,
      errors: ['AI response did not contain a ranking array'],
    }
  }

  // De-anonymize, validating each field as it is copied over.
  const ranked: ShortlistRecommendation[] = []
  for (const item of items) {
    if (!isShortlistRecord(item)) continue
    const projectId = typeof item.anonymousId === 'string' ? idMap.get(item.anonymousId) : undefined
    if (!projectId) continue
    ranked.push({
      projectId,
      rank: coerceShortlistNumber(item.rank, 0),
      score: coerceShortlistNumber(item.score, 0),
      category,
      strengths: coerceShortlistStrings(item.strengths),
      concerns: coerceShortlistStrings(item.concerns),
      recommendation: typeof item.recommendation === 'string' ? item.recommendation : '',
    })
  }
  ranked.sort((a, b) => a.rank - b.rank)

  // A project listed twice keeps only its best-ranked entry.
  const seenProjectIds = new Set<string>()
  const unique = ranked.filter((rec) => {
    if (seenProjectIds.has(rec.projectId)) return false
    seenProjectIds.add(rec.projectId)
    return true
  })

  // Enforce the requested size even if the model returned more entries.
  const recommendations = topN > 0 ? unique.slice(0, topN) : unique

  return { recommendations, tokensUsed, errors: [] }
}

// ─── Main Function ──────────────────────────────────────────────────────────

/**
 * Generate an AI shortlist for projects in a round, split by category.
 *
 * Each category is processed independently and in parallel. A category whose
 * processing throws is reported through `errors` and leaves its list empty;
 * it does not discard the results of the other category.
 *
 * @param params.roundId - Round to generate the shortlist for.
 * @param params.competitionId - Accepted for interface compatibility; not read here.
 * @param params.category - When provided, only this category is processed.
 *   Must be `'STARTUP'` or `'BUSINESS_CONCEPT'`.
 * @param params.topN - Shortlist size used when no per-category size is given (default 10).
 * @param params.startupTopN - Shortlist size for STARTUP.
 * @param params.conceptTopN - Shortlist size for BUSINESS_CONCEPT.
 * @param params.rubric - Optional custom rubric forwarded to the prompt.
 * @param params.aiParseFiles - Whether file contents are extracted for the AI (default false).
 * @param prisma - Prisma client used for all reads.
 * @returns `success: false` with empty recommendations when the category is
 *   unsupported or every processed category threw; otherwise `success: true`
 *   with whatever recommendations and error messages were collected.
 */
export async function generateShortlist(
  params: {
    roundId: string
    competitionId: string
    category?: string // If provided, only run for this category
    topN?: number // Global fallback
    startupTopN?: number // Per-category override
    conceptTopN?: number // Per-category override
    rubric?: string
    aiParseFiles?: boolean
  },
  prisma: PrismaClient,
): Promise<ShortlistResult> {
  const {
    roundId,
    category,
    topN = 10,
    startupTopN,
    conceptTopN,
    rubric,
    aiParseFiles = false,
  } = params

  type ShortlistCategory = keyof CategoryRecommendations
  const isShortlistCategory = (value: string): value is ShortlistCategory =>
    value === 'STARTUP' || value === 'BUSINESS_CONCEPT'
  const emptyRecommendations = (): CategoryRecommendations => ({
    STARTUP: [],
    BUSINESS_CONCEPT: [],
  })

  // Resolve which categories to run. An unknown category is rejected up front
  // rather than being treated (and filed) as BUSINESS_CONCEPT.
  let categories: ShortlistCategory[]
  if (!category) {
    categories = ['STARTUP', 'BUSINESS_CONCEPT']
  } else if (isShortlistCategory(category)) {
    categories = [category]
  } else {
    return {
      success: false,
      recommendations: emptyRecommendations(),
      errors: [`Unsupported shortlist category: ${category}`],
    }
  }

  // Run categories in parallel. Failures are caught per category so that one
  // rejected run cannot throw away the other category's finished ranking.
  const outcomes = await Promise.all(
    categories.map(async (cat) => {
      const catTopN = cat === 'STARTUP'
        ? (startupTopN ?? topN)
        : (conceptTopN ?? topN)

      console.log(`[AI Shortlist] Generating top-${catTopN} for ${cat}`)

      try {
        const result = await generateCategoryShortlist(
          { roundId, category: cat, topN: catTopN, rubric, aiParseFiles },
          prisma,
        )
        return { ok: true as const, cat, result }
      } catch (error) {
        const classification = classifyAIError(error)
        logAIError('ai-shortlist', 'generateShortlist', classification)
        console.error('[AIShortlist] generateShortlist failed:', error)
        return {
          ok: false as const,
          cat,
          message: error instanceof Error ? error.message : 'AI shortlist generation failed',
        }
      }
    }),
  )

  const allRecommendations = emptyRecommendations()
  const allErrors: string[] = []
  let totalTokens = 0
  let failedCategories = 0

  for (const outcome of outcomes) {
    if (!outcome.ok) {
      failedCategories++
      allErrors.push(outcome.message)
      continue
    }
    allRecommendations[outcome.cat] = outcome.result.recommendations
    totalTokens += outcome.result.tokensUsed
    allErrors.push(...outcome.result.errors)
  }

  // Nothing usable was produced: report a hard failure, as before.
  if (failedCategories === categories.length) {
    return {
      success: false,
      recommendations: emptyRecommendations(),
      errors: allErrors,
    }
  }

  return {
    success: true,
    recommendations: allRecommendations,
    tokensUsed: totalTokens,
    errors: allErrors.length > 0 ? allErrors : undefined,
  }
}