feat: surface juror-balanced scores and AI calibration advisory

Adds a shared juror-balancing utility (z-score normalization per juror, rescaled back onto the raw 1-10 scale) and wires it into:

- Admin reports page: the Top-10 project table now shows "Raw Avg" and "Balanced" columns side by side, and the summary stats row shows a balanced-average tile. Sort defaults to balanced so harsh and lenient graders no longer skew the ranking.
- Ranking dashboard: each project row shows a green/amber balanced-score chip next to the raw average when the two differ by ≥0.05, making it obvious when juror calibration moved a project's effective ranking.

Also adds the AI Juror Calibration Advisory — a mutation that takes anonymized per-juror stats, calls OpenAI, and produces a plain-language explanation of the cohort's grading patterns plus per-juror severity (normal / notable / outlier) with a one-sentence narrative. The advisory describes the statistical balance that already runs; it does not introduce a new weighting layer. It is rendered as a panel in the Juror Consistency tab when a specific round is selected.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-24 16:19:00 +02:00
parent 07dd7a0692
commit 982d5193c5
7 changed files with 774 additions and 65 deletions
--- a/src/server/routers/analytics.ts
+++ b/src/server/routers/analytics.ts
@@ -1,11 +1,13 @@
 import { z } from 'zod'
-import { router, observerProcedure } from '../trpc'
+import { router, observerProcedure, adminProcedure } from '../trpc'
 import { normalizeCountryToCode } from '@/lib/countries'
 import { getUserAvatarUrl } from '../utils/avatar-url'
 import { getProjectLogoUrl } from '../utils/project-logo-url'
 import { aggregateVotes } from '../services/deliberation'
 import { validateRoundConfig } from '@/types/competition-configs'
 import type { LiveFinalConfig } from '@/types/competition-configs'
+import { computeBalanceContext, computeBalancedProjectScores, type ScorePoint } from '../services/juror-balance'
+import { generateJurorCalibration } from '../services/ai-juror-calibration'

 const editionOrRoundInput = z.object({
  roundId: z.string().optional(),
@@ -185,73 +187,70 @@ export const analyticsRouter = router({
    }),

  /**
-   * Get project rankings with average scores
+   * Get project rankings with raw and balanced (juror-normalized) average scores.
+   *
+   * `averageScore` is the raw mean of per-evaluation criterion averages.
+   * `balancedScore` rescales each juror's contributions via z-score (relative
+   * to their own mean + stddev across projects they reviewed in this round),
+   * then maps back onto the same 1-10 scale using the overall mean + stddev.
+   * A harsh juror's scores are pulled up, a lenient juror's pulled down, so
+   * rankings aren't skewed by a single outlier grader.
   */
  getProjectRankings: observerProcedure
    .input(editionOrRoundInput.and(z.object({ limit: z.number().optional() })))
    .query(async ({ ctx, input }) => {
-      const projects = await ctx.prisma.project.findMany({
-        where: projectWhere(input),
-        select: {
-          id: true,
-          title: true,
-          teamName: true,
-          status: true,
-          assignments: {
-            where: assignmentWhere(input),
-            select: {
-              evaluation: {
-                select: { criterionScoresJson: true, status: true },
-              },
-            },
+      const [projects, evaluations] = await Promise.all([
+        ctx.prisma.project.findMany({
+          where: projectWhere(input),
+          select: {
+            id: true,
+            title: true,
+            teamName: true,
+            status: true,
          },
-        },
-      })
+        }),
+        ctx.prisma.evaluation.findMany({
+          where: evalWhere(input, { status: 'SUBMITTED' }),
+          select: {
+            criterionScoresJson: true,
+            assignment: { select: { userId: true, projectId: true } },
+          },
+        }),
+      ])
+
+      // Extract a single eval-level score (mean of numeric criterion scores) per evaluation.
+      const points: ScorePoint[] = []
+      for (const e of evaluations) {
+        const scores = e.criterionScoresJson as Record<string, unknown> | null
+        if (!scores) continue
+        const vals = Object.values(scores).filter((s): s is number => typeof s === 'number')
+        if (vals.length === 0) continue
+        const rawScore = vals.reduce((a, b) => a + b, 0) / vals.length
+        points.push({ projectId: e.assignment.projectId, userId: e.assignment.userId, rawScore })
+      }
+
+      const balanceCtx = computeBalanceContext(points)
+      const balancedByProject = computeBalancedProjectScores(points, balanceCtx)

-      // Calculate average scores
      const rankings = projects
        .map((project) => {
-          const allScores: number[] = []
-
-          project.assignments.forEach((assignment) => {
-            const evaluation = assignment.evaluation
-            if (evaluation?.status === 'SUBMITTED') {
-              const scores = evaluation.criterionScoresJson as Record<
-                string,
-                number
-              > | null
-              if (scores) {
-                const scoreValues = Object.values(scores).filter(
-                  (s): s is number => typeof s === 'number'
-                )
-                if (scoreValues.length > 0) {
-                  const average =
-                    scoreValues.reduce((a, b) => a + b, 0) / scoreValues.length
-                  allScores.push(average)
-                }
-              }
-            }
-          })
-
-          const averageScore =
-            allScores.length > 0
-              ? allScores.reduce((a, b) => a + b, 0) / allScores.length
-              : null
-
+          const result = balancedByProject.get(project.id)
          return {
            id: project.id,
            title: project.title,
            teamName: project.teamName,
            status: project.status,
-            averageScore,
-            evaluationCount: allScores.length,
+            averageScore: result?.rawAverage ?? null,
+            balancedScore: result?.balancedAverage ?? null,
+            evaluationCount: result?.count ?? 0,
          }
        })
        .sort((a, b) => {
-          // Evaluated projects first (sorted by score desc), unevaluated at bottom
-          if (a.averageScore !== null && b.averageScore !== null) return b.averageScore - a.averageScore
-          if (a.averageScore !== null) return -1
-          if (b.averageScore !== null) return 1
+          const aScore = a.balancedScore ?? a.averageScore
+          const bScore = b.balancedScore ?? b.averageScore
+          if (aScore !== null && bScore !== null) return bScore - aScore
+          if (aScore !== null) return -1
+          if (bScore !== null) return 1
          return 0
        })

@@ -2345,4 +2344,19 @@ export const analyticsRouter = router({
        standings,
      }
    }),
+
+  /**
+   * AI-powered juror calibration analysis for an evaluation round.
+   * Produces a plain-language explanation of the per-juror z-score balance
+   * already applied in ranking — describes, does not prescribe.
+   *
+   * Input: `roundId` of the round to analyze.
+   * Registered as a mutation on `adminProcedure` (presumably admin-only —
+   * confirm against the procedure definition in ../trpc).
+   * Delegates to the `generateJurorCalibration` service, passing the caller's
+   * user id (the service records it on its AI usage log entries) and the
+   * request's Prisma client, and returns the service result unchanged.
+   */
+  generateJurorCalibration: adminProcedure
+    .input(z.object({ roundId: z.string() }))
+    .mutation(async ({ ctx, input }) => {
+      return generateJurorCalibration({
+        roundId: input.roundId,
+        userId: ctx.user.id,
+        prisma: ctx.prisma,
+      })
+    }),
 })
--- a/src/server/routers/ranking.ts
+++ b/src/server/routers/ranking.ts
@@ -12,6 +12,7 @@ import {
 } from '../services/ai-ranking'
 import { logAudit } from '../utils/audit'
 import type { EvaluationConfig } from '@/types/competition-configs'
+import { computeBalanceContext, computeBalancedProjectScores, type ScorePoint } from '../services/juror-balance'

 // ─── Local Types ───────────────────────────────────────────────────────────────

@@ -471,6 +472,7 @@ export const rankingRouter = router({
          evaluation: { status: 'SUBMITTED' },
        },
        select: {
+          userId: true,
          projectId: true,
          user: { select: { name: true, email: true } },
          evaluation: {
@@ -489,6 +491,8 @@ export const rankingRouter = router({
        decision: boolean | null
      }>> = {}

+      const balancePoints: ScorePoint[] = []
+
      for (const a of assignments) {
        if (!a.evaluation) continue
        const list = byProject[a.projectId] ?? []
@@ -511,8 +515,28 @@ export const rankingRouter = router({
          decision,
        })
        byProject[a.projectId] = list
+
+        if (a.evaluation.globalScore != null) {
+          balancePoints.push({
+            projectId: a.projectId,
+            userId: a.userId,
+            rawScore: a.evaluation.globalScore,
+          })
+        }
      }

-      return byProject
+      const balanceCtx = computeBalanceContext(balancePoints)
+      const balancedByProject = computeBalancedProjectScores(balancePoints, balanceCtx)
+
+      // Per-project balanced average on the 1-10 scale, comparable to raw avgs.
+      const balanced: Record<string, { rawAverage: number | null; balancedAverage: number | null }> = {}
+      for (const [projectId, result] of balancedByProject.entries()) {
+        balanced[projectId] = {
+          rawAverage: result.rawAverage,
+          balancedAverage: result.balancedAverage,
+        }
+      }
+
+      return { byProject, balanced }
    }),
 })
--- a/src/server/services/ai-juror-calibration.ts
+++ b/src/server/services/ai-juror-calibration.ts
@@ -0,0 +1,355 @@
+/**
+ * AI-Powered Juror Calibration Advisory
+ *
+ * Analyzes per-juror grading statistics for an evaluation round and
+ * produces a human-readable explanation of how each juror's scores compare
+ * to the cohort. Describes the z-score balance that's already applied in
+ * ranking; does NOT introduce a new weighting layer — only explains the
+ * existing math in plain language so admins can justify results to jurors.
+ *
+ * GDPR: Juror identifiers are replaced with Juror-1, Juror-2, ... before any
+ * call to OpenAI. No juror names or emails leave the server; the prompt still
+ * carries the round name and the per-juror statistics.
+ */
+
+import { TRPCError } from '@trpc/server'
+import { getOpenAI, getConfiguredModel, buildCompletionParams, AI_MODELS } from '@/lib/openai'
+import { logAIUsage, extractTokenUsage } from '@/server/utils/ai-usage'
+import { classifyAIError, createParseError, logAIError } from './ai-errors'
+import type { PrismaClient } from '@prisma/client'
+import { computeBalanceContext, type ScorePoint } from './juror-balance'
+
+// ─── Types ──────────────────────────────────────────────────────────────────
+
+export type JurorCalibrationEntry = { // One juror row of the advisory as returned to the caller.
+  userId: string
+  name: string // Display name: user name, else email, else Unknown. Not included in the AI prompt.
+  evaluationCount: number // Number of this juror's evaluations that produced a numeric score.
+  rawMean: number // Juror mean score, rounded to 2 decimals.
+  stddev: number // Juror score stddev, rounded to 2 decimals.
+  deltaFromCohort: number // Juror mean minus cohort mean, rounded to 2 decimals.
+  effectiveInfluence: number | null // Juror stddev / cohort stddev, capped at 2; null when either stddev is 0.
+  severity: 'normal' | 'notable' | 'outlier' // Taken from the AI response, else from the local threshold fallback.
+  summary: string // Taken from the AI response, else a fixed placeholder sentence.
+}
+
+export type JurorCalibrationResult = { // Full advisory returned by generateJurorCalibration.
+  roundId: string
+  roundName: string
+  cohortMean: number // Mean over all score points in the round, rounded to 2 decimals.
+  cohortStddev: number // Stddev over all score points in the round, rounded to 2 decimals.
+  totalEvaluations: number // Count of evaluations that produced a numeric score point.
+  totalJurors: number // Count of distinct jurors among those score points.
+  overallSummary: string // Passed through from the AI response.
+  keyTakeaways: string[] // Passed through from the AI response.
+  jurors: JurorCalibrationEntry[] // Sorted by absolute delta from the cohort mean, largest first.
+  tokensUsed: number // Total tokens extracted from the completion response.
+  model: string
+  generatedAt: Date // Server time when the result object was built.
+}
+
+type AIResponsePayload = { // Shape the prompt asks the model to return; applied by cast, not validated at runtime.
+  overallSummary: string
+  keyTakeaways: string[]
+  jurors: Array<{
+    jurorId: string // Anonymized id (Juror-N) used to match the narrative back to a real juror.
+    severity: 'normal' | 'notable' | 'outlier'
+    summary: string
+  }>
+}
+
+type InternalJurorRecord = { // Server-side juror stats before rounding; keeps the real identity.
+  userId: string
+  name: string
+  evaluationCount: number
+  rawMean: number // Unrounded juror mean.
+  stddev: number // Unrounded juror stddev.
+  deltaFromCohort: number // Unrounded juror mean minus cohort mean.
+  effectiveInfluence: number | null // Juror stddev / cohort stddev, capped at 2; null when either stddev is 0.
+}
+
+// ─── Main Orchestrator ──────────────────────────────────────────────────────
+/**
+ * Builds the juror calibration advisory for one round.
+ *
+ * Steps, in order: load the round, load its SUBMITTED evaluations, reduce each
+ * evaluation to the mean of its numeric criterion scores, compute per-juror and
+ * cohort statistics, send anonymized per-juror stats to the chat completion API,
+ * then merge the returned narrative back onto the real juror records.
+ *
+ * @param roundId - Round whose submitted evaluations are analyzed.
+ * @param userId - Recorded on the AI usage log entries written here.
+ * @param prisma - Client used for the round and evaluation queries.
+ * @returns Cohort stats, per-juror entries (numbers rounded to 2 decimals), the
+ *   AI summary and takeaways, token usage, model and generation time.
+ * @throws TRPCError NOT_FOUND when the round does not exist.
+ * @throws TRPCError BAD_REQUEST when no evaluation yields a numeric score.
+ * @throws TRPCError PRECONDITION_FAILED when getOpenAI() returns a falsy value.
+ * @throws TRPCError INTERNAL_SERVER_ERROR when the AI call fails or its
+ *   content is not valid JSON.
+ */
+export async function generateJurorCalibration({
+  roundId,
+  userId,
+  prisma,
+}: {
+  roundId: string
+  userId: string
+  prisma: PrismaClient
+}): Promise<JurorCalibrationResult> {
+  const round = await prisma.round.findUnique({
+    where: { id: roundId },
+    select: { id: true, name: true, roundType: true }, // roundType is selected but not read in this function
+  })
+
+  if (!round) {
+    throw new TRPCError({ code: 'NOT_FOUND', message: 'Round not found' })
+  }
+
+  const evaluations = await prisma.evaluation.findMany({
+    where: {
+      status: 'SUBMITTED',
+      assignment: { roundId },
+    },
+    select: {
+      globalScore: true, // selected but not read in this function
+      criterionScoresJson: true,
+      assignment: {
+        select: {
+          userId: true,
+          projectId: true,
+          user: { select: { id: true, name: true, email: true } },
+        },
+      },
+    },
+  })
+
+  // Build (project, juror, score) points using each eval's mean criterion score,
+  // matching how the reports page reports raw + balanced averages per project.
+  const points: ScorePoint[] = []
+  const nameByUserId = new Map<string, string>()
+
+  for (const e of evaluations) {
+    const scores = e.criterionScoresJson as Record<string, unknown> | null
+    if (!scores) continue // no criterion JSON: the evaluation contributes nothing
+    const vals = Object.values(scores).filter((s): s is number => typeof s === 'number')
+    if (vals.length === 0) continue // no numeric criterion values: skipped as well
+    const rawScore = vals.reduce((a, b) => a + b, 0) / vals.length
+    points.push({
+      projectId: e.assignment.projectId,
+      userId: e.assignment.userId,
+      rawScore,
+    })
+    nameByUserId.set(
+      e.assignment.userId,
+      e.assignment.user.name ?? e.assignment.user.email ?? 'Unknown',
+    )
+  }
+
+  if (points.length === 0) {
+    throw new TRPCError({
+      code: 'BAD_REQUEST',
+      message: 'No submitted evaluations with numeric scores in this round',
+    })
+  }
+
+  const ctx = computeBalanceContext(points)
+
+  // Build internal juror stats and a per-juror effective influence ratio
+  // (juror stddev / cohort stddev, capped at 2; null when either stddev is 0).
+  const internalJurors: InternalJurorRecord[] = []
+  for (const [uid, stats] of ctx.jurorStats.entries()) {
+    const effectiveInfluence = ctx.overallStddev > 0 && stats.stddev > 0
+      ? Math.min(2, stats.stddev / ctx.overallStddev)
+      : null
+    internalJurors.push({
+      userId: uid,
+      name: nameByUserId.get(uid) ?? 'Unknown',
+      evaluationCount: stats.count,
+      rawMean: stats.mean,
+      stddev: stats.stddev,
+      deltaFromCohort: stats.mean - ctx.overallMean,
+      effectiveInfluence,
+    })
+  }
+
+  // Sort by absolute delta from cohort desc so largest outliers land first in prompts + UI.
+  internalJurors.sort((a, b) => Math.abs(b.deltaFromCohort) - Math.abs(a.deltaFromCohort))
+
+  // Build anonymized payload for the AI call (ids follow the sorted order; no name or userId is included).
+  const anonymizedMap = new Map<string, string>()
+  const anonymizedJurors = internalJurors.map((j, i) => {
+    const id = `Juror-${i + 1}`
+    anonymizedMap.set(j.userId, id)
+    return {
+      jurorId: id,
+      evaluationCount: j.evaluationCount,
+      rawMean: Math.round(j.rawMean * 100) / 100,
+      stddev: Math.round(j.stddev * 100) / 100,
+      deltaFromCohort: Math.round(j.deltaFromCohort * 100) / 100,
+      effectiveInfluence: j.effectiveInfluence != null
+        ? Math.round(j.effectiveInfluence * 100) / 100
+        : null,
+    }
+  })
+
+  const openai = await getOpenAI()
+  if (!openai) {
+    throw new TRPCError({
+      code: 'PRECONDITION_FAILED',
+      message: 'OpenAI is not configured. Please set up your API key in Settings.',
+    })
+  }
+
+  const model = await getConfiguredModel(AI_MODELS.QUICK)
+
+  const prompt = buildCalibrationPrompt({
+    roundName: round.name,
+    cohortMean: Math.round(ctx.overallMean * 100) / 100,
+    cohortStddev: Math.round(ctx.overallStddev * 100) / 100,
+    totalEvaluations: points.length,
+    jurors: anonymizedJurors,
+  })
+
+  let aiResponse: AIResponsePayload
+  let tokensUsed = 0
+
+  try {
+    const params = buildCompletionParams(model, {
+      messages: [{ role: 'user', content: prompt }],
+      jsonMode: true,
+      temperature: 0.2,
+      maxTokens: 2000,
+    })
+
+    const response = await openai.chat.completions.create(params)
+    tokensUsed = extractTokenUsage(response).totalTokens
+    const content = response.choices[0]?.message?.content
+    if (!content) throw new Error('Empty response from AI') // handled by the outer catch below
+
+    try {
+      aiResponse = JSON.parse(content) as AIResponsePayload // NOTE(review): cast only; the shape is not validated at runtime
+    } catch (parseError) {
+      const err = createParseError((parseError as Error).message)
+      logAIError('JurorCalibration', 'generate', err)
+      throw new TRPCError({
+        code: 'INTERNAL_SERVER_ERROR',
+        message: 'Failed to parse AI response. Please try again.',
+      })
+    }
+  } catch (error) {
+    if (error instanceof TRPCError) throw error // includes the parse failure above, which therefore skips the ERROR usage log
+    const classified = classifyAIError(error)
+    logAIError('JurorCalibration', 'generate', classified)
+    await logAIUsage({
+      userId,
+      action: 'JUROR_CALIBRATION',
+      entityType: 'Round',
+      entityId: roundId,
+      model,
+      promptTokens: 0, // the prompt/completion split is not recorded here; only the total is
+      completionTokens: 0,
+      totalTokens: tokensUsed,
+      itemsProcessed: 0,
+      status: 'ERROR',
+      errorMessage: classified.message,
+    })
+    throw new TRPCError({
+      code: 'INTERNAL_SERVER_ERROR',
+      message: classified.message,
+    })
+  }
+
+  await logAIUsage({
+    userId,
+    action: 'JUROR_CALIBRATION',
+    entityType: 'Round',
+    entityId: roundId,
+    model,
+    promptTokens: 0, // the prompt/completion split is not recorded here; only the total is
+    completionTokens: 0,
+    totalTokens: tokensUsed,
+    itemsProcessed: internalJurors.length,
+    status: 'SUCCESS',
+  })
+
+  // Merge AI narrative back with internal stats by anonymized id.
+  const narrativeById = new Map(aiResponse.jurors.map((j) => [j.jurorId, j])) // NOTE(review): runs outside the try/catch; throws if the response has no jurors array
+  const jurors: JurorCalibrationEntry[] = internalJurors.map((j) => {
+    const anonId = anonymizedMap.get(j.userId)! // always set: the map was filled from this same internalJurors list
+    const narrative = narrativeById.get(anonId)
+    return {
+      userId: j.userId,
+      name: j.name,
+      evaluationCount: j.evaluationCount,
+      rawMean: Math.round(j.rawMean * 100) / 100,
+      stddev: Math.round(j.stddev * 100) / 100,
+      deltaFromCohort: Math.round(j.deltaFromCohort * 100) / 100,
+      effectiveInfluence: j.effectiveInfluence != null
+        ? Math.round(j.effectiveInfluence * 100) / 100
+        : null,
+      severity: narrative?.severity ?? classifySeverity(j, ctx.overallStddev), // local thresholds only when the AI gave none
+      summary: narrative?.summary ?? 'No AI narrative available.',
+    }
+  })
+
+  return {
+    roundId: round.id,
+    roundName: round.name,
+    cohortMean: Math.round(ctx.overallMean * 100) / 100,
+    cohortStddev: Math.round(ctx.overallStddev * 100) / 100,
+    totalEvaluations: points.length,
+    totalJurors: internalJurors.length,
+    overallSummary: aiResponse.overallSummary,
+    keyTakeaways: aiResponse.keyTakeaways,
+    jurors,
+    tokensUsed,
+    model,
+    generatedAt: new Date(),
+  }
+}
+
+// ─── Helpers ────────────────────────────────────────────────────────────────
+/**
+ * Local severity fallback, used when the AI response carries no severity for
+ * a juror. Compares |deltaFromCohort| to the cohort stddev: at least 1.5
+ * stddev is an outlier, at least 0.75 is notable, anything else is normal.
+ * Unlike the prompt guidelines, it does not look at evaluationCount.
+ *
+ * @param juror - Unrounded juror stats; only deltaFromCohort is read.
+ * @param cohortStddev - Stddev over all score points in the round.
+ * @returns The severity bucket for the juror.
+ */
+function classifySeverity(
+  juror: InternalJurorRecord,
+  cohortStddev: number,
+): 'normal' | 'notable' | 'outlier' {
+  if (cohortStddev === 0) return 'normal' // no spread in the cohort: avoid dividing by zero
+  const zDelta = Math.abs(juror.deltaFromCohort) / cohortStddev
+  if (zDelta >= 1.5) return 'outlier'
+  if (zDelta >= 0.75) return 'notable'
+  return 'normal'
+}
+/**
+ * Renders the single user-message prompt for the calibration call.
+ *
+ * The payload values are interpolated as given (the caller rounds them), and the
+ * juror list is embedded as pretty-printed JSON. The prompt restates the
+ * severity thresholds and asks for a JSON object with overallSummary,
+ * keyTakeaways and one jurors entry per input juror.
+ *
+ * @param payload - Round name, cohort stats and anonymized per-juror stats.
+ * @returns The prompt text.
+ */
+function buildCalibrationPrompt(payload: {
+  roundName: string
+  cohortMean: number
+  cohortStddev: number
+  totalEvaluations: number
+  jurors: Array<{
+    jurorId: string
+    evaluationCount: number
+    rawMean: number
+    stddev: number
+    deltaFromCohort: number
+    effectiveInfluence: number | null
+  }>
+}): string {
+  return `You are analyzing juror grading patterns for a competition evaluation round. Your job is to EXPLAIN the statistical normalization that has already been applied; you are NOT introducing a new weighting scheme or prescribing changes.
+
+CONTEXT:
+- Round: "${payload.roundName}"
+- Cohort mean: ${payload.cohortMean} (scale 1-10)
+- Cohort stddev: ${payload.cohortStddev}
+- Total submitted evaluations: ${payload.totalEvaluations}
+
+HOW BALANCING WORKS:
+Each juror's scores are z-score normalized against their own mean and stddev, then rescaled back onto the 1-10 range. A juror who averages 2 points below the cohort won't drag projects down more than their peers; a lenient juror won't inflate projects. "effectiveInfluence" is roughly the juror's stddev divided by the cohort stddev — a value near 1.0 means they spread their scores similarly to the cohort; values well under 1 mean compressed scoring, well over 1 mean wide spread.
+
+JUROR DATA (anonymized, sorted by |deltaFromCohort| desc):
+${JSON.stringify(payload.jurors, null, 2)}
+
+Return a JSON object with this exact shape:
+{
+  "overallSummary": "2-3 sentences summarizing grading dispersion across the cohort — is the panel tightly aligned or widely divergent?",
+  "keyTakeaways": ["up to 4 bullets: notable patterns, risks, what to watch"],
+  "jurors": [
+    {
+      "jurorId": "Juror-N (matching the input)",
+      "severity": "normal" | "notable" | "outlier",
+      "summary": "One short sentence about this juror's grading pattern, referring to their rawMean, deltaFromCohort, and stddev. Example: 'Scored on average 2.1 points below cohort across 8 evaluations — consistently harsh, low internal variance.'"
+    }
+  ]
+}
+
+Guidelines:
+- "outlier" = delta from cohort >= 1.5 cohort-stddev in either direction
+- "notable" = delta from cohort 0.75-1.5 cohort-stddev
+- "normal" = delta from cohort < 0.75 cohort-stddev
+- A juror with very few evaluations (< 3) can't be classified confidently — note this in their summary and prefer "normal".
+- Be factual and specific. Reference the numbers. No speculation about intent.
+- Do not include juror names — only the anonymized jurorId.
+- Include every juror from the input in the jurors array. Order matches input.`
+}
--- a/src/server/services/juror-balance.ts
+++ b/src/server/services/juror-balance.ts
@@ -0,0 +1,120 @@
+/**
+ * Juror balancing: z-score normalization to correct for per-juror grading harshness.
+ *
+ * A juror who grades 1 standard deviation below their peers on shared projects
+ * shouldn't punish those projects more than a juror who grades at the mean.
+ * We compute per-juror mean + stddev across their scores in a round, z-normalize
+ * each score, then rescale back onto the same 1-10 scale using the overall
+ * round-level mean + stddev so the balanced number is directly comparable to
+ * the raw average.
+ */
+
+export type ScorePoint = { // One juror's score for one project.
+  projectId: string
+  userId: string // The juror who gave the score.
+  rawScore: number // Supplied by the caller: a criterion mean in analytics, globalScore in the ranking router.
+}
+
+export type BalancedProjectResult = { // Per-project output of computeBalancedProjectScores.
+  projectId: string
+  rawAverage: number | null // Plain mean of the project's raw scores; computeBalancedProjectScores always sets a number.
+  balancedAverage: number | null // Juror-normalized mean; null when the overall stddev is 0.
+  count: number // Number of score points for the project.
+}
+
+export type JurorBalance = { // Grading statistics for a single juror.
+  userId: string
+  mean: number // Mean of the juror's raw scores.
+  stddev: number // Population stddev of the juror's raw scores; 0 when the juror has a single score.
+  count: number // Number of scores the juror contributed.
+}
+
+export type BalanceContext = { // Statistics produced by computeBalanceContext.
+  overallMean: number // Mean over every score point; 0 for empty input.
+  overallStddev: number // Population stddev over every score point; 0 with fewer than two points.
+  jurorStats: Map<string, JurorBalance> // Keyed by userId.
+}
+
+/**
+ * Build per-juror and overall grading statistics from a flat list of
+ * (project, juror, score) points.
+ *
+ * Standard deviations are population values (variance divided by N). A juror
+ * with a single score gets stddev 0, and an empty input yields overallMean 0
+ * and overallStddev 0 with no juror entries.
+ *
+ * @param points - One entry per scored evaluation.
+ * @returns The overall mean and stddev plus a map of juror stats keyed by userId.
+ */
+export function computeBalanceContext(points: ScorePoint[]): BalanceContext {
+  const jurorScores = new Map<string, number[]>()
+  for (const p of points) {
+    const arr = jurorScores.get(p.userId) ?? []
+    arr.push(p.rawScore)
+    jurorScores.set(p.userId, arr)
+  }
+
+  const jurorStats = new Map<string, JurorBalance>()
+  for (const [userId, scores] of jurorScores.entries()) {
+    const mean = scores.reduce((a, b) => a + b, 0) / scores.length
+    const variance = scores.length > 1
+      ? scores.reduce((s, v) => s + (v - mean) ** 2, 0) / scores.length // population variance: divides by N, not N - 1
+      : 0
+    jurorStats.set(userId, {
+      userId,
+      mean,
+      stddev: Math.sqrt(variance),
+      count: scores.length,
+    })
+  }
+
+  const allScores = points.map((p) => p.rawScore)
+  const overallMean = allScores.length > 0
+    ? allScores.reduce((a, b) => a + b, 0) / allScores.length
+    : 0
+  const overallStddev = allScores.length > 1
+    ? Math.sqrt(
+        allScores.reduce((s, v) => s + (v - overallMean) ** 2, 0) / allScores.length,
+      )
+    : 0
+
+  return { overallMean, overallStddev, jurorStats }
+}
+
+/**
+ * Aggregate per-project raw + balanced averages from score points.
+ *
+ * Each score becomes a z-score against its juror's own mean and stddev; the
+ * project's mean z is then mapped back with overallMean + z * overallStddev.
+ * Jurors whose stddev is 0 (or who are missing from ctx) are z-scored against
+ * the overall mean and stddev instead. balancedAverage is null for every
+ * project when overallStddev is 0, and the result is not clamped to any range.
+ *
+ * @param points - Score points to aggregate; a project with no points gets no entry.
+ * @param ctx - Statistics from computeBalanceContext; the callers in this
+ *   commit build it from the same points.
+ * @returns Map of results keyed by projectId.
+ */
+export function computeBalancedProjectScores(
+  points: ScorePoint[],
+  ctx: BalanceContext,
+): Map<string, BalancedProjectResult> {
+  const byProject = new Map<string, ScorePoint[]>()
+  for (const p of points) {
+    const arr = byProject.get(p.projectId) ?? []
+    arr.push(p)
+    byProject.set(p.projectId, arr)
+  }
+
+  const results = new Map<string, BalancedProjectResult>()
+  for (const [projectId, projectPoints] of byProject.entries()) {
+    const rawAverage = projectPoints.reduce((a, b) => a + b.rawScore, 0) / projectPoints.length
+
+    let balancedAverage: number | null = null
+    if (ctx.overallStddev > 0) {
+      const zValues: number[] = []
+      for (const pt of projectPoints) {
+        const stats = ctx.jurorStats.get(pt.userId)
+        if (stats && stats.stddev > 0) {
+          zValues.push((pt.rawScore - stats.mean) / stats.stddev)
+        } else {
+          zValues.push((pt.rawScore - ctx.overallMean) / ctx.overallStddev) // no usable juror spread: fall back to cohort stats
+        }
+      }
+      const avgZ = zValues.reduce((a, b) => a + b, 0) / zValues.length
+      balancedAverage = ctx.overallMean + avgZ * ctx.overallStddev
+    }
+
+    results.set(projectId, {
+      projectId,
+      rawAverage,
+      balancedAverage,
+      count: projectPoints.length,
+    })
+  }
+
+  return results
+}
--- a/src/server/utils/ai-usage.ts
+++ b/src/server/utils/ai-usage.ts
@@ -21,6 +21,7 @@ export type AIAction =
  | 'ROUTING'
  | 'SHORTLIST'
  | 'RANKING'
+  | 'JUROR_CALIBRATION'

 export type AIStatus = 'SUCCESS' | 'PARTIAL' | 'ERROR'