feat: surface juror-balanced scores and AI calibration advisory

Adds a shared juror-balancing utility (z-score normalization per juror, rescaled back onto the raw 1-10 scale) and wires it into:

- Admin reports page: Top-10 project table now shows "Raw Avg" and "Balanced" columns side by side, and the summary stats row shows a balanced-average tile. Sort defaults to balanced so harsh and lenient graders no longer skew the ranking.
- Ranking dashboard: each project row shows a green/amber balanced-score chip next to the raw average when the two differ by ≥0.05, making it obvious when juror calibration moved a project's effective ranking.

Also adds AI Juror Calibration Advisory — a mutation that takes anonymized per-juror stats, calls OpenAI, and produces a plain-language explanation of the cohort's grading patterns plus per-juror severity (normal / notable / outlier) with a one-sentence narrative. The advisory describes the statistical balance that already runs; it does not introduce a new weighting layer. Rendered as a panel in the Juror Consistency tab when a specific round is selected.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-24 16:19:00 +02:00
parent 07dd7a0692
commit 982d5193c5
7 changed files with 774 additions and 65 deletions
--- a/src/app/(admin)/admin/reports/page.tsx
+++ b/src/app/(admin)/admin/reports/page.tsx
@@ -45,7 +45,11 @@ import {
  Trophy,
  ArrowRight,
  Hash,
  Sparkles,
  Loader2,
  AlertTriangle,
 } from 'lucide-react'
 import { toast } from 'sonner'
 import { formatDateOnly } from '@/lib/utils'
 import {
  ScoreDistributionChart,
@@ -271,6 +275,12 @@ function ReportsOverview() {
                const evaluated = projectRankings.filter(p => p.averageScore !== null)
                const scores = evaluated.map(p => p.averageScore as number)
                const avgScore = scores.length ? scores.reduce((a, b) => a + b, 0) / scores.length : 0
                const balancedScores = projectRankings
                  .map(p => p.balancedScore)
                  .filter((s): s is number => s != null)
                const avgBalanced = balancedScores.length
                  ? balancedScores.reduce((a, b) => a + b, 0) / balancedScores.length
                  : null
                const minScore = scores.length ? Math.min(...scores) : 0
                const maxScore = scores.length ? Math.max(...scores) : 0
                const evalPercent = projectRankings.length ? Math.round((evaluated.length / projectRankings.length) * 100) : 0
@@ -281,14 +291,28 @@ function ReportsOverview() {
                return (
                  <>
-                    <div className="grid grid-cols-2 gap-3 sm:grid-cols-4">
+                    <div className="grid grid-cols-2 gap-3 sm:grid-cols-5">
                      <div className="rounded-lg border p-3 text-center">
                        <p className="text-xs text-muted-foreground">Total Projects</p>
                        <p className="text-xl font-bold tabular-nums">{projectRankings.length}</p>
                      </div>
-                      <div className="rounded-lg border p-3 text-center">
+                      <div
-                        <p className="text-xs text-muted-foreground">Avg Score</p>
+                        className="rounded-lg border p-3 text-center"
-                        <p className="text-xl font-bold tabular-nums">{avgScore ? avgScore.toFixed(1) : '-'}</p>
+                        title="Unweighted mean of all submitted juror scores"
                      >
                        <p className="text-xs text-muted-foreground">Raw Avg</p>
                        <p className="text-xl font-bold tabular-nums text-muted-foreground">
                          {avgScore ? avgScore.toFixed(1) : '-'}
                        </p>
                      </div>
                      <div
                        className="rounded-lg border p-3 text-center"
                        title="Juror-balanced average: per-juror z-score normalization rescaled to the 1–10 range"
                      >
                        <p className="text-xs text-muted-foreground">Balanced Avg</p>
                        <p className="text-xl font-bold tabular-nums">
                          {avgBalanced == null ? '-' : avgBalanced.toFixed(1)}
                        </p>
                      </div>
                      <div className="rounded-lg border p-3 text-center">
                        <p className="text-xs text-muted-foreground">Evaluated</p>
@@ -319,7 +343,7 @@ function ReportsOverview() {
                    {/* Top 10 ranked table */}
                    <div>
                      <p className="text-sm font-medium text-muted-foreground mb-2 flex items-center gap-1.5">
-                        <Trophy className="h-3.5 w-3.5" /> Top 10 by Average Score
+                        <Trophy className="h-3.5 w-3.5" /> Top 10 by Balanced Score
                      </p>
                      <div className="rounded-lg border">
                        <Table>
@@ -328,7 +352,18 @@ function ReportsOverview() {
                              <TableHead className="w-10">#</TableHead>
                              <TableHead>Project</TableHead>
                              <TableHead className="hidden sm:table-cell">Team</TableHead>
-                              <TableHead className="text-right">Avg</TableHead>
+                              <TableHead
                                className="text-right"
                                title="Raw average of juror scores — uncorrected for per-juror harshness"
                              >
                                Raw Avg
                              </TableHead>
                              <TableHead
                                className="text-right"
                                title="Juror-balanced average: each juror's contribution is z-score normalized against their own grading distribution, then rescaled to the 1–10 range. Harsh and lenient jurors contribute on equal footing."
                              >
                                Balanced
                              </TableHead>
                              <TableHead className="text-right">Evals</TableHead>
                              <TableHead>Status</TableHead>
                            </TableRow>
@@ -345,9 +380,12 @@ function ReportsOverview() {
                                <TableCell className="hidden sm:table-cell text-muted-foreground">
                                  {p.teamName || '-'}
                                </TableCell>
-                                <TableCell className="text-right tabular-nums">
+                                <TableCell className="text-right tabular-nums text-muted-foreground">
                                  {p.averageScore === null ? '-' : p.averageScore.toFixed(2)}
                                </TableCell>
                                <TableCell className="text-right tabular-nums font-semibold">
                                  {p.balancedScore == null ? '-' : p.balancedScore.toFixed(2)}
                                </TableCell>
                                <TableCell className="text-right tabular-nums">{p.evaluationCount}</TableCell>
                                <TableCell>
                                  <Badge variant="outline">{formatStatusLabel(p.status)}</Badge>
@@ -870,10 +908,150 @@ function JurorConsistencyTab() {
          }}
        />
      )}
      {queryInput.roundId && (
        <JurorCalibrationPanel roundId={queryInput.roundId} />
      )}
    </div>
  )
 }
 /**
  * Card that triggers the `analytics.generateJurorCalibration` mutation for the
  * given round and renders its result: four cohort stat tiles, the overall
  * summary with key takeaways, a per-juror table, and a generation footer.
  * Nothing runs until the button is pressed; mutation errors surface as a toast.
  */
 function JurorCalibrationPanel({ roundId }: { roundId: string }) {
   const calibration = trpc.analytics.generateJurorCalibration.useMutation({
     onError: (err) => toast.error(`Calibration analysis failed: ${err.message}`),
   })
   const advisory = calibration.data
   const isRunning = calibration.isPending
   // Badge classes keyed by the severity label carried on each juror row.
   const severityStyle: Record<string, string> = {
     normal: 'bg-muted text-muted-foreground',
     notable: 'bg-amber-50 text-amber-700 border-amber-200',
     outlier: 'bg-red-50 text-red-700 border-red-200',
   }
   // Colour the delta only once it is more than half a point away from the cohort mean.
   const deltaTone = (delta: number): string => {
     if (delta < -0.5) return 'text-red-600'
     if (delta > 0.5) return 'text-emerald-600'
     return ''
   }
   let buttonLabel = 'Analyze jurors'
   if (isRunning) {
     buttonLabel = 'Analyzing…'
   } else if (advisory) {
     buttonLabel = 'Regenerate'
   }
   const buttonIcon = isRunning
     ? <Loader2 className="h-4 w-4 animate-spin" />
     : <Sparkles className="h-4 w-4" />
   // Tiles for the cohort summary row, in display order.
   const cohortTiles: Array<{ label: string; value: string | number }> = advisory
     ? [
         { label: 'Cohort Mean', value: advisory.cohortMean.toFixed(2) },
         { label: 'Cohort Stddev', value: advisory.cohortStddev.toFixed(2) },
         { label: 'Evaluations', value: advisory.totalEvaluations },
         { label: 'Jurors', value: advisory.totalJurors },
       ]
     : []
   return (
     <Card>
       <CardHeader>
         <div className="flex flex-wrap items-start justify-between gap-3">
           <div>
             <CardTitle className="flex items-center gap-2">
               <Sparkles className="h-5 w-5 text-[#de0f1e]" />
               AI Juror Calibration Advisory
             </CardTitle>
             <CardDescription>
               Plain-language explanation of the per-juror score balancing already applied to rankings.
               Describes, does not prescribe — the math runs regardless.
             </CardDescription>
           </div>
           <Button
             onClick={() => calibration.mutate({ roundId })}
             disabled={isRunning}
             className="gap-2"
           >
             {buttonIcon}
             {buttonLabel}
           </Button>
         </div>
       </CardHeader>
       <CardContent className="space-y-5">
         {!(advisory || isRunning) && (
           <p className="text-sm text-muted-foreground">
             Run the analysis to see per-juror grading patterns, cohort stats, and the calibration
             narrative for the selected round.
           </p>
         )}
         {advisory && (
           <>
             <div className="grid grid-cols-2 gap-3 sm:grid-cols-4">
               {cohortTiles.map((tile) => (
                 <div key={tile.label} className="rounded-lg border p-3 text-center">
                   <p className="text-xs text-muted-foreground">{tile.label}</p>
                   <p className="text-xl font-bold tabular-nums">{tile.value}</p>
                 </div>
               ))}
             </div>
             <div className="rounded-lg border bg-muted/30 p-4">
               <p className="text-sm leading-relaxed">{advisory.overallSummary}</p>
               {advisory.keyTakeaways.length > 0 && (
                 <ul className="mt-3 space-y-1.5 text-sm">
                   {advisory.keyTakeaways.map((takeaway, idx) => (
                     <li key={idx} className="flex items-start gap-2">
                       <ArrowRight className="mt-1 h-3.5 w-3.5 flex-shrink-0 text-muted-foreground" />
                       <span>{takeaway}</span>
                     </li>
                   ))}
                 </ul>
               )}
             </div>
             <div className="rounded-lg border">
               <Table>
                 <TableHeader>
                   <TableRow>
                     <TableHead>Juror</TableHead>
                     <TableHead className="text-right">Evals</TableHead>
                     <TableHead className="text-right">Mean</TableHead>
                     <TableHead className="text-right">Δ Cohort</TableHead>
                     <TableHead className="text-right" title="Juror's stddev / cohort stddev">
                       Influence
                     </TableHead>
                     <TableHead>Severity</TableHead>
                     <TableHead>Notes</TableHead>
                   </TableRow>
                 </TableHeader>
                 <TableBody>
                   {advisory.jurors.map((juror) => (
                     <TableRow key={juror.userId}>
                       <TableCell className="font-medium">{juror.name}</TableCell>
                       <TableCell className="text-right tabular-nums">{juror.evaluationCount}</TableCell>
                       <TableCell className="text-right tabular-nums">{juror.rawMean.toFixed(2)}</TableCell>
                       <TableCell className={`text-right tabular-nums ${deltaTone(juror.deltaFromCohort)}`}>
                         {juror.deltaFromCohort > 0 ? '+' : ''}
                         {juror.deltaFromCohort.toFixed(2)}
                       </TableCell>
                       <TableCell className="text-right tabular-nums">
                         {juror.effectiveInfluence == null ? '-' : juror.effectiveInfluence.toFixed(2)}
                       </TableCell>
                       <TableCell>
                         <Badge variant="outline" className={severityStyle[juror.severity]}>
                           {juror.severity === 'outlier' && <AlertTriangle className="mr-1 h-3 w-3" />}
                           {juror.severity}
                         </Badge>
                       </TableCell>
                       <TableCell className="max-w-md text-sm text-muted-foreground">
                         {juror.summary}
                       </TableCell>
                     </TableRow>
                   ))}
                 </TableBody>
               </Table>
             </div>
             <p className="text-xs text-muted-foreground">
               Generated {advisory.generatedAt.toLocaleString()} · {advisory.tokensUsed} tokens · model {advisory.model}
             </p>
           </>
         )}
       </CardContent>
     </Card>
   )
 }
 function DiversityTab() {
  const [selectedValue, setSelectedValue] = useState<string | null>(null)
--- a/src/components/admin/round/ranking-dashboard.tsx
+++ b/src/components/admin/round/ranking-dashboard.tsx
@@ -82,6 +82,7 @@ type SortableProjectRowProps = {
  entry: (RankedProjectEntry & { originalIndex?: number }) | undefined
  projectInfo: ProjectInfo | undefined
  jurorScores: JurorScore[] | undefined
  balancedScore: number | null
  onSelect: () => void
  isSelected: boolean
  originalRank: number | undefined // from snapshotOrder — always in sync with localOrder
@@ -95,6 +96,7 @@ function SortableProjectRow({
  entry,
  projectInfo,
  jurorScores,
  balancedScore,
  onSelect,
  isSelected,
  originalRank,
@@ -199,11 +201,25 @@ function SortableProjectRow({
          </span>
        ) : null}
-        {/* Average score */}
+        {/* Raw + balanced averages shown side by side */}
        {entry?.avgGlobalScore !== null && entry?.avgGlobalScore !== undefined && jurorScores && jurorScores.length > 1 && (
-          <span className="text-xs font-medium text-muted-foreground" title="Average score">
+          <div className="flex items-center gap-1.5 text-xs" title="Raw juror average vs. juror-balanced average (z-score normalized per juror, rescaled to 1-10)">
-            = {entry.avgGlobalScore.toFixed(1)}
+            <span className="font-medium text-muted-foreground">
-          </span>
+              {entry.avgGlobalScore.toFixed(1)}
            </span>
            {balancedScore != null && Math.abs(balancedScore - entry.avgGlobalScore) >= 0.05 && (
              <span
                className={cn(
                  'font-semibold tabular-nums rounded px-1.5 py-0.5 border',
                  balancedScore > entry.avgGlobalScore
                    ? 'bg-emerald-50 text-emerald-700 border-emerald-200'
                    : 'bg-amber-50 text-amber-700 border-amber-200',
                )}
              >
                ⇢ {balancedScore.toFixed(1)}
              </span>
            )}
          </div>
        )}
        {/* Advance decision indicator */}
@@ -909,7 +925,8 @@ export function RankingDashboard({ competitionId: _competitionId, roundId }: Ran
                                  currentRank={index + 1}
                                  entry={rankingMap.get(projectId)}
                                  projectInfo={projectInfoMap.get(projectId)}
-                                  jurorScores={evalScores?.[projectId]}
+                                  jurorScores={evalScores?.byProject[projectId]}
                                  balancedScore={evalScores?.balanced[projectId]?.balancedAverage ?? null}
                                  onSelect={() => setSelectedProjectId(projectId)}
                                  isSelected={selectedProjectId === projectId}
                                  originalRank={hasReorders ? snapshotOrder[projectId] : undefined}
--- a/src/server/routers/analytics.ts
+++ b/src/server/routers/analytics.ts
@@ -1,11 +1,13 @@
 import { z } from 'zod'
-import { router, observerProcedure } from '../trpc'
+import { router, observerProcedure, adminProcedure } from '../trpc'
 import { normalizeCountryToCode } from '@/lib/countries'
 import { getUserAvatarUrl } from '../utils/avatar-url'
 import { getProjectLogoUrl } from '../utils/project-logo-url'
 import { aggregateVotes } from '../services/deliberation'
 import { validateRoundConfig } from '@/types/competition-configs'
 import type { LiveFinalConfig } from '@/types/competition-configs'
 import { computeBalanceContext, computeBalancedProjectScores, type ScorePoint } from '../services/juror-balance'
 import { generateJurorCalibration } from '../services/ai-juror-calibration'
 const editionOrRoundInput = z.object({
  roundId: z.string().optional(),
@@ -185,73 +187,70 @@ export const analyticsRouter = router({
    }),
  /**
-   * Get project rankings with average scores
+   * Get project rankings with raw and balanced (juror-normalized) average scores.
   *
   * `averageScore` is the raw mean of per-evaluation criterion averages.
   * `balancedScore` rescales each juror's contributions via z-score (relative
   * to their own mean + stddev across projects they reviewed in this round),
   * then maps back onto the same 1-10 scale using the overall mean + stddev.
   * A harsh juror's scores are pulled up, a lenient juror's pulled down, so
   * rankings aren't skewed by a single outlier grader.
   */
  getProjectRankings: observerProcedure
    .input(editionOrRoundInput.and(z.object({ limit: z.number().optional() })))
    .query(async ({ ctx, input }) => {
-      const projects = await ctx.prisma.project.findMany({
+      const [projects, evaluations] = await Promise.all([
-        where: projectWhere(input),
+        ctx.prisma.project.findMany({
-        select: {
+          where: projectWhere(input),
-          id: true,
+          select: {
-          title: true,
+            id: true,
-          teamName: true,
+            title: true,
-          status: true,
+            teamName: true,
-          assignments: {
+            status: true,
            where: assignmentWhere(input),
            select: {
              evaluation: {
                select: { criterionScoresJson: true, status: true },
              },
            },
          },
-        },
+        }),
-      })
+        ctx.prisma.evaluation.findMany({
          where: evalWhere(input, { status: 'SUBMITTED' }),
          select: {
            criterionScoresJson: true,
            assignment: { select: { userId: true, projectId: true } },
          },
        }),
      ])
      // Extract a single eval-level score (mean of numeric criterion scores) per evaluation.
      const points: ScorePoint[] = []
      for (const e of evaluations) {
        const scores = e.criterionScoresJson as Record<string, unknown> | null
        if (!scores) continue
        const vals = Object.values(scores).filter((s): s is number => typeof s === 'number')
        if (vals.length === 0) continue
        const rawScore = vals.reduce((a, b) => a + b, 0) / vals.length
        points.push({ projectId: e.assignment.projectId, userId: e.assignment.userId, rawScore })
      }
      const balanceCtx = computeBalanceContext(points)
      const balancedByProject = computeBalancedProjectScores(points, balanceCtx)
      // Calculate average scores
      const rankings = projects
        .map((project) => {
-          const allScores: number[] = []
+          const result = balancedByProject.get(project.id)
          project.assignments.forEach((assignment) => {
            const evaluation = assignment.evaluation
            if (evaluation?.status === 'SUBMITTED') {
              const scores = evaluation.criterionScoresJson as Record<
                string,
                number
              > | null
              if (scores) {
                const scoreValues = Object.values(scores).filter(
                  (s): s is number => typeof s === 'number'
                )
                if (scoreValues.length > 0) {
                  const average =
                    scoreValues.reduce((a, b) => a + b, 0) / scoreValues.length
                  allScores.push(average)
                }
              }
            }
          })
          const averageScore =
            allScores.length > 0
              ? allScores.reduce((a, b) => a + b, 0) / allScores.length
              : null
          return {
            id: project.id,
            title: project.title,
            teamName: project.teamName,
            status: project.status,
-            averageScore,
+            averageScore: result?.rawAverage ?? null,
-            evaluationCount: allScores.length,
+            balancedScore: result?.balancedAverage ?? null,
            evaluationCount: result?.count ?? 0,
          }
        })
        .sort((a, b) => {
-          // Evaluated projects first (sorted by score desc), unevaluated at bottom
+          const aScore = a.balancedScore ?? a.averageScore
-          if (a.averageScore !== null && b.averageScore !== null) return b.averageScore - a.averageScore
+          const bScore = b.balancedScore ?? b.averageScore
-          if (a.averageScore !== null) return -1
+          if (aScore !== null && bScore !== null) return bScore - aScore
-          if (b.averageScore !== null) return 1
+          if (aScore !== null) return -1
          if (bScore !== null) return 1
          return 0
        })
@@ -2345,4 +2344,19 @@ export const analyticsRouter = router({
        standings,
      }
    }),
  /**
   * Admin-only mutation that produces the AI juror calibration advisory for a round.
   * Delegates to the `generateJurorCalibration` service with the requested round id,
   * the calling user's id and the request's Prisma client. The advisory explains the
   * per-juror z-score balance already applied in ranking — it describes, it does not prescribe.
   */
  generateJurorCalibration: adminProcedure
    .input(z.object({ roundId: z.string() }))
    .mutation(async ({ ctx, input }) => {
      const { roundId } = input
      const { user, prisma } = ctx
      return generateJurorCalibration({ roundId, userId: user.id, prisma })
    }),
 })
--- a/src/server/routers/ranking.ts
+++ b/src/server/routers/ranking.ts
@@ -12,6 +12,7 @@ import {
 } from '../services/ai-ranking'
 import { logAudit } from '../utils/audit'
 import type { EvaluationConfig } from '@/types/competition-configs'
 import { computeBalanceContext, computeBalancedProjectScores, type ScorePoint } from '../services/juror-balance'
 // ─── Local Types ───────────────────────────────────────────────────────────────
@@ -471,6 +472,7 @@ export const rankingRouter = router({
          evaluation: { status: 'SUBMITTED' },
        },
        select: {
          userId: true,
          projectId: true,
          user: { select: { name: true, email: true } },
          evaluation: {
@@ -489,6 +491,8 @@ export const rankingRouter = router({
        decision: boolean | null
      }>> = {}
      const balancePoints: ScorePoint[] = []
      for (const a of assignments) {
        if (!a.evaluation) continue
        const list = byProject[a.projectId] ?? []
@@ -511,8 +515,28 @@ export const rankingRouter = router({
          decision,
        })
        byProject[a.projectId] = list
        if (a.evaluation.globalScore != null) {
          balancePoints.push({
            projectId: a.projectId,
            userId: a.userId,
            rawScore: a.evaluation.globalScore,
          })
        }
      }
-      return byProject
+      const balanceCtx = computeBalanceContext(balancePoints)
      const balancedByProject = computeBalancedProjectScores(balancePoints, balanceCtx)
      // Per-project balanced average on the 1-10 scale, comparable to raw avgs.
      const balanced: Record<string, { rawAverage: number | null; balancedAverage: number | null }> = {}
      for (const [projectId, result] of balancedByProject.entries()) {
        balanced[projectId] = {
          rawAverage: result.rawAverage,
          balancedAverage: result.balancedAverage,
        }
      }
      return { byProject, balanced }
    }),
 })
--- a/src/server/services/ai-juror-calibration.ts
+++ b/src/server/services/ai-juror-calibration.ts
@@ -0,0 +1,355 @@
 /**
 * AI-Powered Juror Calibration Advisory
 *
 * Analyzes per-juror grading statistics for an evaluation round and
 * produces a human-readable explanation of how each juror's scores compare
 * to the cohort. Describes the z-score balance that's already applied in
 * ranking; does NOT introduce a new weighting layer — only explains the
 * existing math in plain language so admins can justify results to jurors.
 *
 * GDPR: Juror identifiers are replaced with Juror-1, Juror-2, ... before any
 * call to OpenAI. No names or emails leave the server.
 */
 import { TRPCError } from '@trpc/server'
 import { getOpenAI, getConfiguredModel, buildCompletionParams, AI_MODELS } from '@/lib/openai'
 import { logAIUsage, extractTokenUsage } from '@/server/utils/ai-usage'
 import { classifyAIError, createParseError, logAIError } from './ai-errors'
 import type { PrismaClient } from '@prisma/client'
 import { computeBalanceContext, type ScorePoint } from './juror-balance'
 // ─── Types ──────────────────────────────────────────────────────────────────
 /**
  * One juror's row in the calibration result: descriptive statistics for the
  * round plus a severity label and a short narrative.
  */
 export type JurorCalibrationEntry = {
  userId: string
  // Juror's name; generateJurorCalibration falls back to the email, then 'Unknown'.
  name: string
  // Number of score points counted for this juror.
  evaluationCount: number
  // Juror's mean score and standard deviation, rounded to 2 decimals.
  rawMean: number
  stddev: number
  // Juror mean minus cohort mean (negative = scores below the cohort), rounded to 2 decimals.
  deltaFromCohort: number
  // Juror stddev / cohort stddev, capped at 2; null when either stddev is not positive.
  effectiveInfluence: number | null
  // Taken from the AI narrative when one is available, otherwise from classifySeverity.
  severity: 'normal' | 'notable' | 'outlier'
  // AI narrative for this juror, or 'No AI narrative available.' when none matched.
  summary: string
 }
 /**
  * Full payload returned by generateJurorCalibration: cohort-level statistics,
  * the AI-written summary, and one entry per juror.
  */
 export type JurorCalibrationResult = {
  roundId: string
  roundName: string
  // Overall mean / stddev from the balance context, rounded to 2 decimals.
  cohortMean: number
  cohortStddev: number
  // Count of submitted evaluations that had at least one numeric criterion score.
  totalEvaluations: number
  totalJurors: number
  overallSummary: string
  keyTakeaways: string[]
  // Ordered by absolute delta from the cohort mean, largest first.
  jurors: JurorCalibrationEntry[]
  // Total tokens reported for the completion call.
  tokensUsed: number
  // Model identifier the completion was requested with.
  model: string
  // Timestamp taken when the result object is assembled on the server.
  generatedAt: Date
 }
 /**
  * Expected shape of the JSON object returned by the model. It carries narrative
  * fields only; jurors are keyed by their anonymized id (Juror-1, Juror-2, ...).
  */
 type AIResponsePayload = {
  overallSummary: string
  keyTakeaways: string[]
  jurors: Array<{
    // Anonymized id that is mapped back to the real juror after the call.
    jurorId: string
    severity: 'normal' | 'notable' | 'outlier'
    summary: string
  }>
 }
 /**
  * Server-side per-juror statistics (unrounded) tied to the real user id and
  * name. Built before anonymization; the name and user id are not included in
  * the payload sent to the model.
  */
 type InternalJurorRecord = {
  userId: string
  name: string
  evaluationCount: number
  rawMean: number
  stddev: number
  // Juror mean minus the overall mean of the balance context.
  deltaFromCohort: number
  // Juror stddev / overall stddev, capped at 2; null when either stddev is not positive.
  effectiveInfluence: number | null
 }
 // ─── Main Orchestrator ──────────────────────────────────────────────────────
 /**
  * Builds the AI juror calibration advisory for a single round.
  *
  * Steps: load the round and its SUBMITTED evaluations, reduce each evaluation
  * to the mean of its numeric criterion scores, derive per-juror statistics via
  * `computeBalanceContext`, send an anonymized (Juror-N) summary to OpenAI, and
  * merge the returned narrative back onto the real juror records.
  *
  * The model's JSON is validated before use: entries that are missing or of the
  * wrong type are dropped, and affected jurors fall back to `classifySeverity`
  * and a placeholder summary instead of crashing the request or the UI.
  *
  * @param roundId - Round whose submitted evaluations are analyzed.
  * @param userId - User the AI usage log entries are attributed to.
  * @param prisma - Prisma client used for all reads.
  * @returns Cohort statistics, AI summary/takeaways and one entry per juror.
  * @throws TRPCError NOT_FOUND when the round does not exist.
  * @throws TRPCError BAD_REQUEST when no submitted evaluation has numeric scores.
  * @throws TRPCError PRECONDITION_FAILED when no OpenAI client is available.
  * @throws TRPCError INTERNAL_SERVER_ERROR when the AI call fails or its
  *   response is not a usable JSON object.
  */
 export async function generateJurorCalibration({
   roundId,
   userId,
   prisma,
 }: {
   roundId: string
   userId: string
   prisma: PrismaClient
 }): Promise<JurorCalibrationResult> {
   // All figures leaving this function (to the model and to the client) use 2 decimals.
   const round2 = (value: number): number => Math.round(value * 100) / 100
   const round = await prisma.round.findUnique({
     where: { id: roundId },
     select: { id: true, name: true, roundType: true },
   })
   if (!round) {
     throw new TRPCError({ code: 'NOT_FOUND', message: 'Round not found' })
   }
   const evaluations = await prisma.evaluation.findMany({
     where: {
       status: 'SUBMITTED',
       assignment: { roundId },
     },
     select: {
       globalScore: true,
       criterionScoresJson: true,
       assignment: {
         select: {
           userId: true,
           projectId: true,
           user: { select: { id: true, name: true, email: true } },
         },
       },
     },
   })
   // Build (project, juror, score) points using each eval's mean criterion score,
   // matching how the reports page reports raw + balanced averages per project.
   const points: ScorePoint[] = []
   const nameByUserId = new Map<string, string>()
   for (const e of evaluations) {
     const scores = e.criterionScoresJson as Record<string, unknown> | null
     if (!scores) continue
     const vals = Object.values(scores).filter((s): s is number => typeof s === 'number')
     if (vals.length === 0) continue
     const rawScore = vals.reduce((a, b) => a + b, 0) / vals.length
     points.push({
       projectId: e.assignment.projectId,
       userId: e.assignment.userId,
       rawScore,
     })
     nameByUserId.set(
       e.assignment.userId,
       e.assignment.user.name ?? e.assignment.user.email ?? 'Unknown',
     )
   }
   if (points.length === 0) {
     throw new TRPCError({
       code: 'BAD_REQUEST',
       message: 'No submitted evaluations with numeric scores in this round',
     })
   }
   const ctx = computeBalanceContext(points)
   // Build internal juror stats and a per-juror effective influence ratio
   // (scale of the juror's raw stddev relative to the cohort stddev).
   const internalJurors: InternalJurorRecord[] = []
   for (const [uid, stats] of ctx.jurorStats.entries()) {
     const effectiveInfluence = ctx.overallStddev > 0 && stats.stddev > 0
       ? Math.min(2, stats.stddev / ctx.overallStddev)
       : null
     internalJurors.push({
       userId: uid,
       name: nameByUserId.get(uid) ?? 'Unknown',
       evaluationCount: stats.count,
       rawMean: stats.mean,
       stddev: stats.stddev,
       deltaFromCohort: stats.mean - ctx.overallMean,
       effectiveInfluence,
     })
   }
   // Sort by absolute delta from cohort desc so largest outliers land first in prompts + UI.
   internalJurors.sort((a, b) => Math.abs(b.deltaFromCohort) - Math.abs(a.deltaFromCohort))
   // Build anonymized payload for the AI call: only Juror-N ids and rounded stats.
   const anonymizedMap = new Map<string, string>()
   const anonymizedJurors = internalJurors.map((j, i) => {
     const id = `Juror-${i + 1}`
     anonymizedMap.set(j.userId, id)
     return {
       jurorId: id,
       evaluationCount: j.evaluationCount,
       rawMean: round2(j.rawMean),
       stddev: round2(j.stddev),
       deltaFromCohort: round2(j.deltaFromCohort),
       effectiveInfluence: j.effectiveInfluence != null ? round2(j.effectiveInfluence) : null,
     }
   })
   const openai = await getOpenAI()
   if (!openai) {
     throw new TRPCError({
       code: 'PRECONDITION_FAILED',
       message: 'OpenAI is not configured. Please set up your API key in Settings.',
     })
   }
   const model = await getConfiguredModel(AI_MODELS.QUICK)
   const prompt = buildCalibrationPrompt({
     roundName: round.name,
     cohortMean: round2(ctx.overallMean),
     cohortStddev: round2(ctx.overallStddev),
     totalEvaluations: points.length,
     jurors: anonymizedJurors,
   })
   let aiResponse: AIResponsePayload
   let tokensUsed = 0
   try {
     const params = buildCompletionParams(model, {
       messages: [{ role: 'user', content: prompt }],
       jsonMode: true,
       temperature: 0.2,
       maxTokens: 2000,
     })
     const response = await openai.chat.completions.create(params)
     tokensUsed = extractTokenUsage(response).totalTokens
     const content = response.choices[0]?.message?.content
     if (!content) throw new Error('Empty response from AI')
     try {
       // Syntax errors and non-object payloads are both reported as parse failures.
       aiResponse = coerceCalibrationPayload(JSON.parse(content))
     } catch (parseError) {
       const err = createParseError(
         parseError instanceof Error ? parseError.message : String(parseError),
       )
       logAIError('JurorCalibration', 'generate', err)
       throw new TRPCError({
         code: 'INTERNAL_SERVER_ERROR',
         message: 'Failed to parse AI response. Please try again.',
       })
     }
   } catch (error) {
     // Parse failures were already logged and wrapped above; pass them through.
     if (error instanceof TRPCError) throw error
     const classified = classifyAIError(error)
     logAIError('JurorCalibration', 'generate', classified)
     // NOTE(review): only the total token count is recorded; the prompt/completion split is logged as 0.
     await logAIUsage({
       userId,
       action: 'JUROR_CALIBRATION',
       entityType: 'Round',
       entityId: roundId,
       model,
       promptTokens: 0,
       completionTokens: 0,
       totalTokens: tokensUsed,
       itemsProcessed: 0,
       status: 'ERROR',
       errorMessage: classified.message,
     })
     throw new TRPCError({
       code: 'INTERNAL_SERVER_ERROR',
       message: classified.message,
     })
   }
   await logAIUsage({
     userId,
     action: 'JUROR_CALIBRATION',
     entityType: 'Round',
     entityId: roundId,
     model,
     promptTokens: 0,
     completionTokens: 0,
     totalTokens: tokensUsed,
     itemsProcessed: internalJurors.length,
     status: 'SUCCESS',
   })
   // Merge AI narrative back with internal stats by anonymized id.
   const narrativeById = new Map(aiResponse.jurors.map((j) => [j.jurorId, j]))
   const jurors: JurorCalibrationEntry[] = internalJurors.map((j) => {
     const anonId = anonymizedMap.get(j.userId)
     const narrative = anonId === undefined ? undefined : narrativeById.get(anonId)
     return {
       userId: j.userId,
       name: j.name,
       evaluationCount: j.evaluationCount,
       rawMean: round2(j.rawMean),
       stddev: round2(j.stddev),
       deltaFromCohort: round2(j.deltaFromCohort),
       effectiveInfluence: j.effectiveInfluence != null ? round2(j.effectiveInfluence) : null,
       severity: narrative?.severity ?? classifySeverity(j, ctx.overallStddev),
       summary: narrative?.summary ?? 'No AI narrative available.',
     }
   })
   return {
     roundId: round.id,
     roundName: round.name,
     cohortMean: round2(ctx.overallMean),
     cohortStddev: round2(ctx.overallStddev),
     totalEvaluations: points.length,
     totalJurors: internalJurors.length,
     overallSummary: aiResponse.overallSummary,
     keyTakeaways: aiResponse.keyTakeaways,
     jurors,
     tokensUsed,
     model,
     generatedAt: new Date(),
   }
 }
 /**
  * Narrows parsed model output to `AIResponsePayload` without trusting its shape.
  *
  * - Throws when the value is not a plain JSON object.
  * - A missing or blank `overallSummary` becomes a placeholder sentence.
  * - `keyTakeaways` keeps only string items (empty array when absent).
  * - A juror entry is kept only when it has a string `jurorId`, a string
  *   `summary` and one of the three known severities; anything else is dropped
  *   so the caller's statistical fallback applies to that juror.
  */
 function coerceCalibrationPayload(raw: unknown): AIResponsePayload {
   if (typeof raw !== 'object' || raw === null || Array.isArray(raw)) {
     throw new Error('AI response is not a JSON object')
   }
   const record = raw as Record<string, unknown>
   const overallSummary =
     typeof record.overallSummary === 'string' && record.overallSummary.trim() !== ''
       ? record.overallSummary
       : 'No AI summary available.'
   const keyTakeaways = Array.isArray(record.keyTakeaways)
     ? record.keyTakeaways.filter((t): t is string => typeof t === 'string')
     : []
   const jurors: AIResponsePayload['jurors'] = []
   if (Array.isArray(record.jurors)) {
     for (const item of record.jurors as unknown[]) {
       if (typeof item !== 'object' || item === null) continue
       const { jurorId, severity, summary } = item as Record<string, unknown>
       if (typeof jurorId !== 'string' || typeof summary !== 'string') continue
       if (severity !== 'normal' && severity !== 'notable' && severity !== 'outlier') continue
       jurors.push({ jurorId, severity, summary })
     }
   }
   return { overallSummary, keyTakeaways, jurors }
 }
 // ─── Helpers ────────────────────────────────────────────────────────────────
 /**
 * Statistical severity classification, used as the fallback when the AI
 * response carries no severity for a juror.
 *
 * The juror's absolute delta from the cohort mean is expressed in units of the
 * cohort standard deviation and bucketed with the same thresholds the prompt
 * gives the model: >= 1.5 is "outlier", >= 0.75 is "notable", otherwise "normal".
 *
 * @param juror Internal juror record; `deltaFromCohort` and `evaluationCount` are read.
 * @param cohortStddev Standard deviation of all scores in the round.
 * @returns The severity bucket for the juror.
 */
 function classifySeverity(
  juror: InternalJurorRecord,
  cohortStddev: number,
 ): 'normal' | 'notable' | 'outlier' {
  // Mirror the prompt guideline: with fewer than 3 evaluations a juror cannot
  // be classified confidently, so never flag them from the fallback path.
  const minEvaluationsForClassification = 3
  if (juror.evaluationCount < minEvaluationsForClassification) return 'normal'

  // A flat cohort has no spread to measure a delta against.
  if (cohortStddev === 0) return 'normal'

  const zDelta = Math.abs(juror.deltaFromCohort) / cohortStddev
  if (zDelta >= 1.5) return 'outlier'
  if (zDelta >= 0.75) return 'notable'
  return 'normal'
 }
 /**
 * Build the user prompt for the juror calibration advisory.
 *
 * The prompt contains the cohort statistics, a description of how balancing
 * works, the anonymized juror rows serialized as JSON, and the exact JSON shape
 * the model must return. This function does not sort `payload.jurors`; the
 * rows are emitted in the order given.
 *
 * The round name is free text, so it is JSON-encoded before interpolation.
 * Ordinary names render exactly as before (`"Round 1"`), while embedded quotes
 * or newlines are escaped and cannot terminate the CONTEXT line or inject
 * additional instructions.
 *
 * @param payload Rounded cohort statistics plus anonymized per-juror rows.
 * @returns The complete prompt text.
 */
 function buildCalibrationPrompt(payload: {
  roundName: string
  cohortMean: number
  cohortStddev: number
  totalEvaluations: number
  jurors: Array<{
    jurorId: string
    evaluationCount: number
    rawMean: number
    stddev: number
    deltaFromCohort: number
    effectiveInfluence: number | null
  }>
 }): string {
  // JSON.stringify supplies the surrounding double quotes and escapes
  // quotes, backslashes and control characters inside the name.
  const roundLabel = JSON.stringify(payload.roundName)
  return `You are analyzing juror grading patterns for a competition evaluation round. Your job is to EXPLAIN the statistical normalization that has already been applied; you are NOT introducing a new weighting scheme or prescribing changes.
 CONTEXT:
 - Round: ${roundLabel}
 - Cohort mean: ${payload.cohortMean} (scale 1-10)
 - Cohort stddev: ${payload.cohortStddev}
 - Total submitted evaluations: ${payload.totalEvaluations}
 HOW BALANCING WORKS:
 Each juror's scores are z-score normalized against their own mean and stddev, then rescaled back onto the 1-10 range. A juror who averages 2 points below the cohort won't drag projects down more than their peers; a lenient juror won't inflate projects. "effectiveInfluence" is roughly the juror's stddev divided by the cohort stddev — a value near 1.0 means they spread their scores similarly to the cohort; values well under 1 mean compressed scoring, well over 1 mean wide spread.
 JUROR DATA (anonymized, sorted by |deltaFromCohort| desc):
 ${JSON.stringify(payload.jurors, null, 2)}
 Return a JSON object with this exact shape:
 {
  "overallSummary": "2-3 sentences summarizing grading dispersion across the cohort — is the panel tightly aligned or widely divergent?",
  "keyTakeaways": ["up to 4 bullets: notable patterns, risks, what to watch"],
  "jurors": [
    {
      "jurorId": "Juror-N (matching the input)",
      "severity": "normal" | "notable" | "outlier",
      "summary": "One short sentence about this juror's grading pattern, referring to their rawMean, deltaFromCohort, and stddev. Example: 'Scored on average 2.1 points below cohort across 8 evaluations — consistently harsh, low internal variance.'"
    }
  ]
 }
 Guidelines:
 - "outlier" = delta from cohort >= 1.5 cohort-stddev in either direction
 - "notable" = delta from cohort 0.75-1.5 cohort-stddev
 - "normal" = delta from cohort < 0.75 cohort-stddev
 - A juror with very few evaluations (< 3) can't be classified confidently — note this in their summary and prefer "normal".
 - Be factual and specific. Reference the numbers. No speculation about intent.
 - Do not include juror names — only the anonymized jurorId.
 - Include every juror from the input in the jurors array. Order matches input.`
 }
--- a/src/server/services/juror-balance.ts
+++ b/src/server/services/juror-balance.ts
@@ -0,0 +1,120 @@
 /**
 * Juror balancing: z-score normalization to correct for per-juror grading harshness.
 *
 * A juror who grades 1 standard deviation below their peers on shared projects
 * shouldn't punish those projects more than a juror who grades at the mean.
 * We compute per-juror mean + stddev across their scores in a round, z-normalize
 * each score, then rescale back onto the same 1-10 scale using the overall
 * round-level mean + stddev so the balanced number is directly comparable to
 * the raw average.
 */
 /**
 * One (project, juror, score) observation: a single raw score that a juror
 * gave to a project. This is the flat input shape for the balancing functions.
 */
 export type ScorePoint = {
  /** Project the score was given to; scores are grouped by this for averaging. */
  projectId: string
  /** Juror who gave the score; per-juror statistics are keyed by this. */
  userId: string
  /** The score as submitted, before any normalization. */
  rawScore: number
 }
 /**
 * Per-project aggregate returned by computeBalancedProjectScores.
 */
 export type BalancedProjectResult = {
  projectId: string
  /**
   * Plain mean of the project's raw scores. Typed nullable, although
   * computeBalancedProjectScores always sets a number.
   */
  rawAverage: number | null
  /**
   * Juror-balanced mean on the raw-score scale, or null when the overall
   * stddev is 0 and no rescaling is possible.
   */
  balancedAverage: number | null
  /** Number of score points that contributed to this project. */
  count: number
 }
 /**
 * Grading statistics for a single juror across all of their score points.
 */
 export type JurorBalance = {
  userId: string
  /** Mean of the juror's raw scores. */
  mean: number
  /** Population standard deviation of the juror's raw scores; 0 for a single score. */
  stddev: number
  /** How many scores the juror contributed. */
  count: number
 }
 /**
 * Statistics needed to balance scores: the moments of all score points taken
 * together plus the per-juror statistics. Produced by computeBalanceContext.
 */
 export type BalanceContext = {
  /** Mean of every raw score, regardless of juror; 0 when there are no points. */
  overallMean: number
  /** Population standard deviation of every raw score; 0 with fewer than two points. */
  overallStddev: number
  /** Per-juror statistics keyed by the juror's userId. */
  jurorStats: Map<string, JurorBalance>
 }
 /**
 * Derive grading statistics from a flat list of (project, juror, score) points.
 *
 * Scores are bucketed by `userId`; every juror gets a mean, a population
 * standard deviation (the sum of squared deviations is divided by n) and a
 * count. The same two moments are also computed over all points together.
 * Empty input yields 0 for both overall values and an empty juror map; a
 * single score yields a stddev of 0.
 *
 * @param points Raw score observations.
 * @returns Overall mean/stddev plus the per-juror statistics keyed by userId.
 */
 export function computeBalanceContext(points: ScorePoint[]): BalanceContext {
  // Mean and population stddev of a list, accumulated left to right.
  const summarize = (values: number[]): { mean: number; stddev: number } => {
    const n = values.length
    if (n === 0) return { mean: 0, stddev: 0 }
    let total = 0
    for (const value of values) total += value
    const mean = total / n
    if (n === 1) return { mean, stddev: 0 }
    let squaredDeviations = 0
    for (const value of values) squaredDeviations += (value - mean) ** 2
    return { mean, stddev: Math.sqrt(squaredDeviations / n) }
  }

  // Bucket raw scores per juror; jurors keep first-seen order.
  const scoresByJuror = new Map<string, number[]>()
  for (const { userId, rawScore } of points) {
    const bucket = scoresByJuror.get(userId)
    if (bucket) {
      bucket.push(rawScore)
    } else {
      scoresByJuror.set(userId, [rawScore])
    }
  }

  const jurorStats = new Map<string, JurorBalance>()
  scoresByJuror.forEach((values, userId) => {
    const { mean, stddev } = summarize(values)
    jurorStats.set(userId, { userId, mean, stddev, count: values.length })
  })

  const overall = summarize(points.map((point) => point.rawScore))
  return {
    overallMean: overall.mean,
    overallStddev: overall.stddev,
    jurorStats,
  }
 }
 /**
 * Roll score points up into one result per project: the plain mean of the raw
 * scores plus a juror-balanced mean.
 *
 * Balancing turns each score into a z-score against its juror's own mean and
 * stddev, averages those z-scores, and maps the average back onto the raw
 * scale as `overallMean + avgZ * overallStddev`. Scores from a juror with no
 * stats entry or a zero stddev are z-scored against the cohort instead. When
 * the overall stddev is not positive, `balancedAverage` is left as `null`.
 *
 * @param points Raw score observations.
 * @param ctx Statistics for the same set of points.
 * @returns Results keyed by projectId, in first-seen project order.
 */
 export function computeBalancedProjectScores(
  points: ScorePoint[],
  ctx: BalanceContext,
 ): Map<string, BalancedProjectResult> {
  const { overallMean, overallStddev, jurorStats } = ctx
  const canBalance = overallStddev > 0

  // Group the points per project.
  const grouped = new Map<string, ScorePoint[]>()
  for (const point of points) {
    const bucket = grouped.get(point.projectId)
    if (bucket) {
      bucket.push(point)
    } else {
      grouped.set(point.projectId, [point])
    }
  }

  const results = new Map<string, BalancedProjectResult>()
  grouped.forEach((group, projectId) => {
    const count = group.length
    let rawSum = 0
    let zSum = 0
    for (const { userId, rawScore } of group) {
      rawSum += rawScore
      if (!canBalance) continue
      const own = jurorStats.get(userId)
      // Prefer the juror's own distribution; fall back to the cohort when the
      // juror is unknown or has no spread to normalize against.
      zSum += own && own.stddev > 0
        ? (rawScore - own.mean) / own.stddev
        : (rawScore - overallMean) / overallStddev
    }
    results.set(projectId, {
      projectId,
      rawAverage: rawSum / count,
      balancedAverage: canBalance ? overallMean + (zSum / count) * overallStddev : null,
      count,
    })
  })
  return results
 }
--- a/src/server/utils/ai-usage.ts
+++ b/src/server/utils/ai-usage.ts
@@ -21,6 +21,7 @@ export type AIAction =
  | 'ROUTING'
  | 'SHORTLIST'
  | 'RANKING'
  | 'JUROR_CALIBRATION'
 export type AIStatus = 'SUCCESS' | 'PARTIAL' | 'ERROR'