feat: surface juror-balanced scores and AI calibration advisory

Adds a shared juror-balancing utility (z-score normalization per juror, rescaled back onto the raw 1-10 scale) and wires it into:

- Admin reports page: the Top-10 project table now shows "Raw Avg" and "Balanced" columns side by side, and the summary stats row shows a balanced-average tile. Sort defaults to balanced so harsh and lenient graders no longer skew the ranking.
- Ranking dashboard: each project row shows a green/amber balanced-score chip next to the raw average when the two differ by ≥0.05, making it obvious when juror calibration moved a project's effective ranking.

Also adds the AI Juror Calibration Advisory — a mutation that takes anonymized per-juror stats, calls OpenAI, and produces a plain-language explanation of the cohort's grading patterns plus a per-juror severity (normal / notable / outlier) with a one-sentence narrative. The advisory describes the statistical balance that already runs; it does not introduce a new weighting layer. It is rendered as a panel in the Juror Consistency tab when a specific round is selected.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-24 16:19:00 +02:00
parent 07dd7a0692
commit 982d5193c5
7 changed files with 774 additions and 65 deletions
--- a/src/server/routers/analytics.ts
+++ b/src/server/routers/analytics.ts
@@ -1,11 +1,13 @@
 import { z } from 'zod'
-import { router, observerProcedure } from '../trpc'
+import { router, observerProcedure, adminProcedure } from '../trpc'
 import { normalizeCountryToCode } from '@/lib/countries'
 import { getUserAvatarUrl } from '../utils/avatar-url'
 import { getProjectLogoUrl } from '../utils/project-logo-url'
 import { aggregateVotes } from '../services/deliberation'
 import { validateRoundConfig } from '@/types/competition-configs'
 import type { LiveFinalConfig } from '@/types/competition-configs'
+import { computeBalanceContext, computeBalancedProjectScores, type ScorePoint } from '../services/juror-balance'
+import { generateJurorCalibration } from '../services/ai-juror-calibration'

 const editionOrRoundInput = z.object({
  roundId: z.string().optional(),
@@ -185,73 +187,70 @@ export const analyticsRouter = router({
    }),

  /**
-   * Get project rankings with average scores
+   * Get project rankings with raw and balanced (juror-normalized) average scores.
+   *
+   * `averageScore` is the raw mean of per-evaluation criterion averages.
+   * `balancedScore` rescales each juror's contributions via z-score (relative
+   * to their own mean + stddev across projects they reviewed in this round),
+   * then maps back onto the same 1-10 scale using the overall mean + stddev.
+   * A harsh juror's scores are pulled up, a lenient juror's pulled down, so
+   * rankings aren't skewed by a single outlier grader.
   */
  getProjectRankings: observerProcedure
    .input(editionOrRoundInput.and(z.object({ limit: z.number().optional() })))
    .query(async ({ ctx, input }) => {
-      const projects = await ctx.prisma.project.findMany({
-        where: projectWhere(input),
-        select: {
-          id: true,
-          title: true,
-          teamName: true,
-          status: true,
-          assignments: {
-            where: assignmentWhere(input),
-            select: {
-              evaluation: {
-                select: { criterionScoresJson: true, status: true },
-              },
-            },
+      const [projects, evaluations] = await Promise.all([
+        ctx.prisma.project.findMany({
+          where: projectWhere(input),
+          select: {
+            id: true,
+            title: true,
+            teamName: true,
+            status: true,
          },
-        },
-      })
+        }),
+        ctx.prisma.evaluation.findMany({
+          where: evalWhere(input, { status: 'SUBMITTED' }),
+          select: {
+            criterionScoresJson: true,
+            assignment: { select: { userId: true, projectId: true } },
+          },
+        }),
+      ])
+
+      // Extract a single eval-level score (mean of numeric criterion scores) per evaluation.
+      const points: ScorePoint[] = []
+      for (const e of evaluations) {
+        const scores = e.criterionScoresJson as Record<string, unknown> | null
+        if (!scores) continue
+        const vals = Object.values(scores).filter((s): s is number => typeof s === 'number')
+        if (vals.length === 0) continue
+        const rawScore = vals.reduce((a, b) => a + b, 0) / vals.length
+        points.push({ projectId: e.assignment.projectId, userId: e.assignment.userId, rawScore })
+      }
+
+      const balanceCtx = computeBalanceContext(points)
+      const balancedByProject = computeBalancedProjectScores(points, balanceCtx)

-      // Calculate average scores
      const rankings = projects
        .map((project) => {
-          const allScores: number[] = []
-
-          project.assignments.forEach((assignment) => {
-            const evaluation = assignment.evaluation
-            if (evaluation?.status === 'SUBMITTED') {
-              const scores = evaluation.criterionScoresJson as Record<
-                string,
-                number
-              > | null
-              if (scores) {
-                const scoreValues = Object.values(scores).filter(
-                  (s): s is number => typeof s === 'number'
-                )
-                if (scoreValues.length > 0) {
-                  const average =
-                    scoreValues.reduce((a, b) => a + b, 0) / scoreValues.length
-                  allScores.push(average)
-                }
-              }
-            }
-          })
-
-          const averageScore =
-            allScores.length > 0
-              ? allScores.reduce((a, b) => a + b, 0) / allScores.length
-              : null
-
+          const result = balancedByProject.get(project.id)
          return {
            id: project.id,
            title: project.title,
            teamName: project.teamName,
            status: project.status,
-            averageScore,
-            evaluationCount: allScores.length,
+            averageScore: result?.rawAverage ?? null,
+            balancedScore: result?.balancedAverage ?? null,
+            evaluationCount: result?.count ?? 0,
          }
        })
        .sort((a, b) => {
-          // Evaluated projects first (sorted by score desc), unevaluated at bottom
-          if (a.averageScore !== null && b.averageScore !== null) return b.averageScore - a.averageScore
-          if (a.averageScore !== null) return -1
-          if (b.averageScore !== null) return 1
+          const aScore = a.balancedScore ?? a.averageScore
+          const bScore = b.balancedScore ?? b.averageScore
+          if (aScore !== null && bScore !== null) return bScore - aScore
+          if (aScore !== null) return -1
+          if (bScore !== null) return 1
          return 0
        })

@@ -2345,4 +2344,19 @@ export const analyticsRouter = router({
        standings,
      }
    }),
+
+  /**
+   * AI-powered juror calibration analysis for an evaluation round.
+   * Produces a plain-language explanation of the per-juror z-score balance
+   * already applied in ranking — describes, does not prescribe.
+   */
+  generateJurorCalibration: adminProcedure
+    .input(z.object({ roundId: z.string() }))
+    .mutation(async ({ ctx, input }) => {
+      return generateJurorCalibration({
+        roundId: input.roundId,
+        userId: ctx.user.id,
+        prisma: ctx.prisma,
+      })
+    }),
 })