feat: surface juror-balanced scores and AI calibration advisory
All checks were successful
Build and Push Docker Image / build (push) Successful in 7m27s

Adds a shared juror-balancing utility (z-score normalization per juror,
rescaled back onto the raw 1-10 scale) and wires it into:

- Admin reports page: Top-10 project table now shows "Raw Avg" and
  "Balanced" columns side by side, and the summary stats row shows a
  balanced-average tile. Sort defaults to balanced so harsh and lenient
  graders no longer skew the ranking.
- Ranking dashboard: each project row shows a green/amber balanced-score
  chip next to the raw average when the two differ by ≥0.05, making it
  obvious when juror calibration moved a project's effective ranking.

Also adds an AI Juror Calibration Advisory: a mutation that takes
anonymized per-juror stats, calls OpenAI, and produces a plain-language
explanation of the cohort's grading patterns, plus a per-juror severity
(normal / notable / outlier) with a one-sentence narrative. The advisory
only describes the statistical balancing that already runs; it does not
introduce a new weighting layer. It is rendered as a panel in the Juror
Consistency tab when a specific round is selected.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Matt
2026-04-24 16:19:00 +02:00
parent 07dd7a0692
commit 982d5193c5
7 changed files with 774 additions and 65 deletions

View File

@@ -1,11 +1,13 @@
import { z } from 'zod'
import { router, observerProcedure } from '../trpc'
import { router, observerProcedure, adminProcedure } from '../trpc'
import { normalizeCountryToCode } from '@/lib/countries'
import { getUserAvatarUrl } from '../utils/avatar-url'
import { getProjectLogoUrl } from '../utils/project-logo-url'
import { aggregateVotes } from '../services/deliberation'
import { validateRoundConfig } from '@/types/competition-configs'
import type { LiveFinalConfig } from '@/types/competition-configs'
import { computeBalanceContext, computeBalancedProjectScores, type ScorePoint } from '../services/juror-balance'
import { generateJurorCalibration } from '../services/ai-juror-calibration'
const editionOrRoundInput = z.object({
roundId: z.string().optional(),
@@ -185,73 +187,70 @@ export const analyticsRouter = router({
}),
/**
* Get project rankings with average scores
* Get project rankings with raw and balanced (juror-normalized) average scores.
*
* `averageScore` is the raw mean of per-evaluation criterion averages.
* `balancedScore` rescales each juror's contributions via z-score (relative
* to their own mean + stddev across projects they reviewed in this round),
* then maps back onto the same 1-10 scale using the overall mean + stddev.
* A harsh juror's scores are pulled up, a lenient juror's pulled down, so
* rankings aren't skewed by a single outlier grader.
*/
getProjectRankings: observerProcedure
.input(editionOrRoundInput.and(z.object({ limit: z.number().optional() })))
.query(async ({ ctx, input }) => {
const projects = await ctx.prisma.project.findMany({
where: projectWhere(input),
select: {
id: true,
title: true,
teamName: true,
status: true,
assignments: {
where: assignmentWhere(input),
select: {
evaluation: {
select: { criterionScoresJson: true, status: true },
},
},
const [projects, evaluations] = await Promise.all([
ctx.prisma.project.findMany({
where: projectWhere(input),
select: {
id: true,
title: true,
teamName: true,
status: true,
},
},
})
}),
ctx.prisma.evaluation.findMany({
where: evalWhere(input, { status: 'SUBMITTED' }),
select: {
criterionScoresJson: true,
assignment: { select: { userId: true, projectId: true } },
},
}),
])
// Extract a single eval-level score (mean of numeric criterion scores) per evaluation.
const points: ScorePoint[] = []
for (const e of evaluations) {
const scores = e.criterionScoresJson as Record<string, unknown> | null
if (!scores) continue
const vals = Object.values(scores).filter((s): s is number => typeof s === 'number')
if (vals.length === 0) continue
const rawScore = vals.reduce((a, b) => a + b, 0) / vals.length
points.push({ projectId: e.assignment.projectId, userId: e.assignment.userId, rawScore })
}
const balanceCtx = computeBalanceContext(points)
const balancedByProject = computeBalancedProjectScores(points, balanceCtx)
// Calculate average scores
const rankings = projects
.map((project) => {
const allScores: number[] = []
project.assignments.forEach((assignment) => {
const evaluation = assignment.evaluation
if (evaluation?.status === 'SUBMITTED') {
const scores = evaluation.criterionScoresJson as Record<
string,
number
> | null
if (scores) {
const scoreValues = Object.values(scores).filter(
(s): s is number => typeof s === 'number'
)
if (scoreValues.length > 0) {
const average =
scoreValues.reduce((a, b) => a + b, 0) / scoreValues.length
allScores.push(average)
}
}
}
})
const averageScore =
allScores.length > 0
? allScores.reduce((a, b) => a + b, 0) / allScores.length
: null
const result = balancedByProject.get(project.id)
return {
id: project.id,
title: project.title,
teamName: project.teamName,
status: project.status,
averageScore,
evaluationCount: allScores.length,
averageScore: result?.rawAverage ?? null,
balancedScore: result?.balancedAverage ?? null,
evaluationCount: result?.count ?? 0,
}
})
.sort((a, b) => {
// Evaluated projects first (sorted by score desc), unevaluated at bottom
if (a.averageScore !== null && b.averageScore !== null) return b.averageScore - a.averageScore
if (a.averageScore !== null) return -1
if (b.averageScore !== null) return 1
const aScore = a.balancedScore ?? a.averageScore
const bScore = b.balancedScore ?? b.averageScore
if (aScore !== null && bScore !== null) return bScore - aScore
if (aScore !== null) return -1
if (bScore !== null) return 1
return 0
})
@@ -2345,4 +2344,19 @@ export const analyticsRouter = router({
standings,
}
}),
/**
 * Mutation (exposed through `adminProcedure`) that runs the AI juror
 * calibration analysis for a single evaluation round.
 *
 * Forwards the requested round id, the calling user's id and the request's
 * Prisma client to the `generateJurorCalibration` service and returns the
 * service result unchanged. Per the original note, the output is a
 * plain-language explanation of the per-juror z-score balance already applied
 * in ranking: it describes, it does not prescribe.
 */
generateJurorCalibration: adminProcedure
  .input(z.object({ roundId: z.string() }))
  .mutation(async ({ ctx: { user, prisma }, input: { roundId } }) => {
    const calibration = await generateJurorCalibration({ roundId, userId: user.id, prisma })
    return calibration
  }),
})

View File

@@ -12,6 +12,7 @@ import {
} from '../services/ai-ranking'
import { logAudit } from '../utils/audit'
import type { EvaluationConfig } from '@/types/competition-configs'
import { computeBalanceContext, computeBalancedProjectScores, type ScorePoint } from '../services/juror-balance'
// ─── Local Types ───────────────────────────────────────────────────────────────
@@ -471,6 +472,7 @@ export const rankingRouter = router({
evaluation: { status: 'SUBMITTED' },
},
select: {
userId: true,
projectId: true,
user: { select: { name: true, email: true } },
evaluation: {
@@ -489,6 +491,8 @@ export const rankingRouter = router({
decision: boolean | null
}>> = {}
const balancePoints: ScorePoint[] = []
for (const a of assignments) {
if (!a.evaluation) continue
const list = byProject[a.projectId] ?? []
@@ -511,8 +515,28 @@ export const rankingRouter = router({
decision,
})
byProject[a.projectId] = list
if (a.evaluation.globalScore != null) {
balancePoints.push({
projectId: a.projectId,
userId: a.userId,
rawScore: a.evaluation.globalScore,
})
}
}
return byProject
const balanceCtx = computeBalanceContext(balancePoints)
const balancedByProject = computeBalancedProjectScores(balancePoints, balanceCtx)
// Per-project balanced average on the 1-10 scale, comparable to raw avgs.
const balanced: Record<string, { rawAverage: number | null; balancedAverage: number | null }> = {}
for (const [projectId, result] of balancedByProject.entries()) {
balanced[projectId] = {
rawAverage: result.rawAverage,
balancedAverage: result.balancedAverage,
}
}
return { byProject, balanced }
}),
})