diff --git a/src/app/(admin)/admin/reports/page.tsx b/src/app/(admin)/admin/reports/page.tsx
index ab5b58a..cf22a20 100644
--- a/src/app/(admin)/admin/reports/page.tsx
+++ b/src/app/(admin)/admin/reports/page.tsx
@@ -45,7 +45,11 @@ import {
Trophy,
ArrowRight,
Hash,
+ Sparkles,
+ Loader2,
+ AlertTriangle,
} from 'lucide-react'
+import { toast } from 'sonner'
import { formatDateOnly } from '@/lib/utils'
import {
ScoreDistributionChart,
@@ -271,6 +275,12 @@ function ReportsOverview() {
const evaluated = projectRankings.filter(p => p.averageScore !== null)
const scores = evaluated.map(p => p.averageScore as number)
const avgScore = scores.length ? scores.reduce((a, b) => a + b, 0) / scores.length : 0
+ const balancedScores = projectRankings
+ .map(p => p.balancedScore)
+ .filter((s): s is number => s != null)
+ const avgBalanced = balancedScores.length
+ ? balancedScores.reduce((a, b) => a + b, 0) / balancedScores.length
+ : null
const minScore = scores.length ? Math.min(...scores) : 0
const maxScore = scores.length ? Math.max(...scores) : 0
const evalPercent = projectRankings.length ? Math.round((evaluated.length / projectRankings.length) * 100) : 0
@@ -281,14 +291,28 @@ function ReportsOverview() {
return (
<>
-
+
Total Projects
{projectRankings.length}
-
-
Avg Score
-
{avgScore ? avgScore.toFixed(1) : '-'}
+
+
Raw Avg
+
+ {avgScore ? avgScore.toFixed(1) : '-'}
+
+
+
+
Balanced Avg
+
+ {avgBalanced == null ? '-' : avgBalanced.toFixed(1)}
+
Evaluated
@@ -319,7 +343,7 @@ function ReportsOverview() {
{/* Top 10 ranked table */}
- Top 10 by Average Score
+ Top 10 by Balanced Score
@@ -328,7 +352,18 @@ function ReportsOverview() {
#
Project
Team
- Avg
+
+ Raw Avg
+
+
+ Balanced
+
Evals
Status
@@ -345,9 +380,12 @@ function ReportsOverview() {
{p.teamName || '-'}
-
+
{p.averageScore === null ? '-' : p.averageScore.toFixed(2)}
+
+ {p.balancedScore == null ? '-' : p.balancedScore.toFixed(2)}
+
{p.evaluationCount}
{formatStatusLabel(p.status)}
@@ -870,10 +908,150 @@ function JurorConsistencyTab() {
}}
/>
)}
+
+ {queryInput.roundId && (
+
+ )}
)
}
+function JurorCalibrationPanel({ roundId }: { roundId: string }) {
+ const mutation = trpc.analytics.generateJurorCalibration.useMutation({
+ onError: (err) => toast.error(`Calibration analysis failed: ${err.message}`),
+ })
+ const result = mutation.data
+
+ const severityStyle: Record = {
+ outlier: 'bg-red-50 text-red-700 border-red-200',
+ notable: 'bg-amber-50 text-amber-700 border-amber-200',
+ normal: 'bg-muted text-muted-foreground',
+ }
+
+ return (
+
+
+
+
+
+
+ AI Juror Calibration Advisory
+
+
+ Plain-language explanation of the per-juror score balancing already applied to rankings.
+ Describes, does not prescribe — the math runs regardless.
+
+
+
mutation.mutate({ roundId })}
+ disabled={mutation.isPending}
+ className="gap-2"
+ >
+ {mutation.isPending ? : }
+ {mutation.isPending ? 'Analyzing…' : result ? 'Regenerate' : 'Analyze jurors'}
+
+
+
+
+ {!result && !mutation.isPending && (
+
+ Run the analysis to see per-juror grading patterns, cohort stats, and the calibration
+ narrative for the selected round.
+
+ )}
+
+ {result && (
+ <>
+
+
+
Cohort Mean
+
{result.cohortMean.toFixed(2)}
+
+
+
Cohort Stddev
+
{result.cohortStddev.toFixed(2)}
+
+
+
Evaluations
+
{result.totalEvaluations}
+
+
+
Jurors
+
{result.totalJurors}
+
+
+
+
+
{result.overallSummary}
+ {result.keyTakeaways.length > 0 && (
+
+ {result.keyTakeaways.map((t, i) => (
+
+
+ {t}
+
+ ))}
+
+ )}
+
+
+
+
+
+
+ Juror
+ Evals
+ Mean
+ Δ Cohort
+
+ Influence
+
+ Severity
+ Notes
+
+
+
+ {result.jurors.map((j) => (
+
+ {j.name}
+ {j.evaluationCount}
+ {j.rawMean.toFixed(2)}
+ 0.5 ? 'text-emerald-600' : ''
+ }`}
+ >
+ {j.deltaFromCohort > 0 ? '+' : ''}
+ {j.deltaFromCohort.toFixed(2)}
+
+
+ {j.effectiveInfluence == null ? '-' : j.effectiveInfluence.toFixed(2)}
+
+
+
+ {j.severity === 'outlier' && }
+ {j.severity}
+
+
+
+ {j.summary}
+
+
+ ))}
+
+
+
+
+
+ Generated {result.generatedAt.toLocaleString()} · {result.tokensUsed} tokens · model {result.model}
+
+ >
+ )}
+
+
+ )
+}
+
function DiversityTab() {
const [selectedValue, setSelectedValue] = useState(null)
diff --git a/src/components/admin/round/ranking-dashboard.tsx b/src/components/admin/round/ranking-dashboard.tsx
index 32158ee..b0429d2 100644
--- a/src/components/admin/round/ranking-dashboard.tsx
+++ b/src/components/admin/round/ranking-dashboard.tsx
@@ -82,6 +82,7 @@ type SortableProjectRowProps = {
entry: (RankedProjectEntry & { originalIndex?: number }) | undefined
projectInfo: ProjectInfo | undefined
jurorScores: JurorScore[] | undefined
+ balancedScore: number | null
onSelect: () => void
isSelected: boolean
originalRank: number | undefined // from snapshotOrder — always in sync with localOrder
@@ -95,6 +96,7 @@ function SortableProjectRow({
entry,
projectInfo,
jurorScores,
+ balancedScore,
onSelect,
isSelected,
originalRank,
@@ -199,11 +201,25 @@ function SortableProjectRow({
) : null}
- {/* Average score */}
+ {/* Raw + balanced averages shown side by side */}
{entry?.avgGlobalScore !== null && entry?.avgGlobalScore !== undefined && jurorScores && jurorScores.length > 1 && (
-
- = {entry.avgGlobalScore.toFixed(1)}
-
+
+
+ {entry.avgGlobalScore.toFixed(1)}
+
+ {balancedScore != null && Math.abs(balancedScore - entry.avgGlobalScore) >= 0.05 && (
+ entry.avgGlobalScore
+ ? 'bg-emerald-50 text-emerald-700 border-emerald-200'
+ : 'bg-amber-50 text-amber-700 border-amber-200',
+ )}
+ >
+ ⇢ {balancedScore.toFixed(1)}
+
+ )}
+
)}
{/* Advance decision indicator */}
@@ -909,7 +925,8 @@ export function RankingDashboard({ competitionId: _competitionId, roundId }: Ran
currentRank={index + 1}
entry={rankingMap.get(projectId)}
projectInfo={projectInfoMap.get(projectId)}
- jurorScores={evalScores?.[projectId]}
+ jurorScores={evalScores?.byProject[projectId]}
+ balancedScore={evalScores?.balanced[projectId]?.balancedAverage ?? null}
onSelect={() => setSelectedProjectId(projectId)}
isSelected={selectedProjectId === projectId}
originalRank={hasReorders ? snapshotOrder[projectId] : undefined}
diff --git a/src/server/routers/analytics.ts b/src/server/routers/analytics.ts
index 951573e..46fd950 100644
--- a/src/server/routers/analytics.ts
+++ b/src/server/routers/analytics.ts
@@ -1,11 +1,13 @@
import { z } from 'zod'
-import { router, observerProcedure } from '../trpc'
+import { router, observerProcedure, adminProcedure } from '../trpc'
import { normalizeCountryToCode } from '@/lib/countries'
import { getUserAvatarUrl } from '../utils/avatar-url'
import { getProjectLogoUrl } from '../utils/project-logo-url'
import { aggregateVotes } from '../services/deliberation'
import { validateRoundConfig } from '@/types/competition-configs'
import type { LiveFinalConfig } from '@/types/competition-configs'
+import { computeBalanceContext, computeBalancedProjectScores, type ScorePoint } from '../services/juror-balance'
+import { generateJurorCalibration } from '../services/ai-juror-calibration'
const editionOrRoundInput = z.object({
roundId: z.string().optional(),
@@ -185,73 +187,70 @@ export const analyticsRouter = router({
}),
/**
- * Get project rankings with average scores
+ * Get project rankings with raw and balanced (juror-normalized) average scores.
+ *
+ * `averageScore` is the raw mean of per-evaluation criterion averages.
+ * `balancedScore` rescales each juror's contributions via z-score (relative
+ * to their own mean + stddev across projects they reviewed in this round),
+ * then maps back onto the same 1-10 scale using the overall mean + stddev.
+ * A harsh juror's scores are pulled up, a lenient juror's pulled down, so
+ * rankings aren't skewed by a single outlier grader.
*/
getProjectRankings: observerProcedure
.input(editionOrRoundInput.and(z.object({ limit: z.number().optional() })))
.query(async ({ ctx, input }) => {
- const projects = await ctx.prisma.project.findMany({
- where: projectWhere(input),
- select: {
- id: true,
- title: true,
- teamName: true,
- status: true,
- assignments: {
- where: assignmentWhere(input),
- select: {
- evaluation: {
- select: { criterionScoresJson: true, status: true },
- },
- },
+ const [projects, evaluations] = await Promise.all([
+ ctx.prisma.project.findMany({
+ where: projectWhere(input),
+ select: {
+ id: true,
+ title: true,
+ teamName: true,
+ status: true,
},
- },
- })
+ }),
+ ctx.prisma.evaluation.findMany({
+ where: evalWhere(input, { status: 'SUBMITTED' }),
+ select: {
+ criterionScoresJson: true,
+ assignment: { select: { userId: true, projectId: true } },
+ },
+ }),
+ ])
+
+ // Extract a single eval-level score (mean of numeric criterion scores) per evaluation.
+ const points: ScorePoint[] = []
+ for (const e of evaluations) {
+ const scores = e.criterionScoresJson as Record | null
+ if (!scores) continue
+ const vals = Object.values(scores).filter((s): s is number => typeof s === 'number')
+ if (vals.length === 0) continue
+ const rawScore = vals.reduce((a, b) => a + b, 0) / vals.length
+ points.push({ projectId: e.assignment.projectId, userId: e.assignment.userId, rawScore })
+ }
+
+ const balanceCtx = computeBalanceContext(points)
+ const balancedByProject = computeBalancedProjectScores(points, balanceCtx)
- // Calculate average scores
const rankings = projects
.map((project) => {
- const allScores: number[] = []
-
- project.assignments.forEach((assignment) => {
- const evaluation = assignment.evaluation
- if (evaluation?.status === 'SUBMITTED') {
- const scores = evaluation.criterionScoresJson as Record<
- string,
- number
- > | null
- if (scores) {
- const scoreValues = Object.values(scores).filter(
- (s): s is number => typeof s === 'number'
- )
- if (scoreValues.length > 0) {
- const average =
- scoreValues.reduce((a, b) => a + b, 0) / scoreValues.length
- allScores.push(average)
- }
- }
- }
- })
-
- const averageScore =
- allScores.length > 0
- ? allScores.reduce((a, b) => a + b, 0) / allScores.length
- : null
-
+ const result = balancedByProject.get(project.id)
return {
id: project.id,
title: project.title,
teamName: project.teamName,
status: project.status,
- averageScore,
- evaluationCount: allScores.length,
+ averageScore: result?.rawAverage ?? null,
+ balancedScore: result?.balancedAverage ?? null,
+ evaluationCount: result?.count ?? 0,
}
})
.sort((a, b) => {
- // Evaluated projects first (sorted by score desc), unevaluated at bottom
- if (a.averageScore !== null && b.averageScore !== null) return b.averageScore - a.averageScore
- if (a.averageScore !== null) return -1
- if (b.averageScore !== null) return 1
+ const aScore = a.balancedScore ?? a.averageScore
+ const bScore = b.balancedScore ?? b.averageScore
+ if (aScore !== null && bScore !== null) return bScore - aScore
+ if (aScore !== null) return -1
+ if (bScore !== null) return 1
return 0
})
@@ -2345,4 +2344,19 @@ export const analyticsRouter = router({
standings,
}
}),
+
+ /**
+ * AI-powered juror calibration analysis for an evaluation round, exposed via adminProcedure.
+ * Forwards the round id, the calling user's id and the request's Prisma client to the
+ * generateJurorCalibration service and returns its result — describes, does not prescribe.
+ */
+ generateJurorCalibration: adminProcedure
+ .input(z.object({ roundId: z.string() }))
+ .mutation(async ({ ctx, input }) => {
+ return generateJurorCalibration({
+ roundId: input.roundId,
+ userId: ctx.user.id,
+ prisma: ctx.prisma,
+ })
+ }),
})
diff --git a/src/server/routers/ranking.ts b/src/server/routers/ranking.ts
index 0a17e81..2e1e5dd 100644
--- a/src/server/routers/ranking.ts
+++ b/src/server/routers/ranking.ts
@@ -12,6 +12,7 @@ import {
} from '../services/ai-ranking'
import { logAudit } from '../utils/audit'
import type { EvaluationConfig } from '@/types/competition-configs'
+import { computeBalanceContext, computeBalancedProjectScores, type ScorePoint } from '../services/juror-balance'
// ─── Local Types ───────────────────────────────────────────────────────────────
@@ -471,6 +472,7 @@ export const rankingRouter = router({
evaluation: { status: 'SUBMITTED' },
},
select: {
+ userId: true,
projectId: true,
user: { select: { name: true, email: true } },
evaluation: {
@@ -489,6 +491,8 @@ export const rankingRouter = router({
decision: boolean | null
}>> = {}
+ const balancePoints: ScorePoint[] = []
+
for (const a of assignments) {
if (!a.evaluation) continue
const list = byProject[a.projectId] ?? []
@@ -511,8 +515,28 @@ export const rankingRouter = router({
decision,
})
byProject[a.projectId] = list
+
+ if (a.evaluation.globalScore != null) {
+ balancePoints.push({
+ projectId: a.projectId,
+ userId: a.userId,
+ rawScore: a.evaluation.globalScore,
+ })
+ }
}
- return byProject
+ const balanceCtx = computeBalanceContext(balancePoints)
+ const balancedByProject = computeBalancedProjectScores(balancePoints, balanceCtx)
+
+ // Per-project balanced average on the 1-10 scale, comparable to raw avgs.
+ const balanced: Record = {}
+ for (const [projectId, result] of balancedByProject.entries()) {
+ balanced[projectId] = {
+ rawAverage: result.rawAverage,
+ balancedAverage: result.balancedAverage,
+ }
+ }
+
+ return { byProject, balanced }
}),
})
diff --git a/src/server/services/ai-juror-calibration.ts b/src/server/services/ai-juror-calibration.ts
new file mode 100644
index 0000000..2e64cec
--- /dev/null
+++ b/src/server/services/ai-juror-calibration.ts
@@ -0,0 +1,355 @@
+/**
+ * AI-Powered Juror Calibration Advisory
+ *
+ * Analyzes per-juror grading statistics for an evaluation round and
+ * produces a human-readable explanation of how each juror's scores compare
+ * to the cohort. Describes the z-score balance that's already applied in
+ * ranking; does NOT introduce a new weighting layer — only explains the
+ * existing math in plain language so admins can justify results to jurors.
+ *
+ * GDPR: Juror identifiers are replaced with Juror-1, Juror-2, ... before any
+ * call to OpenAI. No names or emails leave the server.
+ */
+
+import { TRPCError } from '@trpc/server'
+import { getOpenAI, getConfiguredModel, buildCompletionParams, AI_MODELS } from '@/lib/openai'
+import { logAIUsage, extractTokenUsage } from '@/server/utils/ai-usage'
+import { classifyAIError, createParseError, logAIError } from './ai-errors'
+import type { PrismaClient } from '@prisma/client'
+import { computeBalanceContext, type ScorePoint } from './juror-balance'
+
+// ─── Types ──────────────────────────────────────────────────────────────────
+
+export type JurorCalibrationEntry = { // one juror's row in the calibration result
+ userId: string
+ name: string // user name, else email, else 'Unknown'
+ evaluationCount: number // evaluations by this juror that had numeric criterion scores
+ rawMean: number // juror's mean score, rounded to 2 decimals
+ stddev: number // juror's score stddev, rounded to 2 decimals
+ deltaFromCohort: number // juror mean minus cohort mean, rounded to 2 decimals
+ effectiveInfluence: number | null // juror stddev / cohort stddev, capped at 2; null if either is 0
+ severity: 'normal' | 'notable' | 'outlier' // AI label, or classifySeverity when the AI gave none
+ summary: string // AI sentence, or 'No AI narrative available.'
+}
+
+export type JurorCalibrationResult = { // payload returned by generateJurorCalibration
+ roundId: string
+ roundName: string
+ cohortMean: number // mean of all evaluation-level scores, rounded to 2 decimals
+ cohortStddev: number // stddev of all evaluation-level scores, rounded to 2 decimals
+ totalEvaluations: number // submitted evaluations that had at least one numeric criterion score
+ totalJurors: number
+ overallSummary: string // taken from the AI response
+ keyTakeaways: string[] // taken from the AI response
+ jurors: JurorCalibrationEntry[] // sorted by |deltaFromCohort|, largest first
+ tokensUsed: number // totalTokens reported for the AI call
+ model: string
+ generatedAt: Date // server time when the result was assembled
+}
+
+type AIResponsePayload = { // JSON shape requested from the model by buildCalibrationPrompt
+ overallSummary: string
+ keyTakeaways: string[]
+ jurors: Array<{
+ jurorId: string // anonymized id ("Juror-N") used to match back to the real juror
+ severity: 'normal' | 'notable' | 'outlier'
+ summary: string
+ }>
+}
+
+type InternalJurorRecord = { // unrounded per-juror stats, kept server-side
+ userId: string
+ name: string // not included in the anonymized AI payload
+ evaluationCount: number
+ rawMean: number
+ stddev: number
+ deltaFromCohort: number // rawMean minus the cohort mean
+ effectiveInfluence: number | null // min(2, juror stddev / cohort stddev); null if either stddev is 0
+}
+
+// ─── Main Orchestrator ──────────────────────────────────────────────────────
+
+/**
+ * Builds the juror calibration advisory for one round.
+ *
+ * Loads the round's submitted evaluations, derives per-juror statistics, sends an
+ * anonymized (Juror-N) summary to the configured OpenAI model and merges the returned
+ * narrative with the locally computed numbers. The model's JSON is treated as
+ * untrusted: missing or malformed fields fall back to local values instead of
+ * throwing after the call has been made and its usage logged.
+ *
+ * @throws TRPCError NOT_FOUND (unknown round), BAD_REQUEST (no numeric scores),
+ *   PRECONDITION_FAILED (OpenAI not configured), INTERNAL_SERVER_ERROR (AI call or
+ *   JSON parse failed; usage is logged with status ERROR first).
+ */
+export async function generateJurorCalibration({
+  roundId,
+  userId,
+  prisma,
+}: {
+  roundId: string
+  userId: string
+  prisma: PrismaClient
+}): Promise<JurorCalibrationResult> {
+  const round = await prisma.round.findUnique({
+    where: { id: roundId },
+    select: { id: true, name: true, roundType: true },
+  })
+  if (!round) throw new TRPCError({ code: 'NOT_FOUND', message: 'Round not found' })
+
+  const evaluations = await prisma.evaluation.findMany({
+    where: { status: 'SUBMITTED', assignment: { roundId } },
+    select: {
+      globalScore: true,
+      criterionScoresJson: true,
+      assignment: {
+        select: {
+          userId: true,
+          projectId: true,
+          user: { select: { id: true, name: true, email: true } },
+        },
+      },
+    },
+  })
+
+  // One point per evaluation: the mean of its numeric criterion scores, matching
+  // how the reports page derives raw + balanced averages per project.
+  const points: ScorePoint[] = []
+  const nameByUserId = new Map<string, string>()
+  for (const e of evaluations) {
+    const scores = e.criterionScoresJson as Record<string, unknown> | null
+    if (!scores) continue
+    const vals = Object.values(scores).filter((s): s is number => typeof s === 'number')
+    if (vals.length === 0) continue
+    const { userId: jurorUserId, projectId, user } = e.assignment
+    const rawScore = vals.reduce((a, b) => a + b, 0) / vals.length
+    points.push({ projectId, userId: jurorUserId, rawScore })
+    nameByUserId.set(jurorUserId, user.name ?? user.email ?? 'Unknown')
+  }
+
+  if (points.length === 0) {
+    throw new TRPCError({
+      code: 'BAD_REQUEST',
+      message: 'No submitted evaluations with numeric scores in this round',
+    })
+  }
+
+  const ctx = computeBalanceContext(points)
+
+  // Per-juror stats plus an "effective influence" ratio: the juror's stddev relative
+  // to the cohort stddev, capped at 2 and null when either stddev is 0.
+  const internalJurors: InternalJurorRecord[] = []
+  for (const [uid, stats] of ctx.jurorStats.entries()) {
+    const effectiveInfluence = ctx.overallStddev > 0 && stats.stddev > 0
+      ? Math.min(2, stats.stddev / ctx.overallStddev)
+      : null
+    internalJurors.push({
+      userId: uid,
+      name: nameByUserId.get(uid) ?? 'Unknown',
+      evaluationCount: stats.count,
+      rawMean: stats.mean,
+      stddev: stats.stddev,
+      deltaFromCohort: stats.mean - ctx.overallMean,
+      effectiveInfluence,
+    })
+  }
+
+  // Largest |delta from cohort| first, so outliers lead both the prompt and the UI.
+  internalJurors.sort((a, b) => Math.abs(b.deltaFromCohort) - Math.abs(a.deltaFromCohort))
+
+  // Anonymized payload for the AI call: positional Juror-N ids and 2-decimal stats
+  // only, so no user id, name or email is part of the prompt.
+  const anonId = (index: number): string => `Juror-${index + 1}`
+  const round2 = (value: number): number => Math.round(value * 100) / 100
+  const anonymizedJurors = internalJurors.map((j, i) => ({
+    jurorId: anonId(i),
+    evaluationCount: j.evaluationCount,
+    rawMean: round2(j.rawMean),
+    stddev: round2(j.stddev),
+    deltaFromCohort: round2(j.deltaFromCohort),
+    effectiveInfluence: j.effectiveInfluence != null ? round2(j.effectiveInfluence) : null,
+  }))
+
+  const openai = await getOpenAI()
+  if (!openai) {
+    throw new TRPCError({
+      code: 'PRECONDITION_FAILED',
+      message: 'OpenAI is not configured. Please set up your API key in Settings.',
+    })
+  }
+
+  const model = await getConfiguredModel(AI_MODELS.QUICK)
+  const prompt = buildCalibrationPrompt({
+    roundName: round.name,
+    cohortMean: round2(ctx.overallMean),
+    cohortStddev: round2(ctx.overallStddev),
+    totalEvaluations: points.length,
+    jurors: anonymizedJurors,
+  })
+
+  let parsed: unknown
+  let tokensUsed = 0
+  // Single usage logger so every exit after the AI call records the tokens spent.
+  const recordUsage = (status: 'SUCCESS' | 'ERROR', errorMessage?: string) =>
+    logAIUsage({
+      userId,
+      action: 'JUROR_CALIBRATION',
+      entityType: 'Round',
+      entityId: roundId,
+      model,
+      promptTokens: 0,
+      completionTokens: 0,
+      totalTokens: tokensUsed,
+      itemsProcessed: status === 'SUCCESS' ? internalJurors.length : 0,
+      status,
+      ...(errorMessage === undefined ? {} : { errorMessage }),
+    })
+
+  try {
+    const params = buildCompletionParams(model, {
+      messages: [{ role: 'user', content: prompt }],
+      jsonMode: true,
+      temperature: 0.2,
+      maxTokens: 2000,
+    })
+    const response = await openai.chat.completions.create(params)
+    tokensUsed = extractTokenUsage(response).totalTokens
+    const content = response.choices[0]?.message?.content
+    if (!content) throw new Error('Empty response from AI')
+
+    try {
+      parsed = JSON.parse(content)
+    } catch (parseError) {
+      const detail = parseError instanceof Error ? parseError.message : String(parseError)
+      logAIError('JurorCalibration', 'generate', createParseError(detail))
+      // Tokens were consumed even though the output is unusable, so log them.
+      await recordUsage('ERROR', 'Failed to parse AI response')
+      throw new TRPCError({
+        code: 'INTERNAL_SERVER_ERROR',
+        message: 'Failed to parse AI response. Please try again.',
+      })
+    }
+  } catch (error) {
+    if (error instanceof TRPCError) throw error
+    const classified = classifyAIError(error)
+    logAIError('JurorCalibration', 'generate', classified)
+    await recordUsage('ERROR', classified.message)
+    throw new TRPCError({
+      code: 'INTERNAL_SERVER_ERROR',
+      message: classified.message,
+    })
+  }
+
+  await recordUsage('SUCCESS')
+
+  // Keep only well-typed fields of the model output; anything else is ignored.
+  const ai = (typeof parsed === 'object' && parsed !== null ? parsed : {}) as
+    Partial<Record<keyof AIResponsePayload, unknown>>
+  const narrativeById = new Map<string, Record<string, unknown>>()
+  for (const entry of Array.isArray(ai.jurors) ? ai.jurors : []) {
+    if (typeof entry !== 'object' || entry === null) continue
+    const candidate = entry as Record<string, unknown>
+    if (typeof candidate.jurorId === 'string') narrativeById.set(candidate.jurorId, candidate)
+  }
+  const isSeverity = (v: unknown): v is JurorCalibrationEntry['severity'] =>
+    v === 'normal' || v === 'notable' || v === 'outlier'
+
+  // Merge the AI narrative back onto the internal stats by anonymized id.
+  const jurors: JurorCalibrationEntry[] = internalJurors.map((j, i) => {
+    const narrative = narrativeById.get(anonId(i))
+    const severity = narrative?.severity
+    const summary = narrative?.summary
+    return {
+      userId: j.userId,
+      name: j.name,
+      evaluationCount: j.evaluationCount,
+      rawMean: round2(j.rawMean),
+      stddev: round2(j.stddev),
+      deltaFromCohort: round2(j.deltaFromCohort),
+      effectiveInfluence: j.effectiveInfluence != null ? round2(j.effectiveInfluence) : null,
+      severity: isSeverity(severity) ? severity : classifySeverity(j, ctx.overallStddev),
+      summary: typeof summary === 'string' ? summary : 'No AI narrative available.',
+    }
+  })
+
+  return {
+    roundId: round.id,
+    roundName: round.name,
+    cohortMean: round2(ctx.overallMean),
+    cohortStddev: round2(ctx.overallStddev),
+    totalEvaluations: points.length,
+    totalJurors: internalJurors.length,
+    overallSummary: typeof ai.overallSummary === 'string' ? ai.overallSummary : '',
+    keyTakeaways: Array.isArray(ai.keyTakeaways)
+      ? ai.keyTakeaways.filter((t): t is string => typeof t === 'string')
+      : [],
+    jurors,
+    tokensUsed,
+    model,
+    generatedAt: new Date(),
+  }
+}
+
+// ─── Helpers ────────────────────────────────────────────────────────────────
+
+// Fallback severity when the AI omits a juror; mirrors the prompt's rules (fewer than 3 evals => 'normal').
+function classifySeverity(
+  juror: InternalJurorRecord,
+  cohortStddev: number,
+): 'normal' | 'notable' | 'outlier' {
+  if (cohortStddev === 0 || juror.evaluationCount < 3) return 'normal'
+  const zDelta = Math.abs(juror.deltaFromCohort) / cohortStddev // |delta| in cohort-stddev units
+  if (zDelta >= 1.5) return 'outlier'
+  return zDelta >= 0.75 ? 'notable' : 'normal'
+}
+
+function buildCalibrationPrompt(payload: { // builds the single user-message prompt; pure string templating
+ roundName: string // interpolated verbatim inside double quotes
+ cohortMean: number // the prompt states a 1-10 scale next to this value
+ cohortStddev: number
+ totalEvaluations: number
+ jurors: Array<{ // serialized into the prompt with JSON.stringify(..., null, 2)
+ jurorId: string // the prompt describes this data as anonymized ("Juror-N")
+ evaluationCount: number
+ rawMean: number
+ stddev: number
+ deltaFromCohort: number
+ effectiveInfluence: number | null
+ }>
+}): string {
+ return `You are analyzing juror grading patterns for a competition evaluation round. Your job is to EXPLAIN the statistical normalization that has already been applied; you are NOT introducing a new weighting scheme or prescribing changes.
+
+CONTEXT:
+- Round: "${payload.roundName}"
+- Cohort mean: ${payload.cohortMean} (scale 1-10)
+- Cohort stddev: ${payload.cohortStddev}
+- Total submitted evaluations: ${payload.totalEvaluations}
+
+HOW BALANCING WORKS:
+Each juror's scores are z-score normalized against their own mean and stddev, then rescaled back onto the 1-10 range. A juror who averages 2 points below the cohort won't drag projects down more than their peers; a lenient juror won't inflate projects. "effectiveInfluence" is roughly the juror's stddev divided by the cohort stddev — a value near 1.0 means they spread their scores similarly to the cohort; values well under 1 mean compressed scoring, well over 1 mean wide spread.
+
+JUROR DATA (anonymized, sorted by |deltaFromCohort| desc):
+${JSON.stringify(payload.jurors, null, 2)}
+
+Return a JSON object with this exact shape:
+{
+ "overallSummary": "2-3 sentences summarizing grading dispersion across the cohort — is the panel tightly aligned or widely divergent?",
+ "keyTakeaways": ["up to 4 bullets: notable patterns, risks, what to watch"],
+ "jurors": [
+ {
+ "jurorId": "Juror-N (matching the input)",
+ "severity": "normal" | "notable" | "outlier",
+ "summary": "One short sentence about this juror's grading pattern, referring to their rawMean, deltaFromCohort, and stddev. Example: 'Scored on average 2.1 points below cohort across 8 evaluations — consistently harsh, low internal variance.'"
+ }
+ ]
+}
+
+Guidelines:
+- "outlier" = delta from cohort >= 1.5 cohort-stddev in either direction
+- "notable" = delta from cohort 0.75-1.5 cohort-stddev
+- "normal" = delta from cohort < 0.75 cohort-stddev
+- A juror with very few evaluations (< 3) can't be classified confidently — note this in their summary and prefer "normal".
+- Be factual and specific. Reference the numbers. No speculation about intent.
+- Do not include juror names — only the anonymized jurorId.
+- Include every juror from the input in the jurors array. Order matches input.`
+}
diff --git a/src/server/services/juror-balance.ts b/src/server/services/juror-balance.ts
new file mode 100644
index 0000000..7c89977
--- /dev/null
+++ b/src/server/services/juror-balance.ts
@@ -0,0 +1,120 @@
+/**
+ * Juror balancing: z-score normalization to correct for per-juror grading harshness.
+ *
+ * A juror whose scores sit 1 standard deviation below the cohort shouldn't punish
+ * the projects they reviewed more than a juror who grades at the mean.
+ * We compute each juror's mean + (population) stddev across all their scores in the
+ * input set, z-normalize each score, average the z-scores per project, then rescale
+ * with the overall mean + stddev so the balanced number is directly comparable to
+ * the raw average. Jurors with zero spread are z-scored against the overall stats.
+ */
+
+export type ScorePoint = { // one juror's score for one project
+ projectId: string
+ userId: string // id of the juror who gave the score
+ rawScore: number // un-normalized score as supplied by the caller
+}
+
+export type BalancedProjectResult = { // per-project output of computeBalancedProjectScores
+ projectId: string
+ rawAverage: number | null // plain mean of the project's rawScores (always a number when produced there)
+ balancedAverage: number | null // juror-normalized mean; null when the overall stddev is 0
+ count: number // number of score points for the project
+}
+
+export type JurorBalance = { // one juror's grading statistics over the input points
+ userId: string
+ mean: number // mean of the juror's rawScores
+ stddev: number // population stddev of the juror's rawScores; 0 for a single score
+ count: number // how many scores the juror contributed
+}
+
+export type BalanceContext = { // output of computeBalanceContext
+  overallMean: number // mean of every rawScore; 0 for empty input
+  overallStddev: number // population stddev of every rawScore; 0 with fewer than two scores
+  jurorStats: Map<string, JurorBalance> // keyed by userId
+}
+
+/**
+ * Build per-juror and overall grading statistics from a flat list of
+ * (project, juror, score) points.
+ *
+ * Standard deviations are population values (sum of squared deviations divided by
+ * n). A juror with a single score gets stddev 0, and an empty input yields
+ * overallMean 0, overallStddev 0 and no juror entries.
+ *
+ * @param points One entry per scored evaluation.
+ * @returns Overall mean/stddev plus a Map of per-juror stats keyed by userId.
+ */
+export function computeBalanceContext(points: ScorePoint[]): BalanceContext {
+  // Group raw scores by juror, keeping jurors in first-seen order.
+  const jurorScores = new Map<string, number[]>()
+  for (const p of points) {
+    const arr = jurorScores.get(p.userId) ?? []
+    arr.push(p.rawScore)
+    jurorScores.set(p.userId, arr)
+  }
+
+  // Per-juror mean/stddev over everything that juror scored in the input.
+  const jurorStats = new Map<string, JurorBalance>()
+  for (const [userId, scores] of jurorScores.entries()) {
+    jurorStats.set(userId, { userId, ...populationStats(scores), count: scores.length })
+  }
+
+  // Cohort-wide stats over every score, regardless of juror or project.
+  const overall = populationStats(points.map((p) => p.rawScore))
+  return { overallMean: overall.mean, overallStddev: overall.stddev, jurorStats }
+}
+
+/** Mean and population stddev of a list; both 0 when empty, stddev 0 for one value. */
+function populationStats(values: number[]): { mean: number; stddev: number } {
+  if (values.length === 0) return { mean: 0, stddev: 0 }
+  const mean = values.reduce((a, b) => a + b, 0) / values.length
+  if (values.length === 1) return { mean, stddev: 0 }
+  const variance = values.reduce((s, v) => s + (v - mean) ** 2, 0) / values.length
+  return { mean, stddev: Math.sqrt(variance) }
+}
+
+/**
+ * Aggregate per-project raw + balanced averages from score points.
+ *
+ * Each score becomes a z-score against its juror's own mean/stddev; when the juror
+ * is missing from ctx or has stddev 0, the overall mean/stddev is used instead. The
+ * project's mean z-score is rescaled as overallMean + meanZ * overallStddev.
+ * balancedAverage is null for every project when overallStddev is 0, and the
+ * result is not clamped to the input score range.
+ */
+export function computeBalancedProjectScores(
+  points: ScorePoint[],
+  ctx: BalanceContext,
+): Map<string, BalancedProjectResult> {
+  const byProject = new Map<string, ScorePoint[]>()
+  for (const p of points) {
+    const arr = byProject.get(p.projectId) ?? []
+    arr.push(p)
+    byProject.set(p.projectId, arr)
+  }
+
+  const results = new Map<string, BalancedProjectResult>()
+  for (const [projectId, projectPoints] of byProject.entries()) {
+    const count = projectPoints.length
+    const rawAverage = projectPoints.reduce((a, b) => a + b.rawScore, 0) / count
+
+    let balancedAverage: number | null = null
+    if (ctx.overallStddev > 0) {
+      let zSum = 0
+      for (const pt of projectPoints) {
+        const stats = ctx.jurorStats.get(pt.userId)
+        // A juror with no spread cannot be self-normalized; use cohort stats.
+        zSum += stats && stats.stddev > 0
+          ? (pt.rawScore - stats.mean) / stats.stddev
+          : (pt.rawScore - ctx.overallMean) / ctx.overallStddev
+      }
+      balancedAverage = ctx.overallMean + (zSum / count) * ctx.overallStddev
+    }
+
+    results.set(projectId, { projectId, rawAverage, balancedAverage, count })
+  }
+
+  return results
+}
diff --git a/src/server/utils/ai-usage.ts b/src/server/utils/ai-usage.ts
index 8889d1d..4abcefb 100644
--- a/src/server/utils/ai-usage.ts
+++ b/src/server/utils/ai-usage.ts
@@ -21,6 +21,7 @@ export type AIAction =
| 'ROUTING'
| 'SHORTLIST'
| 'RANKING'
+ | 'JUROR_CALIBRATION'
export type AIStatus = 'SUCCESS' | 'PARTIAL' | 'ERROR'