From 982d5193c50adee0f3c22607e068e552ed9097d3 Mon Sep 17 00:00:00 2001
From: Matt
Date: Fri, 24 Apr 2026 16:19:00 +0200
Subject: [PATCH] feat: surface juror-balanced scores and AI calibration advisory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a shared juror-balancing utility (z-score normalization per juror,
rescaled back onto the raw 1-10 scale) and wires it into:

- Admin reports page: Top-10 project table now shows "Raw Avg" and
  "Balanced" columns side by side, and the summary stats row shows a
  balanced-average tile. Sort defaults to balanced so harsh and lenient
  graders no longer skew the ranking.
- Ranking dashboard: each project row shows a green/amber balanced-score
  chip next to the raw average when the two differ by ≥0.05, making it
  obvious when juror calibration moved a project's effective ranking.

Also adds AI Juror Calibration Advisory — a mutation that takes anonymized
per-juror stats, calls OpenAI, and produces a plain-language explanation
of the cohort's grading patterns plus per-juror severity (normal / notable
/ outlier) with a one-sentence narrative. The advisory describes the
statistical balance that already runs; it does not introduce a new
weighting layer. Rendered as a panel in the Juror Consistency tab when a
specific round is selected.

Co-Authored-By: Claude Opus 4.7 (1M context) --- src/app/(admin)/admin/reports/page.tsx | 192 +++++++++- .../admin/round/ranking-dashboard.tsx | 27 +- src/server/routers/analytics.ts | 118 +++--- src/server/routers/ranking.ts | 26 +- src/server/services/ai-juror-calibration.ts | 355 ++++++++++++++++++ src/server/services/juror-balance.ts | 120 ++++++ src/server/utils/ai-usage.ts | 1 + 7 files changed, 774 insertions(+), 65 deletions(-) create mode 100644 src/server/services/ai-juror-calibration.ts create mode 100644 src/server/services/juror-balance.ts diff --git a/src/app/(admin)/admin/reports/page.tsx b/src/app/(admin)/admin/reports/page.tsx index ab5b58a..cf22a20 100644 --- a/src/app/(admin)/admin/reports/page.tsx +++ b/src/app/(admin)/admin/reports/page.tsx @@ -45,7 +45,11 @@ import { Trophy, ArrowRight, Hash, + Sparkles, + Loader2, + AlertTriangle, } from 'lucide-react' +import { toast } from 'sonner' import { formatDateOnly } from '@/lib/utils' import { ScoreDistributionChart, @@ -271,6 +275,12 @@ function ReportsOverview() { const evaluated = projectRankings.filter(p => p.averageScore !== null) const scores = evaluated.map(p => p.averageScore as number) const avgScore = scores.length ? scores.reduce((a, b) => a + b, 0) / scores.length : 0 + const balancedScores = projectRankings + .map(p => p.balancedScore) + .filter((s): s is number => s != null) + const avgBalanced = balancedScores.length + ? balancedScores.reduce((a, b) => a + b, 0) / balancedScores.length + : null const minScore = scores.length ? Math.min(...scores) : 0 const maxScore = scores.length ? Math.max(...scores) : 0 const evalPercent = projectRankings.length ? Math.round((evaluated.length / projectRankings.length) * 100) : 0 @@ -281,14 +291,28 @@ function ReportsOverview() { return ( <> -
+

Total Projects

{projectRankings.length}

-
-

Avg Score

-

{avgScore ? avgScore.toFixed(1) : '-'}

+
+

Raw Avg

+

+ {avgScore ? avgScore.toFixed(1) : '-'} +

+
+
+

Balanced Avg

+

+ {avgBalanced == null ? '-' : avgBalanced.toFixed(1)} +

Evaluated

@@ -319,7 +343,7 @@ function ReportsOverview() { {/* Top 10 ranked table */}

- Top 10 by Average Score + Top 10 by Balanced Score

@@ -328,7 +352,18 @@ function ReportsOverview() { #ProjectTeam - Avg + + Raw Avg + + + Balanced + EvalsStatus @@ -345,9 +380,12 @@ function ReportsOverview() { {p.teamName || '-'} - + {p.averageScore === null ? '-' : p.averageScore.toFixed(2)} + + {p.balancedScore == null ? '-' : p.balancedScore.toFixed(2)} + {p.evaluationCount} {formatStatusLabel(p.status)} @@ -870,10 +908,150 @@ function JurorConsistencyTab() { }} /> )} + + {queryInput.roundId && ( + + )} ) } +function JurorCalibrationPanel({ roundId }: { roundId: string }) { + const mutation = trpc.analytics.generateJurorCalibration.useMutation({ + onError: (err) => toast.error(`Calibration analysis failed: ${err.message}`), + }) + const result = mutation.data + + const severityStyle: Record = { + outlier: 'bg-red-50 text-red-700 border-red-200', + notable: 'bg-amber-50 text-amber-700 border-amber-200', + normal: 'bg-muted text-muted-foreground', + } + + return ( + + +
+
+ + + AI Juror Calibration Advisory + + + Plain-language explanation of the per-juror score balancing already applied to rankings. + Describes, does not prescribe — the math runs regardless. + +
+ +
+
+ + {!result && !mutation.isPending && ( +

+ Run the analysis to see per-juror grading patterns, cohort stats, and the calibration + narrative for the selected round. +

+ )} + + {result && ( + <> +
+
+

Cohort Mean

+

{result.cohortMean.toFixed(2)}

+
+
+

Cohort Stddev

+

{result.cohortStddev.toFixed(2)}

+
+
+

Evaluations

+

{result.totalEvaluations}

+
+
+

Jurors

+

{result.totalJurors}

+
+
+ +
+

{result.overallSummary}

+ {result.keyTakeaways.length > 0 && ( +
    + {result.keyTakeaways.map((t, i) => ( +
  • + + {t} +
  • + ))} +
+ )} +
+ +
+
+ + + Juror + Evals + Mean + Δ Cohort + + Influence + + Severity + Notes + + + + {result.jurors.map((j) => ( + + {j.name} + {j.evaluationCount} + {j.rawMean.toFixed(2)} + 0.5 ? 'text-emerald-600' : '' + }`} + > + {j.deltaFromCohort > 0 ? '+' : ''} + {j.deltaFromCohort.toFixed(2)} + + + {j.effectiveInfluence == null ? '-' : j.effectiveInfluence.toFixed(2)} + + + + {j.severity === 'outlier' && } + {j.severity} + + + + {j.summary} + + + ))} + +
+
+ +

+ Generated {result.generatedAt.toLocaleString()} · {result.tokensUsed} tokens · model {result.model} +

+ + )} + + + ) +} + function DiversityTab() { const [selectedValue, setSelectedValue] = useState(null) diff --git a/src/components/admin/round/ranking-dashboard.tsx b/src/components/admin/round/ranking-dashboard.tsx index 32158ee..b0429d2 100644 --- a/src/components/admin/round/ranking-dashboard.tsx +++ b/src/components/admin/round/ranking-dashboard.tsx @@ -82,6 +82,7 @@ type SortableProjectRowProps = { entry: (RankedProjectEntry & { originalIndex?: number }) | undefined projectInfo: ProjectInfo | undefined jurorScores: JurorScore[] | undefined + balancedScore: number | null onSelect: () => void isSelected: boolean originalRank: number | undefined // from snapshotOrder — always in sync with localOrder @@ -95,6 +96,7 @@ function SortableProjectRow({ entry, projectInfo, jurorScores, + balancedScore, onSelect, isSelected, originalRank, @@ -199,11 +201,25 @@ function SortableProjectRow({ ) : null} - {/* Average score */} + {/* Raw + balanced averages shown side by side */} {entry?.avgGlobalScore !== null && entry?.avgGlobalScore !== undefined && jurorScores && jurorScores.length > 1 && ( - - = {entry.avgGlobalScore.toFixed(1)} - +
+ + {entry.avgGlobalScore.toFixed(1)} + + {balancedScore != null && Math.abs(balancedScore - entry.avgGlobalScore) >= 0.05 && ( + entry.avgGlobalScore + ? 'bg-emerald-50 text-emerald-700 border-emerald-200' + : 'bg-amber-50 text-amber-700 border-amber-200', + )} + > + ⇢ {balancedScore.toFixed(1)} + + )} +
)} {/* Advance decision indicator */} @@ -909,7 +925,8 @@ export function RankingDashboard({ competitionId: _competitionId, roundId }: Ran currentRank={index + 1} entry={rankingMap.get(projectId)} projectInfo={projectInfoMap.get(projectId)} - jurorScores={evalScores?.[projectId]} + jurorScores={evalScores?.byProject[projectId]} + balancedScore={evalScores?.balanced[projectId]?.balancedAverage ?? null} onSelect={() => setSelectedProjectId(projectId)} isSelected={selectedProjectId === projectId} originalRank={hasReorders ? snapshotOrder[projectId] : undefined} diff --git a/src/server/routers/analytics.ts b/src/server/routers/analytics.ts index 951573e..46fd950 100644 --- a/src/server/routers/analytics.ts +++ b/src/server/routers/analytics.ts @@ -1,11 +1,13 @@ import { z } from 'zod' -import { router, observerProcedure } from '../trpc' +import { router, observerProcedure, adminProcedure } from '../trpc' import { normalizeCountryToCode } from '@/lib/countries' import { getUserAvatarUrl } from '../utils/avatar-url' import { getProjectLogoUrl } from '../utils/project-logo-url' import { aggregateVotes } from '../services/deliberation' import { validateRoundConfig } from '@/types/competition-configs' import type { LiveFinalConfig } from '@/types/competition-configs' +import { computeBalanceContext, computeBalancedProjectScores, type ScorePoint } from '../services/juror-balance' +import { generateJurorCalibration } from '../services/ai-juror-calibration' const editionOrRoundInput = z.object({ roundId: z.string().optional(), @@ -185,73 +187,70 @@ export const analyticsRouter = router({ }), /** - * Get project rankings with average scores + * Get project rankings with raw and balanced (juror-normalized) average scores. + * + * `averageScore` is the raw mean of per-evaluation criterion averages. 
+ * `balancedScore` rescales each juror's contributions via z-score (relative + * to their own mean + stddev across projects they reviewed in this round), + * then maps back onto the same 1-10 scale using the overall mean + stddev. + * A harsh juror's scores are pulled up, a lenient juror's pulled down, so + * rankings aren't skewed by a single outlier grader. */ getProjectRankings: observerProcedure .input(editionOrRoundInput.and(z.object({ limit: z.number().optional() }))) .query(async ({ ctx, input }) => { - const projects = await ctx.prisma.project.findMany({ - where: projectWhere(input), - select: { - id: true, - title: true, - teamName: true, - status: true, - assignments: { - where: assignmentWhere(input), - select: { - evaluation: { - select: { criterionScoresJson: true, status: true }, - }, - }, + const [projects, evaluations] = await Promise.all([ + ctx.prisma.project.findMany({ + where: projectWhere(input), + select: { + id: true, + title: true, + teamName: true, + status: true, }, - }, - }) + }), + ctx.prisma.evaluation.findMany({ + where: evalWhere(input, { status: 'SUBMITTED' }), + select: { + criterionScoresJson: true, + assignment: { select: { userId: true, projectId: true } }, + }, + }), + ]) + + // Extract a single eval-level score (mean of numeric criterion scores) per evaluation. 
+ const points: ScorePoint[] = [] + for (const e of evaluations) { + const scores = e.criterionScoresJson as Record | null + if (!scores) continue + const vals = Object.values(scores).filter((s): s is number => typeof s === 'number') + if (vals.length === 0) continue + const rawScore = vals.reduce((a, b) => a + b, 0) / vals.length + points.push({ projectId: e.assignment.projectId, userId: e.assignment.userId, rawScore }) + } + + const balanceCtx = computeBalanceContext(points) + const balancedByProject = computeBalancedProjectScores(points, balanceCtx) - // Calculate average scores const rankings = projects .map((project) => { - const allScores: number[] = [] - - project.assignments.forEach((assignment) => { - const evaluation = assignment.evaluation - if (evaluation?.status === 'SUBMITTED') { - const scores = evaluation.criterionScoresJson as Record< - string, - number - > | null - if (scores) { - const scoreValues = Object.values(scores).filter( - (s): s is number => typeof s === 'number' - ) - if (scoreValues.length > 0) { - const average = - scoreValues.reduce((a, b) => a + b, 0) / scoreValues.length - allScores.push(average) - } - } - } - }) - - const averageScore = - allScores.length > 0 - ? allScores.reduce((a, b) => a + b, 0) / allScores.length - : null - + const result = balancedByProject.get(project.id) return { id: project.id, title: project.title, teamName: project.teamName, status: project.status, - averageScore, - evaluationCount: allScores.length, + averageScore: result?.rawAverage ?? null, + balancedScore: result?.balancedAverage ?? null, + evaluationCount: result?.count ?? 0, } }) .sort((a, b) => { - // Evaluated projects first (sorted by score desc), unevaluated at bottom - if (a.averageScore !== null && b.averageScore !== null) return b.averageScore - a.averageScore - if (a.averageScore !== null) return -1 - if (b.averageScore !== null) return 1 + const aScore = a.balancedScore ?? a.averageScore + const bScore = b.balancedScore ?? 
b.averageScore + if (aScore !== null && bScore !== null) return bScore - aScore + if (aScore !== null) return -1 + if (bScore !== null) return 1 return 0 }) @@ -2345,4 +2344,19 @@ export const analyticsRouter = router({ standings, } }), + + /** + * AI-powered juror calibration analysis for an evaluation round. + * Produces a plain-language explanation of the per-juror z-score balance + * already applied in ranking — describes, does not prescribe. + */ + generateJurorCalibration: adminProcedure + .input(z.object({ roundId: z.string() })) + .mutation(async ({ ctx, input }) => { + return generateJurorCalibration({ + roundId: input.roundId, + userId: ctx.user.id, + prisma: ctx.prisma, + }) + }), }) diff --git a/src/server/routers/ranking.ts b/src/server/routers/ranking.ts index 0a17e81..2e1e5dd 100644 --- a/src/server/routers/ranking.ts +++ b/src/server/routers/ranking.ts @@ -12,6 +12,7 @@ import { } from '../services/ai-ranking' import { logAudit } from '../utils/audit' import type { EvaluationConfig } from '@/types/competition-configs' +import { computeBalanceContext, computeBalancedProjectScores, type ScorePoint } from '../services/juror-balance' // ─── Local Types ─────────────────────────────────────────────────────────────── @@ -471,6 +472,7 @@ export const rankingRouter = router({ evaluation: { status: 'SUBMITTED' }, }, select: { + userId: true, projectId: true, user: { select: { name: true, email: true } }, evaluation: { @@ -489,6 +491,8 @@ export const rankingRouter = router({ decision: boolean | null }>> = {} + const balancePoints: ScorePoint[] = [] + for (const a of assignments) { if (!a.evaluation) continue const list = byProject[a.projectId] ?? 
[] @@ -511,8 +515,28 @@ export const rankingRouter = router({ decision, }) byProject[a.projectId] = list + + if (a.evaluation.globalScore != null) { + balancePoints.push({ + projectId: a.projectId, + userId: a.userId, + rawScore: a.evaluation.globalScore, + }) + } } - return byProject + const balanceCtx = computeBalanceContext(balancePoints) + const balancedByProject = computeBalancedProjectScores(balancePoints, balanceCtx) + + // Per-project balanced average on the 1-10 scale, comparable to raw avgs. + const balanced: Record = {} + for (const [projectId, result] of balancedByProject.entries()) { + balanced[projectId] = { + rawAverage: result.rawAverage, + balancedAverage: result.balancedAverage, + } + } + + return { byProject, balanced } }), }) diff --git a/src/server/services/ai-juror-calibration.ts b/src/server/services/ai-juror-calibration.ts new file mode 100644 index 0000000..2e64cec --- /dev/null +++ b/src/server/services/ai-juror-calibration.ts @@ -0,0 +1,355 @@ +/** + * AI-Powered Juror Calibration Advisory + * + * Analyzes per-juror grading statistics for an evaluation round and + * produces a human-readable explanation of how each juror's scores compare + * to the cohort. Describes the z-score balance that's already applied in + * ranking; does NOT introduce a new weighting layer — only explains the + * existing math in plain language so admins can justify results to jurors. + * + * GDPR: Juror identifiers are replaced with Juror-1, Juror-2, ... before any + * call to OpenAI. No names or emails leave the server. 
+ */ + +import { TRPCError } from '@trpc/server' +import { getOpenAI, getConfiguredModel, buildCompletionParams, AI_MODELS } from '@/lib/openai' +import { logAIUsage, extractTokenUsage } from '@/server/utils/ai-usage' +import { classifyAIError, createParseError, logAIError } from './ai-errors' +import type { PrismaClient } from '@prisma/client' +import { computeBalanceContext, type ScorePoint } from './juror-balance' + +// ─── Types ────────────────────────────────────────────────────────────────── + +export type JurorCalibrationEntry = { + userId: string + name: string + evaluationCount: number + rawMean: number + stddev: number + deltaFromCohort: number + effectiveInfluence: number | null + severity: 'normal' | 'notable' | 'outlier' + summary: string +} + +export type JurorCalibrationResult = { + roundId: string + roundName: string + cohortMean: number + cohortStddev: number + totalEvaluations: number + totalJurors: number + overallSummary: string + keyTakeaways: string[] + jurors: JurorCalibrationEntry[] + tokensUsed: number + model: string + generatedAt: Date +} + +type AIResponsePayload = { + overallSummary: string + keyTakeaways: string[] + jurors: Array<{ + jurorId: string + severity: 'normal' | 'notable' | 'outlier' + summary: string + }> +} + +type InternalJurorRecord = { + userId: string + name: string + evaluationCount: number + rawMean: number + stddev: number + deltaFromCohort: number + effectiveInfluence: number | null +} + +// ─── Main Orchestrator ────────────────────────────────────────────────────── + +export async function generateJurorCalibration({ + roundId, + userId, + prisma, +}: { + roundId: string + userId: string + prisma: PrismaClient +}): Promise { + const round = await prisma.round.findUnique({ + where: { id: roundId }, + select: { id: true, name: true, roundType: true }, + }) + + if (!round) { + throw new TRPCError({ code: 'NOT_FOUND', message: 'Round not found' }) + } + + const evaluations = await prisma.evaluation.findMany({ + where: 
{ + status: 'SUBMITTED', + assignment: { roundId }, + }, + select: { + globalScore: true, + criterionScoresJson: true, + assignment: { + select: { + userId: true, + projectId: true, + user: { select: { id: true, name: true, email: true } }, + }, + }, + }, + }) + + // Build (project, juror, score) points using each eval's mean criterion score, + // matching how the reports page reports raw + balanced averages per project. + const points: ScorePoint[] = [] + const nameByUserId = new Map() + + for (const e of evaluations) { + const scores = e.criterionScoresJson as Record | null + if (!scores) continue + const vals = Object.values(scores).filter((s): s is number => typeof s === 'number') + if (vals.length === 0) continue + const rawScore = vals.reduce((a, b) => a + b, 0) / vals.length + points.push({ + projectId: e.assignment.projectId, + userId: e.assignment.userId, + rawScore, + }) + nameByUserId.set( + e.assignment.userId, + e.assignment.user.name ?? e.assignment.user.email ?? 'Unknown', + ) + } + + if (points.length === 0) { + throw new TRPCError({ + code: 'BAD_REQUEST', + message: 'No submitted evaluations with numeric scores in this round', + }) + } + + const ctx = computeBalanceContext(points) + + // Build internal juror stats and a per-juror effective influence ratio + // (scale of the juror's raw stddev relative to the cohort stddev). + const internalJurors: InternalJurorRecord[] = [] + for (const [uid, stats] of ctx.jurorStats.entries()) { + const effectiveInfluence = ctx.overallStddev > 0 && stats.stddev > 0 + ? Math.min(2, stats.stddev / ctx.overallStddev) + : null + internalJurors.push({ + userId: uid, + name: nameByUserId.get(uid) ?? 'Unknown', + evaluationCount: stats.count, + rawMean: stats.mean, + stddev: stats.stddev, + deltaFromCohort: stats.mean - ctx.overallMean, + effectiveInfluence, + }) + } + + // Sort by absolute delta from cohort desc so largest outliers land first in prompts + UI. 
+ internalJurors.sort((a, b) => Math.abs(b.deltaFromCohort) - Math.abs(a.deltaFromCohort)) + + // Build anonymized payload for the AI call. + const anonymizedMap = new Map() + const anonymizedJurors = internalJurors.map((j, i) => { + const id = `Juror-${i + 1}` + anonymizedMap.set(j.userId, id) + return { + jurorId: id, + evaluationCount: j.evaluationCount, + rawMean: Math.round(j.rawMean * 100) / 100, + stddev: Math.round(j.stddev * 100) / 100, + deltaFromCohort: Math.round(j.deltaFromCohort * 100) / 100, + effectiveInfluence: j.effectiveInfluence != null + ? Math.round(j.effectiveInfluence * 100) / 100 + : null, + } + }) + + const openai = await getOpenAI() + if (!openai) { + throw new TRPCError({ + code: 'PRECONDITION_FAILED', + message: 'OpenAI is not configured. Please set up your API key in Settings.', + }) + } + + const model = await getConfiguredModel(AI_MODELS.QUICK) + + const prompt = buildCalibrationPrompt({ + roundName: round.name, + cohortMean: Math.round(ctx.overallMean * 100) / 100, + cohortStddev: Math.round(ctx.overallStddev * 100) / 100, + totalEvaluations: points.length, + jurors: anonymizedJurors, + }) + + let aiResponse: AIResponsePayload + let tokensUsed = 0 + + try { + const params = buildCompletionParams(model, { + messages: [{ role: 'user', content: prompt }], + jsonMode: true, + temperature: 0.2, + maxTokens: 2000, + }) + + const response = await openai.chat.completions.create(params) + tokensUsed = extractTokenUsage(response).totalTokens + const content = response.choices[0]?.message?.content + if (!content) throw new Error('Empty response from AI') + + try { + aiResponse = JSON.parse(content) as AIResponsePayload + } catch (parseError) { + const err = createParseError((parseError as Error).message) + logAIError('JurorCalibration', 'generate', err) + throw new TRPCError({ + code: 'INTERNAL_SERVER_ERROR', + message: 'Failed to parse AI response. 
Please try again.', + }) + } + } catch (error) { + if (error instanceof TRPCError) throw error + const classified = classifyAIError(error) + logAIError('JurorCalibration', 'generate', classified) + await logAIUsage({ + userId, + action: 'JUROR_CALIBRATION', + entityType: 'Round', + entityId: roundId, + model, + promptTokens: 0, + completionTokens: 0, + totalTokens: tokensUsed, + itemsProcessed: 0, + status: 'ERROR', + errorMessage: classified.message, + }) + throw new TRPCError({ + code: 'INTERNAL_SERVER_ERROR', + message: classified.message, + }) + } + + await logAIUsage({ + userId, + action: 'JUROR_CALIBRATION', + entityType: 'Round', + entityId: roundId, + model, + promptTokens: 0, + completionTokens: 0, + totalTokens: tokensUsed, + itemsProcessed: internalJurors.length, + status: 'SUCCESS', + }) + + // Merge AI narrative back with internal stats by anonymized id. + const narrativeById = new Map(aiResponse.jurors.map((j) => [j.jurorId, j])) + const jurors: JurorCalibrationEntry[] = internalJurors.map((j) => { + const anonId = anonymizedMap.get(j.userId)! + const narrative = narrativeById.get(anonId) + return { + userId: j.userId, + name: j.name, + evaluationCount: j.evaluationCount, + rawMean: Math.round(j.rawMean * 100) / 100, + stddev: Math.round(j.stddev * 100) / 100, + deltaFromCohort: Math.round(j.deltaFromCohort * 100) / 100, + effectiveInfluence: j.effectiveInfluence != null + ? Math.round(j.effectiveInfluence * 100) / 100 + : null, + severity: narrative?.severity ?? classifySeverity(j, ctx.overallStddev), + summary: narrative?.summary ?? 
'No AI narrative available.', + } + }) + + return { + roundId: round.id, + roundName: round.name, + cohortMean: Math.round(ctx.overallMean * 100) / 100, + cohortStddev: Math.round(ctx.overallStddev * 100) / 100, + totalEvaluations: points.length, + totalJurors: internalJurors.length, + overallSummary: aiResponse.overallSummary, + keyTakeaways: aiResponse.keyTakeaways, + jurors, + tokensUsed, + model, + generatedAt: new Date(), + } +} + +// ─── Helpers ──────────────────────────────────────────────────────────────── + +function classifySeverity( + juror: InternalJurorRecord, + cohortStddev: number, +): 'normal' | 'notable' | 'outlier' { + if (cohortStddev === 0) return 'normal' + const zDelta = Math.abs(juror.deltaFromCohort) / cohortStddev + if (zDelta >= 1.5) return 'outlier' + if (zDelta >= 0.75) return 'notable' + return 'normal' +} + +function buildCalibrationPrompt(payload: { + roundName: string + cohortMean: number + cohortStddev: number + totalEvaluations: number + jurors: Array<{ + jurorId: string + evaluationCount: number + rawMean: number + stddev: number + deltaFromCohort: number + effectiveInfluence: number | null + }> +}): string { + return `You are analyzing juror grading patterns for a competition evaluation round. Your job is to EXPLAIN the statistical normalization that has already been applied; you are NOT introducing a new weighting scheme or prescribing changes. + +CONTEXT: +- Round: "${payload.roundName}" +- Cohort mean: ${payload.cohortMean} (scale 1-10) +- Cohort stddev: ${payload.cohortStddev} +- Total submitted evaluations: ${payload.totalEvaluations} + +HOW BALANCING WORKS: +Each juror's scores are z-score normalized against their own mean and stddev, then rescaled back onto the 1-10 range. A juror who averages 2 points below the cohort won't drag projects down more than their peers; a lenient juror won't inflate projects. 
"effectiveInfluence" is roughly the juror's stddev divided by the cohort stddev — a value near 1.0 means they spread their scores similarly to the cohort; values well under 1 mean compressed scoring, well over 1 mean wide spread. + +JUROR DATA (anonymized, sorted by |deltaFromCohort| desc): +${JSON.stringify(payload.jurors, null, 2)} + +Return a JSON object with this exact shape: +{ + "overallSummary": "2-3 sentences summarizing grading dispersion across the cohort — is the panel tightly aligned or widely divergent?", + "keyTakeaways": ["up to 4 bullets: notable patterns, risks, what to watch"], + "jurors": [ + { + "jurorId": "Juror-N (matching the input)", + "severity": "normal" | "notable" | "outlier", + "summary": "One short sentence about this juror's grading pattern, referring to their rawMean, deltaFromCohort, and stddev. Example: 'Scored on average 2.1 points below cohort across 8 evaluations — consistently harsh, low internal variance.'" + } + ] +} + +Guidelines: +- "outlier" = delta from cohort >= 1.5 cohort-stddev in either direction +- "notable" = delta from cohort 0.75-1.5 cohort-stddev +- "normal" = delta from cohort < 0.75 cohort-stddev +- A juror with very few evaluations (< 3) can't be classified confidently — note this in their summary and prefer "normal". +- Be factual and specific. Reference the numbers. No speculation about intent. +- Do not include juror names — only the anonymized jurorId. +- Include every juror from the input in the jurors array. Order matches input.` +} diff --git a/src/server/services/juror-balance.ts b/src/server/services/juror-balance.ts new file mode 100644 index 0000000..7c89977 --- /dev/null +++ b/src/server/services/juror-balance.ts @@ -0,0 +1,120 @@ +/** + * Juror balancing: z-score normalization to correct for per-juror grading harshness. + * + * A juror who grades 1 standard deviation below their peers on shared projects + * shouldn't punish those projects more than a juror who grades at the mean. 
/**
 * Juror balancing — how the numbers are computed.
 *
 * For every juror we take the mean and the population standard deviation of
 * all scores that juror contributed to the supplied point set, convert each
 * score into a z-score against the juror's own statistics, average those
 * z-scores per project, and map the average back onto the raw-score scale
 * using the mean and stddev of the whole point set. The balanced number is
 * therefore directly comparable to the raw average.
 *
 * Caveats worth knowing when reading the output:
 * - Juror statistics cover every score the juror gave in the point set, not
 *   only projects they share with other jurors.
 * - The balanced value is not clamped, so it can land outside the range of
 *   the raw scores.
 * - Non-finite scores (NaN, ±Infinity) are ignored everywhere.
 */

export type ScorePoint = {
  projectId: string
  userId: string
  rawScore: number
}

export type BalancedProjectResult = {
  projectId: string
  rawAverage: number | null
  balancedAverage: number | null
  count: number
}

export type JurorBalance = {
  userId: string
  mean: number
  stddev: number
  count: number
}

export type BalanceContext = {
  overallMean: number
  overallStddev: number
  jurorStats: Map<string, JurorBalance>
}

/**
 * Mean and population standard deviation of `values`.
 *
 * Returns `{ mean: 0, stddev: 0 }` for an empty list. When every value is
 * identical the result is exactly `{ mean: value, stddev: 0 }`: computing the
 * mean of e.g. three equal decimals can round to a neighbouring double, which
 * would otherwise yield a ~1e-16 "stddev" and z-scores of ±1 out of pure noise.
 */
function meanAndStddev(values: readonly number[]): { mean: number; stddev: number } {
  if (values.length === 0) return { mean: 0, stddev: 0 }

  let sum = 0
  let min = Number.POSITIVE_INFINITY
  let max = Number.NEGATIVE_INFINITY
  for (const v of values) {
    sum += v
    if (v < min) min = v
    if (v > max) max = v
  }
  // All values equal (also covers the single-value case): no spread at all.
  if (min === max) return { mean: min, stddev: 0 }

  const mean = sum / values.length
  const variance = values.reduce((acc, v) => acc + (v - mean) ** 2, 0) / values.length
  return { mean, stddev: Math.sqrt(variance) }
}

/**
 * Build per-juror and overall grading statistics from a flat list of
 * (project, juror, score) points.
 *
 * @param points Score points; entries whose `rawScore` is not a finite number
 *   are skipped so one bad value cannot turn every statistic into NaN.
 * @returns The overall mean/stddev of the accepted scores (both 0 when there
 *   are none) and a map from juror id to that juror's mean, population stddev
 *   and number of accepted scores.
 */
export function computeBalanceContext(points: ScorePoint[]): BalanceContext {
  const scoresByJuror = new Map<string, number[]>()
  const allScores: number[] = []

  for (const p of points) {
    if (!Number.isFinite(p.rawScore)) continue
    allScores.push(p.rawScore)
    const existing = scoresByJuror.get(p.userId)
    if (existing) {
      existing.push(p.rawScore)
    } else {
      scoresByJuror.set(p.userId, [p.rawScore])
    }
  }

  const jurorStats = new Map<string, JurorBalance>()
  for (const [userId, scores] of scoresByJuror) {
    const { mean, stddev } = meanAndStddev(scores)
    jurorStats.set(userId, { userId, mean, stddev, count: scores.length })
  }

  const { mean: overallMean, stddev: overallStddev } = meanAndStddev(allScores)
  return { overallMean, overallStddev, jurorStats }
}

/**
 * Aggregate per-project raw + balanced averages from score points.
 *
 * @param points Score points to aggregate; non-finite `rawScore` entries are
 *   skipped, and a project left with no usable point is absent from the result.
 * @param ctx Statistics produced by {@link computeBalanceContext}.
 * @param options.minJurorEvaluations Minimum number of scores a juror must
 *   have before their own mean/stddev is used for normalization. Defaults to
 *   2, i.e. any juror with a non-zero stddev. With exactly two scores the
 *   population z-score is always ±1, so callers may want a higher threshold.
 *   Jurors below it (or with zero stddev, or missing from `ctx`) fall back to
 *   the cohort mean/stddev, which leaves their raw score unchanged.
 * @returns Map from project id to its raw average, balanced average and point
 *   count. `balancedAverage` is `null` when the cohort stddev is 0, because
 *   z-scores cannot be rescaled in that case.
 */
export function computeBalancedProjectScores(
  points: ScorePoint[],
  ctx: BalanceContext,
  options: { minJurorEvaluations?: number } = {},
): Map<string, BalancedProjectResult> {
  const minJurorEvaluations = options.minJurorEvaluations ?? 2

  const byProject = new Map<string, ScorePoint[]>()
  for (const p of points) {
    if (!Number.isFinite(p.rawScore)) continue
    const existing = byProject.get(p.projectId)
    if (existing) {
      existing.push(p)
    } else {
      byProject.set(p.projectId, [p])
    }
  }

  const results = new Map<string, BalancedProjectResult>()
  for (const [projectId, projectPoints] of byProject) {
    const count = projectPoints.length
    const rawAverage = projectPoints.reduce((acc, pt) => acc + pt.rawScore, 0) / count

    let balancedAverage: number | null = null
    if (ctx.overallStddev > 0) {
      let zSum = 0
      for (const pt of projectPoints) {
        const stats = ctx.jurorStats.get(pt.userId)
        if (stats !== undefined && stats.stddev > 0 && stats.count >= minJurorEvaluations) {
          zSum += (pt.rawScore - stats.mean) / stats.stddev
        } else {
          // No usable per-juror spread: normalize against the cohort instead.
          zSum += (pt.rawScore - ctx.overallMean) / ctx.overallStddev
        }
      }
      balancedAverage = ctx.overallMean + (zSum / count) * ctx.overallStddev
    }

    results.set(projectId, { projectId, rawAverage, balancedAverage, count })
  }

  return results
}

/*
 * Remaining hunk of the patch (src/server/utils/ai-usage.ts), carried over
 * verbatim from the original text:
 *
 * diff --git a/src/server/utils/ai-usage.ts b/src/server/utils/ai-usage.ts
 * index 8889d1d..4abcefb 100644
 * --- a/src/server/utils/ai-usage.ts
 * +++ b/src/server/utils/ai-usage.ts
 * @@ -21,6 +21,7 @@ export type AIAction =
 *    | 'ROUTING'
 *    | 'SHORTLIST'
 *    | 'RANKING'
 * +  | 'JUROR_CALIBRATION'
 *
 *  export type AIStatus = 'SUCCESS' | 'PARTIAL' | 'ERROR'
 */