feat: surface juror-balanced scores and AI calibration advisory
All checks were successful
Build and Push Docker Image / build (push) Successful in 7m27s
All checks were successful
Build and Push Docker Image / build (push) Successful in 7m27s
Adds a shared juror-balancing utility (z-score normalization per juror, rescaled back onto the raw 1-10 scale) and wires it into:

- Admin reports page: the Top-10 project table now shows "Raw Avg" and "Balanced" columns side by side, and the summary stats row shows a balanced-average tile. Sort defaults to balanced so harsh and lenient graders no longer skew the ranking.
- Ranking dashboard: each project row shows a green/amber balanced-score chip next to the raw average when the two differ by ≥0.05, making it obvious when juror calibration moved a project's effective ranking.

Also adds the AI Juror Calibration Advisory — a mutation that takes anonymized per-juror stats, calls OpenAI, and produces a plain-language explanation of the cohort's grading patterns plus a per-juror severity (normal / notable / outlier) with a one-sentence narrative. The advisory describes the statistical balance that already runs; it does not introduce a new weighting layer. It is rendered as a panel in the Juror Consistency tab when a specific round is selected.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
355
src/server/services/ai-juror-calibration.ts
Normal file
355
src/server/services/ai-juror-calibration.ts
Normal file
@@ -0,0 +1,355 @@
|
||||
/**
|
||||
* AI-Powered Juror Calibration Advisory
|
||||
*
|
||||
* Analyzes per-juror grading statistics for an evaluation round and
|
||||
* produces a human-readable explanation of how each juror's scores compare
|
||||
* to the cohort. Describes the z-score balance that's already applied in
|
||||
* ranking; does NOT introduce a new weighting layer — only explains the
|
||||
* existing math in plain language so admins can justify results to jurors.
|
||||
*
|
||||
* GDPR: Juror identifiers are replaced with Juror-1, Juror-2, ... before any
|
||||
* call to OpenAI. No names or emails leave the server.
|
||||
*/
|
||||
|
||||
import { TRPCError } from '@trpc/server'
|
||||
import { getOpenAI, getConfiguredModel, buildCompletionParams, AI_MODELS } from '@/lib/openai'
|
||||
import { logAIUsage, extractTokenUsage } from '@/server/utils/ai-usage'
|
||||
import { classifyAIError, createParseError, logAIError } from './ai-errors'
|
||||
import type { PrismaClient } from '@prisma/client'
|
||||
import { computeBalanceContext, type ScorePoint } from './juror-balance'
|
||||
|
||||
// ─── Types ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * One juror's row in the calibration advisory: descriptive statistics joined
 * with a severity label and a one-sentence narrative.
 */
export type JurorCalibrationEntry = {
  /** Id of the juror (the assignment's user id). */
  userId: string
  /** Display label for the juror: name, else email, else "Unknown". */
  name: string
  /** Number of this juror's evaluations that contributed a numeric score. */
  evaluationCount: number
  /** Mean of the juror's per-evaluation scores, rounded to two decimals. */
  rawMean: number
  /** Standard deviation of the juror's per-evaluation scores, rounded to two decimals. */
  stddev: number
  /** Juror mean minus cohort mean, rounded to two decimals (negative = harsher than cohort). */
  deltaFromCohort: number
  /**
   * Juror stddev divided by cohort stddev, capped at 2 and rounded to two
   * decimals; `null` when either stddev is zero.
   */
  effectiveInfluence: number | null
  /** Classification from the AI response, or the deterministic fallback when the AI gave none. */
  severity: 'normal' | 'notable' | 'outlier'
  /** One-sentence narrative about the juror's grading pattern. */
  summary: string
}
|
||||
|
||||
/**
 * Full calibration advisory for one round: cohort-level statistics, the AI
 * narrative, per-juror entries and bookkeeping about the AI call.
 */
export type JurorCalibrationResult = {
  /** Id of the analyzed round. */
  roundId: string
  /** Name of the analyzed round. */
  roundName: string
  /** Mean of all per-evaluation scores in the round, rounded to two decimals. */
  cohortMean: number
  /** Standard deviation of all per-evaluation scores in the round, rounded to two decimals. */
  cohortStddev: number
  /** Number of evaluations that contributed a numeric score. */
  totalEvaluations: number
  /** Number of distinct jurors among those evaluations. */
  totalJurors: number
  /** AI-written summary of grading dispersion across the cohort. */
  overallSummary: string
  /** AI-written bullet points (the prompt asks for up to four). */
  keyTakeaways: string[]
  /** Per-juror entries, ordered by absolute delta from the cohort mean, largest first. */
  jurors: JurorCalibrationEntry[]
  /** Total tokens reported for the AI call. */
  tokensUsed: number
  /** Model identifier used for the AI call. */
  model: string
  /** Timestamp taken when the result object was assembled. */
  generatedAt: Date
}
|
||||
|
||||
/**
 * JSON shape the calibration prompt asks the model to return. Jurors are
 * identified only by their anonymized `Juror-N` alias.
 */
type AIResponsePayload = {
  /** Cohort-level summary of grading dispersion. */
  overallSummary: string
  /** Short bullet points about patterns and risks. */
  keyTakeaways: string[]
  /** One narrative entry per anonymized juror. */
  jurors: Array<{
    /** Anonymized alias matching the id sent in the prompt. */
    jurorId: string
    /** Model-assigned classification. */
    severity: 'normal' | 'notable' | 'outlier'
    /** One-sentence description of the juror's grading pattern. */
    summary: string
  }>
}
|
||||
|
||||
/**
 * Server-side juror statistics (unrounded, still carrying the real identity)
 * that are kept out of the AI prompt and merged with the narrative afterwards.
 */
type InternalJurorRecord = {
  /** Id of the juror (the assignment's user id). */
  userId: string
  /** Display label for the juror: name, else email, else "Unknown". */
  name: string
  /** Number of this juror's evaluations that contributed a numeric score. */
  evaluationCount: number
  /** Mean of the juror's per-evaluation scores. */
  rawMean: number
  /** Standard deviation of the juror's per-evaluation scores. */
  stddev: number
  /** Juror mean minus cohort mean. */
  deltaFromCohort: number
  /** Juror stddev divided by cohort stddev, capped at 2; `null` when either stddev is zero. */
  effectiveInfluence: number | null
}
|
||||
|
||||
// ─── Main Orchestrator ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Generates the juror calibration advisory for one evaluation round.
 *
 * Flow: load the round and its SUBMITTED evaluations, reduce every evaluation
 * to the mean of its numeric criterion scores, compute per-juror and cohort
 * statistics, send the anonymized per-juror statistics to OpenAI, then merge
 * the returned narrative back onto the real juror records.
 *
 * Only `Juror-N` aliases, aggregate numbers and the round name are placed in
 * the prompt; juror names and emails are never part of it.
 *
 * The AI response is validated at runtime: JSON mode yields syntactically
 * valid JSON but does not guarantee the requested shape, so missing or
 * malformed fields fall back to safe defaults instead of crashing the merge.
 *
 * @param roundId - Round to analyze.
 * @param userId - User the AI usage rows are logged against.
 * @param prisma - Prisma client used for the round and evaluation queries.
 * @returns Cohort statistics, the AI summary/takeaways, and one entry per
 *   juror ordered by absolute delta from the cohort mean (largest first).
 * @throws TRPCError `NOT_FOUND` when the round does not exist.
 * @throws TRPCError `BAD_REQUEST` when no submitted evaluation has numeric criterion scores.
 * @throws TRPCError `PRECONDITION_FAILED` when OpenAI is not configured.
 * @throws TRPCError `INTERNAL_SERVER_ERROR` when the AI call fails or its
 *   response is not a JSON object.
 */
export async function generateJurorCalibration({
  roundId,
  userId,
  prisma,
}: {
  roundId: string
  userId: string
  prisma: PrismaClient
}): Promise<JurorCalibrationResult> {
  const round = await prisma.round.findUnique({
    where: { id: roundId },
    select: { id: true, name: true, roundType: true },
  })

  if (!round) {
    throw new TRPCError({ code: 'NOT_FOUND', message: 'Round not found' })
  }

  const evaluations = await prisma.evaluation.findMany({
    where: {
      status: 'SUBMITTED',
      assignment: { roundId },
    },
    select: {
      globalScore: true,
      criterionScoresJson: true,
      assignment: {
        select: {
          userId: true,
          projectId: true,
          user: { select: { id: true, name: true, email: true } },
        },
      },
    },
  })

  // Build (project, juror, score) points using each eval's mean criterion score,
  // matching how the reports page reports raw + balanced averages per project.
  const points: ScorePoint[] = []
  const nameByUserId = new Map<string, string>()

  for (const e of evaluations) {
    const scores = e.criterionScoresJson as Record<string, unknown> | null
    if (!scores) continue
    const vals = Object.values(scores).filter(
      (s): s is number => typeof s === 'number' && Number.isFinite(s),
    )
    if (vals.length === 0) continue
    const rawScore = vals.reduce((a, b) => a + b, 0) / vals.length
    points.push({
      projectId: e.assignment.projectId,
      userId: e.assignment.userId,
      rawScore,
    })
    nameByUserId.set(
      e.assignment.userId,
      e.assignment.user.name ?? e.assignment.user.email ?? 'Unknown',
    )
  }

  if (points.length === 0) {
    throw new TRPCError({
      code: 'BAD_REQUEST',
      message: 'No submitted evaluations with numeric scores in this round',
    })
  }

  const ctx = computeBalanceContext(points)

  // Build internal juror stats and a per-juror effective influence ratio
  // (scale of the juror's raw stddev relative to the cohort stddev, capped at 2).
  const internalJurors: InternalJurorRecord[] = []
  for (const [uid, stats] of ctx.jurorStats.entries()) {
    const effectiveInfluence = ctx.overallStddev > 0 && stats.stddev > 0
      ? Math.min(2, stats.stddev / ctx.overallStddev)
      : null
    internalJurors.push({
      userId: uid,
      name: nameByUserId.get(uid) ?? 'Unknown',
      evaluationCount: stats.count,
      rawMean: stats.mean,
      stddev: stats.stddev,
      deltaFromCohort: stats.mean - ctx.overallMean,
      effectiveInfluence,
    })
  }

  // Sort by absolute delta from cohort desc so largest outliers land first in prompts + UI.
  internalJurors.sort((a, b) => Math.abs(b.deltaFromCohort) - Math.abs(a.deltaFromCohort))

  // Pair every record with its alias once, so the later merge needs no lookup
  // that could miss (and therefore no non-null assertion).
  const aliased = internalJurors.map((record, index) => ({
    record,
    anonId: `Juror-${index + 1}`,
  }))

  // Anonymized payload for the AI call: alias + rounded statistics only.
  const anonymizedJurors = aliased.map(({ record, anonId }) => ({
    jurorId: anonId,
    evaluationCount: record.evaluationCount,
    rawMean: roundToHundredths(record.rawMean),
    stddev: roundToHundredths(record.stddev),
    deltaFromCohort: roundToHundredths(record.deltaFromCohort),
    effectiveInfluence: record.effectiveInfluence != null
      ? roundToHundredths(record.effectiveInfluence)
      : null,
  }))

  const openai = await getOpenAI()
  if (!openai) {
    throw new TRPCError({
      code: 'PRECONDITION_FAILED',
      message: 'OpenAI is not configured. Please set up your API key in Settings.',
    })
  }

  const model = await getConfiguredModel(AI_MODELS.QUICK)

  const prompt = buildCalibrationPrompt({
    roundName: round.name,
    cohortMean: roundToHundredths(ctx.overallMean),
    cohortStddev: roundToHundredths(ctx.overallStddev),
    totalEvaluations: points.length,
    jurors: anonymizedJurors,
  })

  let tokensUsed = 0

  // Writes one usage row for this call; shared by the success and failure paths.
  // NOTE(review): only the total token count is recorded; prompt/completion
  // counts are logged as 0, as before — confirm whether the split is available.
  const recordUsage = (
    status: 'SUCCESS' | 'ERROR',
    itemsProcessed: number,
    errorMessage?: string,
  ) =>
    logAIUsage({
      userId,
      action: 'JUROR_CALIBRATION',
      entityType: 'Round',
      entityId: roundId,
      model,
      promptTokens: 0,
      completionTokens: 0,
      totalTokens: tokensUsed,
      itemsProcessed,
      status,
      ...(errorMessage !== undefined ? { errorMessage } : {}),
    })

  let content: string

  try {
    const params = buildCompletionParams(model, {
      messages: [{ role: 'user', content: prompt }],
      jsonMode: true,
      temperature: 0.2,
      maxTokens: 2000,
    })

    const response = await openai.chat.completions.create(params)
    tokensUsed = extractTokenUsage(response).totalTokens
    const text = response.choices[0]?.message?.content
    if (!text) throw new Error('Empty response from AI')
    content = text
  } catch (error) {
    if (error instanceof TRPCError) throw error
    const classified = classifyAIError(error)
    logAIError('JurorCalibration', 'generate', classified)
    await recordUsage('ERROR', 0, classified.message)
    throw new TRPCError({
      code: 'INTERNAL_SERVER_ERROR',
      message: classified.message,
    })
  }

  const parsed = parseAdvisoryContent(content)
  if (!parsed.ok) {
    logAIError('JurorCalibration', 'generate', createParseError(parsed.reason))
    // Tokens were spent even though the answer is unusable, so account for them.
    await recordUsage('ERROR', 0, parsed.reason)
    throw new TRPCError({
      code: 'INTERNAL_SERVER_ERROR',
      message: 'Failed to parse AI response. Please try again.',
    })
  }
  const advisory = parsed.advisory

  await recordUsage('SUCCESS', internalJurors.length)

  // Merge AI narrative back with internal stats by anonymized id. Jurors the
  // model skipped (or labelled with an unknown severity) get the deterministic
  // classification and a placeholder summary.
  const jurors: JurorCalibrationEntry[] = aliased.map(({ record, anonId }) => {
    const narrative = advisory.narrativeById.get(anonId)
    return {
      userId: record.userId,
      name: record.name,
      evaluationCount: record.evaluationCount,
      rawMean: roundToHundredths(record.rawMean),
      stddev: roundToHundredths(record.stddev),
      deltaFromCohort: roundToHundredths(record.deltaFromCohort),
      effectiveInfluence: record.effectiveInfluence != null
        ? roundToHundredths(record.effectiveInfluence)
        : null,
      severity: narrative?.severity ?? classifySeverity(record, ctx.overallStddev),
      summary: narrative?.summary ?? 'No AI narrative available.',
    }
  })

  return {
    roundId: round.id,
    roundName: round.name,
    cohortMean: roundToHundredths(ctx.overallMean),
    cohortStddev: roundToHundredths(ctx.overallStddev),
    totalEvaluations: points.length,
    totalJurors: internalJurors.length,
    overallSummary: advisory.overallSummary,
    keyTakeaways: advisory.keyTakeaways,
    jurors,
    tokensUsed,
    model,
    generatedAt: new Date(),
  }
}

type JurorSeverity = AIResponsePayload['jurors'][number]['severity']

/** Validated per-juror narrative; `null` marks a field the model omitted or garbled. */
type JurorNarrative = { severity: JurorSeverity | null; summary: string | null }

/** Validated advisory: cohort-level text plus narratives keyed by anonymized juror id. */
type ParsedAdvisory = Pick<AIResponsePayload, 'overallSummary' | 'keyTakeaways'> & {
  narrativeById: Map<string, JurorNarrative>
}

const JUROR_SEVERITIES: readonly JurorSeverity[] = ['normal', 'notable', 'outlier']

/** Rounds a statistic to two decimals for display and for the prompt. */
function roundToHundredths(value: number): number {
  return Math.round(value * 100) / 100
}

/**
 * Parses the raw AI message and narrows it to the expected advisory shape.
 * Fails only when the content is not a JSON object; individual missing or
 * mistyped fields are replaced by defaults so one sloppy field cannot discard
 * the whole advisory.
 */
function parseAdvisoryContent(
  content: string,
): { ok: true; advisory: ParsedAdvisory } | { ok: false; reason: string } {
  let parsed: unknown
  try {
    parsed = JSON.parse(content)
  } catch (parseError) {
    return {
      ok: false,
      reason: parseError instanceof Error ? parseError.message : String(parseError),
    }
  }

  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    return { ok: false, reason: 'AI response is not a JSON object' }
  }

  const raw = parsed as Record<string, unknown>
  const rawSummary = raw['overallSummary']
  const rawTakeaways = raw['keyTakeaways']
  const rawJurors = raw['jurors']

  const narrativeById = new Map<string, JurorNarrative>()
  if (Array.isArray(rawJurors)) {
    for (const item of rawJurors as unknown[]) {
      if (typeof item !== 'object' || item === null) continue
      const entry = item as Record<string, unknown>
      const jurorId = entry['jurorId']
      if (typeof jurorId !== 'string') continue
      const rawEntrySummary = entry['summary']
      narrativeById.set(jurorId, {
        severity: JUROR_SEVERITIES.find((s) => s === entry['severity']) ?? null,
        summary: typeof rawEntrySummary === 'string' ? rawEntrySummary : null,
      })
    }
  }

  return {
    ok: true,
    advisory: {
      overallSummary: typeof rawSummary === 'string' ? rawSummary : 'No AI summary available.',
      keyTakeaways: Array.isArray(rawTakeaways)
        ? (rawTakeaways as unknown[]).filter((t): t is string => typeof t === 'string')
        : [],
      narrativeById,
    },
  }
}
|
||||
|
||||
// ─── Helpers ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Deterministic severity classification, used when the AI response carries no
 * classification for a juror.
 *
 * Mirrors the guidelines given to the model in the calibration prompt:
 * - fewer than 3 evaluations: too little data to classify, so "normal";
 * - |delta from cohort| >= 1.5 cohort stddev: "outlier";
 * - |delta from cohort| >= 0.75 cohort stddev: "notable";
 * - otherwise "normal".
 *
 * @param juror - Juror statistics; only `deltaFromCohort` and `evaluationCount` are read.
 * @param cohortStddev - Standard deviation of all scores in the round.
 * @returns The severity label; "normal" whenever the cohort has no usable spread.
 */
function classifySeverity(
  juror: InternalJurorRecord,
  cohortStddev: number,
): 'normal' | 'notable' | 'outlier' {
  // Same low-sample cutoff the prompt states ("very few evaluations (< 3)").
  const minEvaluationsToClassify = 3

  // Written as a negated comparison so zero, negative and NaN spreads all bail out.
  if (!(cohortStddev > 0)) return 'normal'
  if (juror.evaluationCount < minEvaluationsToClassify) return 'normal'

  const zDelta = Math.abs(juror.deltaFromCohort) / cohortStddev
  if (zDelta >= 1.5) return 'outlier'
  if (zDelta >= 0.75) return 'notable'
  return 'normal'
}
|
||||
|
||||
/**
 * Builds the single user message sent to the model for the calibration advisory.
 *
 * The prompt contains the round name, cohort statistics, an explanation of
 * how balancing works, the anonymized juror statistics as pretty-printed
 * JSON, the exact response shape, and classification guidelines.
 *
 * The round name is free text, so it is embedded as a JSON string literal:
 * quotes and line breaks inside it are escaped and cannot terminate the
 * quoted value or start new prompt lines. Names without special characters
 * render exactly as before.
 *
 * @param payload - Round name, cohort statistics and anonymized juror statistics.
 * @returns The prompt text.
 */
function buildCalibrationPrompt(payload: {
  roundName: string
  cohortMean: number
  cohortStddev: number
  totalEvaluations: number
  jurors: Array<{
    jurorId: string
    evaluationCount: number
    rawMean: number
    stddev: number
    deltaFromCohort: number
    effectiveInfluence: number | null
  }>
}): string {
  const quotedRoundName = JSON.stringify(payload.roundName)
  const jurorsJson = JSON.stringify(payload.jurors, null, 2)

  return `You are analyzing juror grading patterns for a competition evaluation round. Your job is to EXPLAIN the statistical normalization that has already been applied; you are NOT introducing a new weighting scheme or prescribing changes.

CONTEXT:
- Round: ${quotedRoundName}
- Cohort mean: ${payload.cohortMean} (scale 1-10)
- Cohort stddev: ${payload.cohortStddev}
- Total submitted evaluations: ${payload.totalEvaluations}

HOW BALANCING WORKS:
Each juror's scores are z-score normalized against their own mean and stddev, then rescaled back onto the 1-10 range. A juror who averages 2 points below the cohort won't drag projects down more than their peers; a lenient juror won't inflate projects. "effectiveInfluence" is roughly the juror's stddev divided by the cohort stddev — a value near 1.0 means they spread their scores similarly to the cohort; values well under 1 mean compressed scoring, well over 1 mean wide spread.

JUROR DATA (anonymized, sorted by |deltaFromCohort| desc):
${jurorsJson}

Return a JSON object with this exact shape:
{
"overallSummary": "2-3 sentences summarizing grading dispersion across the cohort — is the panel tightly aligned or widely divergent?",
"keyTakeaways": ["up to 4 bullets: notable patterns, risks, what to watch"],
"jurors": [
{
"jurorId": "Juror-N (matching the input)",
"severity": "normal" | "notable" | "outlier",
"summary": "One short sentence about this juror's grading pattern, referring to their rawMean, deltaFromCohort, and stddev. Example: 'Scored on average 2.1 points below cohort across 8 evaluations — consistently harsh, low internal variance.'"
}
]
}

Guidelines:
- "outlier" = delta from cohort >= 1.5 cohort-stddev in either direction
- "notable" = delta from cohort 0.75-1.5 cohort-stddev
- "normal" = delta from cohort < 0.75 cohort-stddev
- A juror with very few evaluations (< 3) can't be classified confidently — note this in their summary and prefer "normal".
- Be factual and specific. Reference the numbers. No speculation about intent.
- Do not include juror names — only the anonymized jurorId.
- Include every juror from the input in the jurors array. Order matches input.`
}
|
||||
120
src/server/services/juror-balance.ts
Normal file
120
src/server/services/juror-balance.ts
Normal file
@@ -0,0 +1,120 @@
|
||||
/**
|
||||
* Juror balancing: z-score normalization to correct for per-juror grading harshness.
|
||||
*
|
||||
* A juror who grades 1 standard deviation below their peers on shared projects
|
||||
* shouldn't punish those projects more than a juror who grades at the mean.
|
||||
* We compute per-juror mean + stddev across their scores in a round, z-normalize
|
||||
* each score, then rescale back onto the same 1-10 scale using the overall
|
||||
* round-level mean + stddev so the balanced number is directly comparable to
|
||||
* the raw average.
|
||||
*/
|
||||
|
||||
/** A single (project, juror, score) observation fed into the balancing math. */
export type ScorePoint = {
  /** Project that was scored. */
  projectId: string
  /** Juror who gave the score. */
  userId: string
  /** The juror's score for the project, on the raw scale. */
  rawScore: number
}
|
||||
|
||||
/** Raw and juror-balanced averages for one project. */
export type BalancedProjectResult = {
  /** Project the averages belong to. */
  projectId: string
  /** Plain mean of the project's raw scores. */
  rawAverage: number | null
  /** Juror-balanced mean on the raw scale; `null` when it cannot be computed. */
  balancedAverage: number | null
  /** Number of score points that went into the averages. */
  count: number
}
|
||||
|
||||
/** Grading statistics of one juror across all of their score points. */
export type JurorBalance = {
  /** Juror the statistics belong to. */
  userId: string
  /** Mean of the juror's scores. */
  mean: number
  /** Standard deviation of the juror's scores (0 for a single score). */
  stddev: number
  /** Number of scores the juror contributed. */
  count: number
}
|
||||
|
||||
/** Round-level statistics needed to z-normalize scores and rescale them. */
export type BalanceContext = {
  /** Mean of every score point in the round. */
  overallMean: number
  /** Standard deviation of every score point in the round. */
  overallStddev: number
  /** Per-juror statistics keyed by juror user id. */
  jurorStats: Map<string, JurorBalance>
}
|
||||
|
||||
/**
 * Build per-juror and overall grading statistics from a flat list of
 * (project, juror, score) points.
 *
 * Standard deviations are population standard deviations (divide by n). A
 * spread that is only floating-point rounding residue — e.g. a juror who gave
 * 0.1 three times ends up with a mean of 0.10000000000000002 — is reported as
 * exactly 0, so callers testing `stddev > 0` do not treat a juror with
 * identical scores as having real variation.
 *
 * @param points - Score points for one round; may be empty.
 * @returns Overall mean and stddev (both 0 for empty input) and per-juror
 *   statistics keyed by user id.
 */
export function computeBalanceContext(points: ScorePoint[]): BalanceContext {
  // Anything below this is rounding noise on a 1-10 score scale, not variation.
  const negligibleStddev = 1e-9

  // Mean and population stddev of a list; stddev is 0 for fewer than two values.
  const describe = (values: readonly number[]): { mean: number; stddev: number } => {
    if (values.length === 0) return { mean: 0, stddev: 0 }
    const mean = values.reduce((sum, v) => sum + v, 0) / values.length
    if (values.length === 1) return { mean, stddev: 0 }
    const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / values.length
    const stddev = Math.sqrt(variance)
    return { mean, stddev: stddev > negligibleStddev ? stddev : 0 }
  }

  // Group raw scores by juror, preserving first-seen juror order.
  const scoresByJuror = new Map<string, number[]>()
  for (const point of points) {
    const bucket = scoresByJuror.get(point.userId)
    if (bucket) {
      bucket.push(point.rawScore)
    } else {
      scoresByJuror.set(point.userId, [point.rawScore])
    }
  }

  const jurorStats = new Map<string, JurorBalance>()
  for (const [userId, scores] of scoresByJuror) {
    const { mean, stddev } = describe(scores)
    jurorStats.set(userId, { userId, mean, stddev, count: scores.length })
  }

  const overall = describe(points.map((p) => p.rawScore))

  return { overallMean: overall.mean, overallStddev: overall.stddev, jurorStats }
}
|
||||
|
||||
/**
 * Aggregate per-project raw + balanced averages from score points.
 *
 * Each score is converted to a z-score against its juror's own mean and
 * stddev; the project's mean z-score is then mapped back onto the raw scale
 * with the overall mean and stddev. A juror without usable statistics
 * (unknown to `ctx`, or with a spread that is zero or mere floating-point
 * residue) is normalized against the overall statistics instead, because
 * dividing by a rounding-residue stddev would produce arbitrary z-scores.
 *
 * @param points - Score points to aggregate.
 * @param ctx - Statistics the points are normalized against.
 * @returns One result per project id, in first-seen order. `balancedAverage`
 *   is `null` when the overall stddev is not usable (zero or negligible).
 */
export function computeBalancedProjectScores(
  points: ScorePoint[],
  ctx: BalanceContext,
): Map<string, BalancedProjectResult> {
  // Anything at or below this is rounding noise on a 1-10 score scale.
  const minUsableStddev = 1e-9

  const pointsByProject = new Map<string, ScorePoint[]>()
  for (const point of points) {
    const bucket = pointsByProject.get(point.projectId)
    if (bucket) {
      bucket.push(point)
    } else {
      pointsByProject.set(point.projectId, [point])
    }
  }

  const canBalance = ctx.overallStddev > minUsableStddev
  const results = new Map<string, BalancedProjectResult>()

  for (const [projectId, projectPoints] of pointsByProject) {
    let rawSum = 0
    let zSum = 0

    for (const pt of projectPoints) {
      rawSum += pt.rawScore
      if (!canBalance) continue

      const stats = ctx.jurorStats.get(pt.userId)
      zSum += stats && stats.stddev > minUsableStddev
        ? (pt.rawScore - stats.mean) / stats.stddev
        : (pt.rawScore - ctx.overallMean) / ctx.overallStddev
    }

    const count = projectPoints.length
    results.set(projectId, {
      projectId,
      rawAverage: rawSum / count,
      balancedAverage: canBalance
        ? ctx.overallMean + (zSum / count) * ctx.overallStddev
        : null,
      count,
    })
  }

  return results
}
|
||||
Reference in New Issue
Block a user