feat: surface juror-balanced scores and AI calibration advisory
All checks were successful
Build and Push Docker Image / build (push) Successful in 7m27s
All checks were successful
Build and Push Docker Image / build (push) Successful in 7m27s
Adds a shared juror-balancing utility (z-score normalization per juror, rescaled back onto the raw 1-10 scale) and wires it into:

- Admin reports page: the Top-10 project table now shows "Raw Avg" and "Balanced" columns side by side, and the summary stats row shows a balanced-average tile. Sort defaults to balanced so harsh and lenient graders no longer skew the ranking.
- Ranking dashboard: each project row shows a green/amber balanced-score chip next to the raw average when the two differ by ≥0.05, making it obvious when juror calibration moved a project's effective ranking.

Also adds an AI Juror Calibration Advisory — a mutation that takes anonymized per-juror stats, calls OpenAI, and produces a plain-language explanation of the cohort's grading patterns plus per-juror severity (normal / notable / outlier) with a one-sentence narrative. The advisory describes the statistical balance that already runs; it does not introduce a new weighting layer. It is rendered as a panel in the Juror Consistency tab when a specific round is selected.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
120
src/server/services/juror-balance.ts
Normal file
120
src/server/services/juror-balance.ts
Normal file
@@ -0,0 +1,120 @@
|
||||
/**
 * Juror balancing: z-score normalization to correct for per-juror grading harshness.
 *
 * A juror who grades 1 standard deviation below their peers on shared projects
 * shouldn't punish those projects more than a juror who grades at the mean.
 * We compute per-juror mean + stddev across their scores in a round, z-normalize
 * each score, then rescale back onto the same 1-10 scale using the overall
 * round-level mean + stddev so the balanced number is directly comparable to
 * the raw average.
 */
|
||||
|
||||
/** A single raw score given by one juror to one project. */
export type ScorePoint = {
  /** Identifier of the project that was scored. */
  projectId: string
  /** Identifier of the juror who gave the score; used as the per-juror grouping key. */
  userId: string
  /** The score exactly as the juror entered it, before any balancing. */
  rawScore: number
}
|
||||
|
||||
/** Per-project aggregate of raw and juror-balanced scores. */
export type BalancedProjectResult = {
  /** Identifier of the project these averages belong to. */
  projectId: string
  /** Plain mean of the project's raw scores. */
  rawAverage: number | null
  /**
   * Mean of the project's per-juror z-scores, rescaled onto the raw-score
   * scale. `null` when balancing is not possible (the overall standard
   * deviation is not greater than zero).
   */
  balancedAverage: number | null
  /** Number of score points that contributed to the averages. */
  count: number
}
|
||||
|
||||
/** Grading statistics for a single juror across all of their score points. */
export type JurorBalance = {
  /** Identifier of the juror these statistics describe. */
  userId: string
  /** Mean of the juror's raw scores. */
  mean: number
  /** Population standard deviation of the juror's raw scores; 0 when the juror has a single score. */
  stddev: number
  /** Number of scores the juror contributed. */
  count: number
}
|
||||
|
||||
/** Overall and per-juror statistics needed to balance a set of scores. */
export type BalanceContext = {
  /** Mean of every raw score in the set; 0 when the set is empty. */
  overallMean: number
  /** Population standard deviation of every raw score; 0 when there are fewer than two scores. */
  overallStddev: number
  /** Per-juror statistics, keyed by the juror's `userId`. */
  jurorStats: Map<string, JurorBalance>
}
|
||||
|
||||
/**
 * Population mean and standard deviation of a list of scores.
 *
 * Returns zeros for an empty list and a zero stddev for a single score, so
 * callers can use `stddev > 0` as the "this sample has spread" check.
 */
function summarizeScores(scores: readonly number[]): { mean: number; stddev: number } {
  const n = scores.length
  if (n === 0) return { mean: 0, stddev: 0 }

  const mean = scores.reduce((sum, v) => sum + v, 0) / n
  if (n === 1) return { mean, stddev: 0 }

  // Population variance (divide by n, not n - 1).
  const variance = scores.reduce((sum, v) => sum + (v - mean) ** 2, 0) / n
  return { mean, stddev: Math.sqrt(variance) }
}

/**
 * Build per-juror and overall grading statistics from a flat list of
 * (project, juror, score) points.
 *
 * @param points Score points to summarize; the array is not modified.
 * @returns The overall mean and population standard deviation of all raw
 *   scores, plus a map from `userId` to that juror's mean, population
 *   standard deviation and score count. With no points, the overall values
 *   are 0 and the map is empty.
 */
export function computeBalanceContext(points: readonly ScorePoint[]): BalanceContext {
  // Single pass: collect every score, and bucket scores by juror.
  const allScores: number[] = []
  const scoresByJuror = new Map<string, number[]>()
  for (const { userId, rawScore } of points) {
    allScores.push(rawScore)
    const bucket = scoresByJuror.get(userId)
    if (bucket) {
      bucket.push(rawScore)
    } else {
      scoresByJuror.set(userId, [rawScore])
    }
  }

  const jurorStats = new Map<string, JurorBalance>()
  for (const [userId, scores] of scoresByJuror) {
    const { mean, stddev } = summarizeScores(scores)
    jurorStats.set(userId, { userId, mean, stddev, count: scores.length })
  }

  const { mean: overallMean, stddev: overallStddev } = summarizeScores(allScores)
  return { overallMean, overallStddev, jurorStats }
}
|
||||
|
||||
/**
 * Aggregate per-project raw + balanced averages from score points.
 *
 * Each score is converted to a z-score against its juror's own mean and
 * standard deviation. If the juror is missing from `ctx.jurorStats` or has
 * zero standard deviation, the overall mean and standard deviation are used
 * instead. The project's mean z-score is then rescaled with the overall mean
 * and standard deviation so it reads on the raw-score scale.
 *
 * @param points Score points to aggregate; the array is not modified.
 * @param ctx Statistics to balance against, e.g. from `computeBalanceContext`.
 * @returns A map from `projectId` to its result. `balancedAverage` is `null`
 *   for every project when `ctx.overallStddev` is not greater than zero.
 */
export function computeBalancedProjectScores(
  points: readonly ScorePoint[],
  ctx: BalanceContext,
): Map<string, BalancedProjectResult> {
  const pointsByProject = new Map<string, ScorePoint[]>()
  for (const point of points) {
    const group = pointsByProject.get(point.projectId)
    if (group) {
      group.push(point)
    } else {
      pointsByProject.set(point.projectId, [point])
    }
  }

  const { overallMean, overallStddev, jurorStats } = ctx
  // Without overall spread there is nothing to rescale onto.
  const canBalance = overallStddev > 0

  const results = new Map<string, BalancedProjectResult>()
  for (const [projectId, projectPoints] of pointsByProject) {
    const count = projectPoints.length
    let rawSum = 0
    let zSum = 0

    for (const { userId, rawScore } of projectPoints) {
      rawSum += rawScore
      if (!canBalance) continue

      const juror = jurorStats.get(userId)
      // A juror with no spread (or no stats) cannot be z-normalized on their
      // own terms, so fall back to the overall distribution.
      zSum += juror && juror.stddev > 0
        ? (rawScore - juror.mean) / juror.stddev
        : (rawScore - overallMean) / overallStddev
    }

    results.set(projectId, {
      projectId,
      rawAverage: rawSum / count,
      balancedAverage: canBalance ? overallMean + (zSum / count) * overallStddev : null,
      count,
    })
  }

  return results
}
|
||||
Reference in New Issue
Block a user