From 5390d6ecb83cf3ee4432ef4b8c3773df0407ff83 Mon Sep 17 00:00:00 2001
From: Matt
Date: Sun, 15 Feb 2026 13:58:45 +0100
Subject: [PATCH] Re-apply: seed all CSV entries, fix category mapping, add duplicate detection

Rebase had inverted ours/theirs, reverting our changes. Re-applying:
- normalizeSpaces() for non-breaking space fix in category mapping
- Remove isValidEntry filter, include all CSV rows for AI screening
- Duplicate submission detection in stage-filtering (always flags, never auto-rejects)

Co-Authored-By: Claude Opus 4.6
---
 prisma/seed.ts                         | 57 +++++++++++----------
 src/server/services/stage-filtering.ts | 68 ++++++++++++++++++++++++--
 2 files changed, 93 insertions(+), 32 deletions(-)

diff --git a/prisma/seed.ts b/prisma/seed.ts
index 99b81bd..46f04c4 100644
--- a/prisma/seed.ts
+++ b/prisma/seed.ts
@@ -50,9 +50,14 @@ const issueMap: Record<string, OceanIssue> = {
   'Other': OceanIssue.OTHER,
 }
 
+function normalizeSpaces(s: string): string {
+  // Replace non-breaking space variants (U+00A0, U+2007, U+202F) with regular spaces
+  return s.replace(/[\u00A0\u2007\u202F]/g, ' ')
+}
+
 function mapCategory(raw: string | undefined): CompetitionCategory | null {
   if (!raw) return null
-  const trimmed = raw.trim()
+  const trimmed = normalizeSpaces(raw.trim())
   for (const [prefix, value] of Object.entries(categoryMap)) {
     if (trimmed.startsWith(prefix)) return value
   }
@@ -61,7 +66,7 @@ function mapCategory(raw: string | undefined): CompetitionCategory | null {
 
 function mapIssue(raw: string | undefined): OceanIssue | null {
   if (!raw) return null
-  const trimmed = raw.trim()
+  const trimmed = normalizeSpaces(raw.trim())
   for (const [prefix, value] of Object.entries(issueMap)) {
     if (trimmed.startsWith(prefix)) return value
   }
@@ -76,17 +81,11 @@ function parseFoundedDate(raw: string | undefined): Date | null {
   return isNaN(d.getTime()) ? null : d
 }
 
-function isValidEntry(row: Record<string, string>): boolean {
-  const status = (row['Application status'] || '').trim().toLowerCase()
-  if (status === 'ignore' || status === 'doublon') return false
-
+function isEmptyRow(row: Record<string, string>): boolean {
   const name = (row['Full name'] || '').trim()
-  if (name.length <= 2) return false // skip test entries
-
   const email = (row['E-mail'] || '').trim()
-  if (!email || !email.includes('@')) return false
-
-  return true
+  const project = (row["Project's name"] || '').trim()
+  return !name && !email && !project
 }
 
 // =============================================================================
@@ -814,21 +813,9 @@ async function main() {
 
     console.log(` Raw CSV rows: ${records.length}`)
 
-    // Filter and deduplicate
-    const seenEmails = new Set<string>()
-    const validRecords: Record<string, string>[] = []
-
-    for (const row of records) {
-      if (!isValidEntry(row)) continue
-
-      const email = (row['E-mail'] || '').trim().toLowerCase()
-      if (seenEmails.has(email)) continue
-
-      seenEmails.add(email)
-      validRecords.push(row)
-    }
-
-    console.log(` Valid entries after filtering: ${validRecords.length}`)
+    // Skip only completely empty rows (no name, no email, no project)
+    const validRecords = records.filter((row: Record<string, string>) => !isEmptyRow(row))
+    console.log(` Entries to seed: ${validRecords.length}`)
 
     // Create applicant users and projects
     console.log('\n🚀 Creating applicant users and projects...')
@@ -836,7 +823,9 @@ async function main() {
     const intakeStage = mainStages[0] // INTAKE - CLOSED
     const filterStage = mainStages[1] // FILTER - ACTIVE
 
-    for (const row of validRecords) {
+    let skippedNoEmail = 0
+    for (let rowIdx = 0; rowIdx < validRecords.length; rowIdx++) {
+      const row = validRecords[rowIdx]
       const email = (row['E-mail'] || '').trim().toLowerCase()
       const name = (row['Full name'] || '').trim()
       const phone = (row['Téléphone'] || '').trim() || null
@@ -855,7 +844,14 @@ async function main() {
       const phase2Url = (row['PHASE 2 - Submission'] || '').trim() || null
       const foundedAt = parseFoundedDate(row['Date of creation'])
 
-      // Create or get applicant user
+      // Skip rows with no usable email (can't create user without one)
+      if (!email || !email.includes('@')) {
+        skippedNoEmail++
+        console.log(` ⚠ Row ${rowIdx + 2}: skipped (no valid email)`)
+        continue
+      }
+
+      // Create or get applicant user (upsert handles duplicate emails)
       const user = await prisma.user.upsert({
         where: { email },
         update: {
@@ -864,7 +860,7 @@ async function main() {
         },
         create: {
           email,
-          name,
+          name: name || `Applicant ${rowIdx + 1}`,
           role: UserRole.APPLICANT,
           status: UserStatus.NONE,
           phoneNumber: phone,
@@ -930,6 +926,9 @@ async function main() {
     }
 
     console.log(` ✓ Created ${projectCount} projects with stage states`)
+    if (skippedNoEmail > 0) {
+      console.log(` ⚠ Skipped ${skippedNoEmail} rows with no valid email`)
+    }
   }
 
   // ==========================================================================
diff --git a/src/server/services/stage-filtering.ts b/src/server/services/stage-filtering.ts
index 4b30494..3e41dcb 100644
--- a/src/server/services/stage-filtering.ts
+++ b/src/server/services/stage-filtering.ts
@@ -261,6 +261,34 @@ export async function runStageFiltering(
   )
   const aiRules = rules.filter((r: any) => r.ruleType === 'AI_SCREENING')
 
+  // ── Built-in: Duplicate submission detection ──────────────────────────────
+  // Group projects by submitter email to detect duplicate submissions.
+  // Duplicates are ALWAYS flagged for admin review (never auto-rejected).
+  const duplicateProjectIds = new Set<string>()
+  const emailToProjects = new Map<string, Array<{ id: string; title: string }>>()
+
+  for (const project of projects) {
+    const email = (project.submittedByEmail ?? '').toLowerCase().trim()
+    if (!email) continue
+    if (!emailToProjects.has(email)) emailToProjects.set(email, [])
+    emailToProjects.get(email)!.push({ id: project.id, title: project.title })
+  }
+
+  const duplicateGroups: Map<string, string[]> = new Map() // projectId → sibling ids
+  emailToProjects.forEach((group, _email) => {
+    if (group.length <= 1) return
+    const ids = group.map((p) => p.id)
+    for (const p of group) {
+      duplicateProjectIds.add(p.id)
+      duplicateGroups.set(p.id, ids.filter((id) => id !== p.id))
+    }
+  })
+
+  if (duplicateProjectIds.size > 0) {
+    console.log(`[Stage Filtering] Detected ${duplicateProjectIds.size} projects in duplicate groups`)
+  }
+  // ── End duplicate detection ───────────────────────────────────────────────
+
   let passed = 0
   let rejected = 0
   let manualQueue = 0
@@ -271,6 +299,20 @@ export async function runStageFiltering(
     let deterministicPassed = true
     let deterministicOutcome: 'PASSED' | 'FILTERED_OUT' | 'FLAGGED' = 'PASSED'
 
+    // 0. Check for duplicate submissions (always FLAG, never auto-reject)
+    if (duplicateProjectIds.has(project.id)) {
+      const siblingIds = duplicateGroups.get(project.id) ?? []
+      ruleResults.push({
+        ruleId: '__duplicate_check',
+        ruleName: 'Duplicate Submission Check',
+        ruleType: 'DUPLICATE_CHECK',
+        passed: false,
+        action: 'FLAG',
+        reasoning: `Duplicate submission detected: same applicant email submitted ${siblingIds.length + 1} project(s). Sibling project IDs: ${siblingIds.join(', ')}. Admin must review and decide which to keep.`,
+      })
+      deterministicOutcome = 'FLAGGED'
+    }
+
     // 1. Run deterministic rules
     for (const rule of deterministicRules) {
       const config = rule.configJson as unknown as RuleConfig
@@ -312,11 +354,12 @@ export async function runStageFiltering(
       }
     }
 
-    // 2. AI screening (only if deterministic passed)
+    // 2. AI screening (run if deterministic passed, OR if duplicate—so AI can recommend which to keep)
+    const isDuplicate = duplicateProjectIds.has(project.id)
     let aiScreeningJson: Record<string, unknown> | null = null
     let finalOutcome: 'PASSED' | 'FILTERED_OUT' | 'FLAGGED' = deterministicOutcome
 
-    if (deterministicPassed && aiRules.length > 0) {
+    if ((deterministicPassed || isDuplicate) && aiRules.length > 0) {
       // Build a simplified AI screening result using the existing AI criteria
       // In production this would call OpenAI via the ai-filtering service
       const aiRule = aiRules[0]
@@ -337,12 +380,25 @@ export async function runStageFiltering(
           : 'Insufficient project data for AI screening',
       }
 
+      // Attach duplicate metadata so admin can see sibling projects
+      if (isDuplicate) {
+        const siblingIds = duplicateGroups.get(project.id) ?? []
+        aiScreeningJson.isDuplicate = true
+        aiScreeningJson.siblingProjectIds = siblingIds
+        aiScreeningJson.duplicateNote =
+          `This project shares a submitter email with ${siblingIds.length} other project(s). ` +
+          'AI screening should compare these and recommend which to keep.'
+      }
+
       const banded = bandByConfidence({
         confidence,
         meetsAllCriteria: hasMinimalData,
       })
 
-      finalOutcome = banded.outcome
+      // For non-duplicate projects, use AI banding; for duplicates, keep FLAGGED
+      if (!isDuplicate) {
+        finalOutcome = banded.outcome
+      }
 
       ruleResults.push({
         ruleId: aiRule.id,
@@ -354,6 +410,12 @@ export async function runStageFiltering(
       })
     }
 
+    // Duplicate submissions must ALWAYS be flagged for admin review,
+    // even if other rules would auto-reject them.
+    if (duplicateProjectIds.has(project.id) && finalOutcome === 'FILTERED_OUT') {
+      finalOutcome = 'FLAGGED'
+    }
+
     await prisma.filteringResult.upsert({
       where: {
         stageId_projectId: {