Re-apply: seed all CSV entries, fix category mapping, add duplicate detection
Some checks failed
Build and Push Docker Image / build (push) Failing after 3m40s

The rebase inverted ours/theirs and reverted our changes. Re-applying:
- normalizeSpaces() for non-breaking space fix in category mapping
- Remove isValidEntry filter, include all CSV rows for AI screening
- Duplicate submission detection in stage-filtering (always flags, never auto-rejects)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-15 13:58:45 +01:00
parent 882b98be19
commit 5390d6ecb8
2 changed files with 93 additions and 32 deletions

View File

@@ -50,9 +50,14 @@ const issueMap: Record<string, OceanIssue> = {
'Other': OceanIssue.OTHER,
}
function normalizeSpaces(s: string): string {
  // Replace non-breaking space variants with a regular ASCII space so that
  // prefix matching (startsWith against map keys) works on CSV-exported text.
  // Covers U+00A0 (no-break space), U+202F (narrow no-break space), and
  // U+2007 (figure space) — spreadsheet/CSV exports commonly emit all three.
  return s.replace(/[\u00A0\u202F\u2007]/g, ' ')
}
function mapCategory(raw: string | undefined): CompetitionCategory | null {
if (!raw) return null
const trimmed = raw.trim()
const trimmed = normalizeSpaces(raw.trim())
for (const [prefix, value] of Object.entries(categoryMap)) {
if (trimmed.startsWith(prefix)) return value
}
@@ -61,7 +66,7 @@ function mapCategory(raw: string | undefined): CompetitionCategory | null {
function mapIssue(raw: string | undefined): OceanIssue | null {
if (!raw) return null
const trimmed = raw.trim()
const trimmed = normalizeSpaces(raw.trim())
for (const [prefix, value] of Object.entries(issueMap)) {
if (trimmed.startsWith(prefix)) return value
}
@@ -76,17 +81,11 @@ function parseFoundedDate(raw: string | undefined): Date | null {
return isNaN(d.getTime()) ? null : d
}
function isValidEntry(row: Record<string, string>): boolean {
const status = (row['Application status'] || '').trim().toLowerCase()
if (status === 'ignore' || status === 'doublon') return false
function isEmptyRow(row: Record<string, string>): boolean {
const name = (row['Full name'] || '').trim()
if (name.length <= 2) return false // skip test entries
const email = (row['E-mail'] || '').trim()
if (!email || !email.includes('@')) return false
return true
const project = (row["Project's name"] || '').trim()
return !name && !email && !project
}
// =============================================================================
@@ -814,21 +813,9 @@ async function main() {
console.log(` Raw CSV rows: ${records.length}`)
// Filter and deduplicate
const seenEmails = new Set<string>()
const validRecords: Record<string, string>[] = []
for (const row of records) {
if (!isValidEntry(row)) continue
const email = (row['E-mail'] || '').trim().toLowerCase()
if (seenEmails.has(email)) continue
seenEmails.add(email)
validRecords.push(row)
}
console.log(` Valid entries after filtering: ${validRecords.length}`)
// Skip only completely empty rows (no name, no email, no project)
const validRecords = records.filter((row: Record<string, string>) => !isEmptyRow(row))
console.log(` Entries to seed: ${validRecords.length}`)
// Create applicant users and projects
console.log('\n🚀 Creating applicant users and projects...')
@@ -836,7 +823,9 @@ async function main() {
const intakeStage = mainStages[0] // INTAKE - CLOSED
const filterStage = mainStages[1] // FILTER - ACTIVE
for (const row of validRecords) {
let skippedNoEmail = 0
for (let rowIdx = 0; rowIdx < validRecords.length; rowIdx++) {
const row = validRecords[rowIdx]
const email = (row['E-mail'] || '').trim().toLowerCase()
const name = (row['Full name'] || '').trim()
const phone = (row['Téléphone'] || '').trim() || null
@@ -855,7 +844,14 @@ async function main() {
const phase2Url = (row['PHASE 2 - Submission'] || '').trim() || null
const foundedAt = parseFoundedDate(row['Date of creation'])
// Create or get applicant user
// Skip rows with no usable email (can't create user without one)
if (!email || !email.includes('@')) {
skippedNoEmail++
console.log(` ⚠ Row ${rowIdx + 2}: skipped (no valid email)`)
continue
}
// Create or get applicant user (upsert handles duplicate emails)
const user = await prisma.user.upsert({
where: { email },
update: {
@@ -864,7 +860,7 @@ async function main() {
},
create: {
email,
name,
name: name || `Applicant ${rowIdx + 1}`,
role: UserRole.APPLICANT,
status: UserStatus.NONE,
phoneNumber: phone,
@@ -930,6 +926,9 @@ async function main() {
}
console.log(` ✓ Created ${projectCount} projects with stage states`)
if (skippedNoEmail > 0) {
console.log(` ⚠ Skipped ${skippedNoEmail} rows with no valid email`)
}
}
// ==========================================================================

View File

@@ -261,6 +261,34 @@ export async function runStageFiltering(
)
const aiRules = rules.filter((r: any) => r.ruleType === 'AI_SCREENING')
// ── Built-in: Duplicate submission detection ──────────────────────────────
// Group projects by submitter email to detect duplicate submissions.
// Duplicates are ALWAYS flagged for admin review (never auto-rejected).
const duplicateProjectIds = new Set<string>()
const emailToProjects = new Map<string, Array<{ id: string; title: string }>>()
for (const project of projects) {
const email = (project.submittedByEmail ?? '').toLowerCase().trim()
if (!email) continue
if (!emailToProjects.has(email)) emailToProjects.set(email, [])
emailToProjects.get(email)!.push({ id: project.id, title: project.title })
}
const duplicateGroups: Map<string, string[]> = new Map() // projectId → sibling ids
emailToProjects.forEach((group, _email) => {
if (group.length <= 1) return
const ids = group.map((p) => p.id)
for (const p of group) {
duplicateProjectIds.add(p.id)
duplicateGroups.set(p.id, ids.filter((id) => id !== p.id))
}
})
if (duplicateProjectIds.size > 0) {
console.log(`[Stage Filtering] Detected ${duplicateProjectIds.size} projects in duplicate groups`)
}
// ── End duplicate detection ───────────────────────────────────────────────
let passed = 0
let rejected = 0
let manualQueue = 0
@@ -271,6 +299,20 @@ export async function runStageFiltering(
let deterministicPassed = true
let deterministicOutcome: 'PASSED' | 'FILTERED_OUT' | 'FLAGGED' = 'PASSED'
// 0. Check for duplicate submissions (always FLAG, never auto-reject)
if (duplicateProjectIds.has(project.id)) {
const siblingIds = duplicateGroups.get(project.id) ?? []
ruleResults.push({
ruleId: '__duplicate_check',
ruleName: 'Duplicate Submission Check',
ruleType: 'DUPLICATE_CHECK',
passed: false,
action: 'FLAG',
reasoning: `Duplicate submission detected: same applicant email submitted ${siblingIds.length + 1} project(s). Sibling project IDs: ${siblingIds.join(', ')}. Admin must review and decide which to keep.`,
})
deterministicOutcome = 'FLAGGED'
}
// 1. Run deterministic rules
for (const rule of deterministicRules) {
const config = rule.configJson as unknown as RuleConfig
@@ -312,11 +354,12 @@ export async function runStageFiltering(
}
}
// 2. AI screening (only if deterministic passed)
// 2. AI screening (run if deterministic passed, OR if duplicate—so AI can recommend which to keep)
const isDuplicate = duplicateProjectIds.has(project.id)
let aiScreeningJson: Record<string, unknown> | null = null
let finalOutcome: 'PASSED' | 'FILTERED_OUT' | 'FLAGGED' = deterministicOutcome
if (deterministicPassed && aiRules.length > 0) {
if ((deterministicPassed || isDuplicate) && aiRules.length > 0) {
// Build a simplified AI screening result using the existing AI criteria
// In production this would call OpenAI via the ai-filtering service
const aiRule = aiRules[0]
@@ -337,12 +380,25 @@ export async function runStageFiltering(
: 'Insufficient project data for AI screening',
}
// Attach duplicate metadata so admin can see sibling projects
if (isDuplicate) {
const siblingIds = duplicateGroups.get(project.id) ?? []
aiScreeningJson.isDuplicate = true
aiScreeningJson.siblingProjectIds = siblingIds
aiScreeningJson.duplicateNote =
`This project shares a submitter email with ${siblingIds.length} other project(s). ` +
'AI screening should compare these and recommend which to keep.'
}
const banded = bandByConfidence({
confidence,
meetsAllCriteria: hasMinimalData,
})
finalOutcome = banded.outcome
// For non-duplicate projects, use AI banding; for duplicates, keep FLAGGED
if (!isDuplicate) {
finalOutcome = banded.outcome
}
ruleResults.push({
ruleId: aiRule.id,
@@ -354,6 +410,12 @@ export async function runStageFiltering(
})
}
// Duplicate submissions must ALWAYS be flagged for admin review,
// even if other rules would auto-reject them.
if (duplicateProjectIds.has(project.id) && finalOutcome === 'FILTERED_OUT') {
finalOutcome = 'FLAGGED'
}
await prisma.filteringResult.upsert({
where: {
stageId_projectId: {