Re-apply: seed all CSV entries, fix category mapping, add duplicate detection
Some checks failed
Build and Push Docker Image / build (push) Failing after 3m40s
Some checks failed
Build and Push Docker Image / build (push) Failing after 3m40s
Rebase had inverted ours/theirs, reverting our changes. Re-applying:
- normalizeSpaces() for non-breaking space fix in category mapping
- Remove isValidEntry filter, include all CSV rows for AI screening
- Duplicate submission detection in stage-filtering (always flags, never auto-rejects)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -50,9 +50,14 @@ const issueMap: Record<string, OceanIssue> = {
|
||||
'Other': OceanIssue.OTHER,
|
||||
}
|
||||
|
||||
/**
 * Replaces every Unicode space-separator character with a regular ASCII space.
 *
 * Spreadsheet/CSV exports often contain look-alike spaces (no-break space
 * U+00A0, narrow no-break space U+202F, thin space U+2009, ideographic space
 * U+3000, ...). They render like a normal space but make `startsWith` prefix
 * comparisons against plain-ASCII keys fail.
 *
 * The replacement is one-for-one: runs of spaces are not collapsed, the string
 * is not trimmed, and tabs / line breaks are left untouched.
 *
 * @param s - Raw text to normalize.
 * @returns `s` with each space-separator variant replaced by `' '` (U+0020).
 */
function normalizeSpaces(s: string): string {
  // Unicode general category Zs, minus the regular space itself:
  // U+00A0, U+1680, U+2000-U+200A, U+202F, U+205F, U+3000.
  return s.replace(/[\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]/g, ' ')
}
|
||||
|
||||
function mapCategory(raw: string | undefined): CompetitionCategory | null {
|
||||
if (!raw) return null
|
||||
const trimmed = raw.trim()
|
||||
const trimmed = normalizeSpaces(raw.trim())
|
||||
for (const [prefix, value] of Object.entries(categoryMap)) {
|
||||
if (trimmed.startsWith(prefix)) return value
|
||||
}
|
||||
@@ -61,7 +66,7 @@ function mapCategory(raw: string | undefined): CompetitionCategory | null {
|
||||
|
||||
function mapIssue(raw: string | undefined): OceanIssue | null {
|
||||
if (!raw) return null
|
||||
const trimmed = raw.trim()
|
||||
const trimmed = normalizeSpaces(raw.trim())
|
||||
for (const [prefix, value] of Object.entries(issueMap)) {
|
||||
if (trimmed.startsWith(prefix)) return value
|
||||
}
|
||||
@@ -76,17 +81,11 @@ function parseFoundedDate(raw: string | undefined): Date | null {
|
||||
return isNaN(d.getTime()) ? null : d
|
||||
}
|
||||
|
||||
function isValidEntry(row: Record<string, string>): boolean {
|
||||
const status = (row['Application status'] || '').trim().toLowerCase()
|
||||
if (status === 'ignore' || status === 'doublon') return false
|
||||
|
||||
function isEmptyRow(row: Record<string, string>): boolean {
|
||||
const name = (row['Full name'] || '').trim()
|
||||
if (name.length <= 2) return false // skip test entries
|
||||
|
||||
const email = (row['E-mail'] || '').trim()
|
||||
if (!email || !email.includes('@')) return false
|
||||
|
||||
return true
|
||||
const project = (row["Project's name"] || '').trim()
|
||||
return !name && !email && !project
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
@@ -814,21 +813,9 @@ async function main() {
|
||||
|
||||
console.log(` Raw CSV rows: ${records.length}`)
|
||||
|
||||
// Filter and deduplicate
|
||||
const seenEmails = new Set<string>()
|
||||
const validRecords: Record<string, string>[] = []
|
||||
|
||||
for (const row of records) {
|
||||
if (!isValidEntry(row)) continue
|
||||
|
||||
const email = (row['E-mail'] || '').trim().toLowerCase()
|
||||
if (seenEmails.has(email)) continue
|
||||
|
||||
seenEmails.add(email)
|
||||
validRecords.push(row)
|
||||
}
|
||||
|
||||
console.log(` Valid entries after filtering: ${validRecords.length}`)
|
||||
// Skip only completely empty rows (no name, no email, no project)
|
||||
const validRecords = records.filter((row: Record<string, string>) => !isEmptyRow(row))
|
||||
console.log(` Entries to seed: ${validRecords.length}`)
|
||||
|
||||
// Create applicant users and projects
|
||||
console.log('\n🚀 Creating applicant users and projects...')
|
||||
@@ -836,7 +823,9 @@ async function main() {
|
||||
const intakeStage = mainStages[0] // INTAKE - CLOSED
|
||||
const filterStage = mainStages[1] // FILTER - ACTIVE
|
||||
|
||||
for (const row of validRecords) {
|
||||
let skippedNoEmail = 0
|
||||
for (let rowIdx = 0; rowIdx < validRecords.length; rowIdx++) {
|
||||
const row = validRecords[rowIdx]
|
||||
const email = (row['E-mail'] || '').trim().toLowerCase()
|
||||
const name = (row['Full name'] || '').trim()
|
||||
const phone = (row['Téléphone'] || '').trim() || null
|
||||
@@ -855,7 +844,14 @@ async function main() {
|
||||
const phase2Url = (row['PHASE 2 - Submission'] || '').trim() || null
|
||||
const foundedAt = parseFoundedDate(row['Date of creation'])
|
||||
|
||||
// Create or get applicant user
|
||||
// Skip rows with no usable email (can't create user without one)
|
||||
if (!email || !email.includes('@')) {
|
||||
skippedNoEmail++
|
||||
console.log(` ⚠ Row ${rowIdx + 2}: skipped (no valid email)`)
|
||||
continue
|
||||
}
|
||||
|
||||
// Create or get applicant user (upsert handles duplicate emails)
|
||||
const user = await prisma.user.upsert({
|
||||
where: { email },
|
||||
update: {
|
||||
@@ -864,7 +860,7 @@ async function main() {
|
||||
},
|
||||
create: {
|
||||
email,
|
||||
name,
|
||||
name: name || `Applicant ${rowIdx + 1}`,
|
||||
role: UserRole.APPLICANT,
|
||||
status: UserStatus.NONE,
|
||||
phoneNumber: phone,
|
||||
@@ -930,6 +926,9 @@ async function main() {
|
||||
}
|
||||
|
||||
console.log(` ✓ Created ${projectCount} projects with stage states`)
|
||||
if (skippedNoEmail > 0) {
|
||||
console.log(` ⚠ Skipped ${skippedNoEmail} rows with no valid email`)
|
||||
}
|
||||
}
|
||||
|
||||
// ==========================================================================
|
||||
|
||||
@@ -261,6 +261,34 @@ export async function runStageFiltering(
|
||||
)
|
||||
const aiRules = rules.filter((r: any) => r.ruleType === 'AI_SCREENING')
|
||||
|
||||
// ── Built-in: Duplicate submission detection ──────────────────────────────
|
||||
// Group projects by submitter email to detect duplicate submissions.
|
||||
// Duplicates are ALWAYS flagged for admin review (never auto-rejected).
|
||||
const duplicateProjectIds = new Set<string>()
|
||||
const emailToProjects = new Map<string, Array<{ id: string; title: string }>>()
|
||||
|
||||
for (const project of projects) {
|
||||
const email = (project.submittedByEmail ?? '').toLowerCase().trim()
|
||||
if (!email) continue
|
||||
if (!emailToProjects.has(email)) emailToProjects.set(email, [])
|
||||
emailToProjects.get(email)!.push({ id: project.id, title: project.title })
|
||||
}
|
||||
|
||||
const duplicateGroups: Map<string, string[]> = new Map() // projectId → sibling ids
|
||||
emailToProjects.forEach((group, _email) => {
|
||||
if (group.length <= 1) return
|
||||
const ids = group.map((p) => p.id)
|
||||
for (const p of group) {
|
||||
duplicateProjectIds.add(p.id)
|
||||
duplicateGroups.set(p.id, ids.filter((id) => id !== p.id))
|
||||
}
|
||||
})
|
||||
|
||||
if (duplicateProjectIds.size > 0) {
|
||||
console.log(`[Stage Filtering] Detected ${duplicateProjectIds.size} projects in duplicate groups`)
|
||||
}
|
||||
// ── End duplicate detection ───────────────────────────────────────────────
|
||||
|
||||
let passed = 0
|
||||
let rejected = 0
|
||||
let manualQueue = 0
|
||||
@@ -271,6 +299,20 @@ export async function runStageFiltering(
|
||||
let deterministicPassed = true
|
||||
let deterministicOutcome: 'PASSED' | 'FILTERED_OUT' | 'FLAGGED' = 'PASSED'
|
||||
|
||||
// 0. Check for duplicate submissions (always FLAG, never auto-reject)
|
||||
if (duplicateProjectIds.has(project.id)) {
|
||||
const siblingIds = duplicateGroups.get(project.id) ?? []
|
||||
ruleResults.push({
|
||||
ruleId: '__duplicate_check',
|
||||
ruleName: 'Duplicate Submission Check',
|
||||
ruleType: 'DUPLICATE_CHECK',
|
||||
passed: false,
|
||||
action: 'FLAG',
|
||||
reasoning: `Duplicate submission detected: same applicant email submitted ${siblingIds.length + 1} project(s). Sibling project IDs: ${siblingIds.join(', ')}. Admin must review and decide which to keep.`,
|
||||
})
|
||||
deterministicOutcome = 'FLAGGED'
|
||||
}
|
||||
|
||||
// 1. Run deterministic rules
|
||||
for (const rule of deterministicRules) {
|
||||
const config = rule.configJson as unknown as RuleConfig
|
||||
@@ -312,11 +354,12 @@ export async function runStageFiltering(
|
||||
}
|
||||
}
|
||||
|
||||
// 2. AI screening (only if deterministic passed)
|
||||
// 2. AI screening (run if deterministic passed, OR if duplicate—so AI can recommend which to keep)
|
||||
const isDuplicate = duplicateProjectIds.has(project.id)
|
||||
let aiScreeningJson: Record<string, unknown> | null = null
|
||||
let finalOutcome: 'PASSED' | 'FILTERED_OUT' | 'FLAGGED' = deterministicOutcome
|
||||
|
||||
if (deterministicPassed && aiRules.length > 0) {
|
||||
if ((deterministicPassed || isDuplicate) && aiRules.length > 0) {
|
||||
// Build a simplified AI screening result using the existing AI criteria
|
||||
// In production this would call OpenAI via the ai-filtering service
|
||||
const aiRule = aiRules[0]
|
||||
@@ -337,12 +380,25 @@ export async function runStageFiltering(
|
||||
: 'Insufficient project data for AI screening',
|
||||
}
|
||||
|
||||
// Attach duplicate metadata so admin can see sibling projects
|
||||
if (isDuplicate) {
|
||||
const siblingIds = duplicateGroups.get(project.id) ?? []
|
||||
aiScreeningJson.isDuplicate = true
|
||||
aiScreeningJson.siblingProjectIds = siblingIds
|
||||
aiScreeningJson.duplicateNote =
|
||||
`This project shares a submitter email with ${siblingIds.length} other project(s). ` +
|
||||
'AI screening should compare these and recommend which to keep.'
|
||||
}
|
||||
|
||||
const banded = bandByConfidence({
|
||||
confidence,
|
||||
meetsAllCriteria: hasMinimalData,
|
||||
})
|
||||
|
||||
// For non-duplicate projects, use AI banding; for duplicates, keep FLAGGED
|
||||
if (!isDuplicate) {
|
||||
finalOutcome = banded.outcome
|
||||
}
|
||||
|
||||
ruleResults.push({
|
||||
ruleId: aiRule.id,
|
||||
@@ -354,6 +410,12 @@ export async function runStageFiltering(
|
||||
})
|
||||
}
|
||||
|
||||
// Duplicate submissions must ALWAYS be flagged for admin review,
|
||||
// even if other rules would auto-reject them.
|
||||
if (duplicateProjectIds.has(project.id) && finalOutcome === 'FILTERED_OUT') {
|
||||
finalOutcome = 'FLAGGED'
|
||||
}
|
||||
|
||||
await prisma.filteringResult.upsert({
|
||||
where: {
|
||||
stageId_projectId: {
|
||||
|
||||
Reference in New Issue
Block a user