Add document analysis: page count, text extraction & language detection
All checks were successful
Build and Push Docker Image / build (push) Successful in 11m7s

Introduces a document analyzer service that extracts page count (via pdf-parse),
text preview, and detected language (via franc) from uploaded files. Analysis runs
automatically on upload (configurable via SystemSettings) and can be triggered
retroactively for existing files. Results are displayed as badges in the FileViewer
and fed to AI screening for language-based filtering criteria.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt
2026-02-17 10:08:04 +01:00
parent 771f35c695
commit c9640c6086
13 changed files with 565 additions and 10 deletions

View File

@@ -1599,7 +1599,7 @@ function FilteringRulesSection({ roundId }: { roundId: string }) {
className="text-sm"
/>
<p className="text-xs text-muted-foreground mt-1">
The AI has access to: category, country, region, founded year, ocean issue, tags, description, file details (type, page count, size), and team size.
The AI has access to: category, country, region, founded year, ocean issue, tags, description, file details (type, page count, size, detected language), and team size.
</p>
</div>

View File

@@ -65,6 +65,12 @@ interface ProjectFile {
isLate?: boolean
requirementId?: string | null
requirement?: FileRequirementInfo | null
// Document analysis fields
pageCount?: number | null
textPreview?: string | null
detectedLang?: string | null
langConfidence?: number | null
analyzedAt?: Date | string | null
}
interface RoundGroup {
@@ -270,6 +276,25 @@ function FileItem({ file }: { file: ProjectFile }) {
</Badge>
)}
<span>{formatFileSize(file.size)}</span>
{file.pageCount != null && (
<Badge variant="outline" className="text-xs gap-1">
<FileText className="h-3 w-3" />
{file.pageCount} {file.pageCount === 1 ? 'page' : 'pages'}
</Badge>
)}
{file.detectedLang && file.detectedLang !== 'und' && (
<Badge
variant="outline"
className={cn('text-xs font-mono uppercase', {
'border-green-300 text-green-700 bg-green-50': file.langConfidence != null && file.langConfidence >= 0.8,
'border-amber-300 text-amber-700 bg-amber-50': file.langConfidence != null && file.langConfidence >= 0.4 && file.langConfidence < 0.8,
'border-red-300 text-red-700 bg-red-50': file.langConfidence != null && file.langConfidence < 0.4,
})}
title={`Language: ${file.detectedLang} (${Math.round((file.langConfidence ?? 0) * 100)}% confidence)`}
>
{file.detectedLang.toUpperCase()}
</Badge>
)}
</div>
</div>