Add document analysis: page count, text extraction & language detection

Introduces a document analyzer service that extracts page count (via pdf-parse), text preview, and detected language (via franc) from uploaded files. Analysis runs automatically on upload (configurable via SystemSettings) and can be triggered retroactively for existing files. Results are displayed as badges in the FileViewer and fed to AI screening for language-based filtering criteria.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 10:08:04 +01:00
parent 771f35c695
commit c9640c6086
13 changed files with 565 additions and 10 deletions
--- a/src/server/services/anonymization.ts
+++ b/src/server/services/anonymization.ts
@@ -83,6 +83,8 @@ export interface AnonymizedFileInfo {
  file_type: string // FileType enum value
  page_count: number | null // Number of pages if known
  size_kb: number // File size in KB
+  detected_lang?: string | null // ISO 639-3 language code (e.g. 'eng', 'fra')
+  lang_confidence?: number | null // 0.0–1.0 confidence score
  round_name?: string | null // Which round the file was submitted for
  is_current_round?: boolean // Whether this file belongs to the current filtering/evaluation round
  text_content?: string // Extracted text content (when aiParseFiles is enabled)
@@ -309,6 +311,8 @@ export function anonymizeProjectForAI(
      file_type: f.fileType ?? 'OTHER',
      page_count: f.pageCount ?? null,
      size_kb: Math.round((f.size ?? 0) / 1024),
+      ...(f.detectedLang ? { detected_lang: f.detectedLang } : {}),
+      ...(f.langConfidence != null ? { lang_confidence: f.langConfidence } : {}),
      ...(f.roundName ? { round_name: f.roundName } : {}),
      ...(f.isCurrentRound !== undefined ? { is_current_round: f.isCurrentRound } : {}),
      ...(f.textContent ? { text_content: f.textContent } : {}),