Add document analysis: page count, text extraction & language detection
All checks were successful
Build and Push Docker Image / build (push) Successful in 11m7s

Introduces a document analyzer service that extracts page count (via pdf-parse),
text preview, and detected language (via franc) from uploaded files. Analysis runs
automatically on upload (configurable via SystemSettings) and can be triggered
retroactively for existing files. Results are displayed as badges in the FileViewer
and fed to AI screening for language-based filtering criteria.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt
2026-02-17 10:08:04 +01:00
parent 771f35c695
commit c9640c6086
13 changed files with 565 additions and 10 deletions

View File

@@ -0,0 +1,367 @@
/**
* Document Analyzer Service
*
* Extracts metadata from uploaded files:
* - Page count (PDFs)
* - Text preview (first ~2000 chars)
* - Language detection via franc
*
* Runs optionally on upload (controlled by SystemSettings) and
* retroactively via admin endpoint.
*/
import { getStorageProvider } from '@/lib/storage'
import { isParseableMimeType } from './file-content-extractor'
import { prisma } from '@/lib/prisma'
// Maximum number of characters persisted as a file's text preview.
const TEXT_PREVIEW_LIMIT = 2000
// Number of files processed concurrently per batch in bulk analysis runs.
const BATCH_SIZE = 10
// ─── Types ──────────────────────────────────────────────────────────────────
export type AnalysisResult = {
  fileId: string
  // Page count for PDFs; null for text-based files.
  pageCount: number | null
  // Up to TEXT_PREVIEW_LIMIT characters of extracted text; null when empty.
  textPreview: string | null
  // ISO 639-3 code from franc ('und' = undetermined); null when text too short.
  detectedLang: string | null
  // Detection confidence in [0, 1], rounded to two decimals; null when not run.
  langConfidence: number | null
  // Present when analysis failed or the mime type is unsupported.
  error?: string
}
// ─── Language Detection ──────────────────────────────────────────────────────
/**
 * Detect the language of `text` using franc.
 *
 * francAll returns candidates scored 0..1 where 1.0 is the BEST match
 * (higher is better — it is not a distance). The top score is clamped
 * to [0, 1] defensively and rounded to two decimals for storage.
 *
 * Returns `{ lang: 'und', confidence: 0 }` (undetermined) when the
 * text has fewer than 20 meaningful characters or franc cannot decide.
 *
 * @param text Full extracted text; only the first 5000 chars are sampled.
 */
async function detectLanguage(
  text: string
): Promise<{ lang: string; confidence: number }> {
  // Too little signal for reliable trigram-based detection.
  if (!text || text.trim().length < 20) {
    return { lang: 'und', confidence: 0 }
  }
  // A 5000-char sample is plenty for detection and keeps it fast.
  const sample = text.slice(0, 5000)
  // Lazy import: franc is only needed when detection actually runs.
  const { francAll } = await import('franc')
  const results = francAll(sample, { minLength: 10 })
  if (!results || results.length === 0 || results[0][0] === 'und') {
    return { lang: 'und', confidence: 0 }
  }
  const [topLang, topScore] = results[0] // score: 1.0 = best match
  // Clamp defensively, then round to 2 decimals.
  const confidence = Math.max(0, Math.min(1, topScore))
  return { lang: topLang, confidence: Math.round(confidence * 100) / 100 }
}
// ─── Core Analysis ──────────────────────────────────────────────────────────
/**
 * Analyze a single file: extract page count, text preview, and detect language.
 *
 * Downloads the object from storage, parses it according to its mime type
 * (pdf-parse for PDFs, UTF-8 decoding otherwise), and returns the results.
 * Never throws: failures are reported via the `error` field.
 *
 * @param objectKey Storage key of the object to download.
 * @param bucket    Storage bucket name (kept for interface compatibility).
 * @param mimeType  Mime type used to pick the parsing strategy.
 * @param fileName  Human-readable name, used only for log messages.
 * @param fileId    DB id echoed back in the result.
 */
export async function analyzeFileContent(
  objectKey: string,
  bucket: string,
  mimeType: string,
  fileName: string,
  fileId: string
): Promise<AnalysisResult> {
  const outcome: AnalysisResult = {
    fileId,
    pageCount: null,
    textPreview: null,
    detectedLang: null,
    langConfidence: null,
  }
  if (!isParseableMimeType(mimeType)) {
    return { ...outcome, error: 'Unsupported mime type for analysis' }
  }
  try {
    const storage = await getStorageProvider()
    const buffer = await storage.getObject(objectKey)

    let extractedText = ''
    if (mimeType === 'application/pdf') {
      // pdf-parse may be exposed directly or under `default` depending on
      // how the module was bundled/transpiled.
      const mod = await import('pdf-parse')
      const parsePdf =
        typeof mod === 'function' ? mod : ((mod as any).default ?? mod)
      const parsed = await parsePdf(buffer)
      extractedText = parsed.text || ''
      outcome.pageCount = parsed.numpages ?? null
    } else {
      // Text-based files (plain text, CSV, markdown, HTML, RTF)
      extractedText = buffer.toString('utf-8')
    }

    // Text preview: cap at TEXT_PREVIEW_LIMIT characters.
    const trimmed = extractedText.trim()
    if (trimmed) {
      outcome.textPreview =
        extractedText.length > TEXT_PREVIEW_LIMIT
          ? extractedText.slice(0, TEXT_PREVIEW_LIMIT)
          : extractedText
    }

    // Language detection only when there is enough signal.
    if (trimmed.length >= 20) {
      const detection = await detectLanguage(extractedText)
      outcome.detectedLang = detection.lang
      outcome.langConfidence = detection.confidence
    }
    return outcome
  } catch (error) {
    console.warn(
      `[DocAnalyzer] Failed to analyze ${fileName}:`,
      error instanceof Error ? error.message : error
    )
    return {
      ...outcome,
      error: error instanceof Error ? error.message : 'Analysis failed',
    }
  }
}
// ─── DB-Integrated Operations ───────────────────────────────────────────────
/**
 * Analyze a single file by ID and persist the results to the DB.
 *
 * Looks up the ProjectFile record, runs content analysis, and writes the
 * extracted metadata (including nulls) plus an `analyzedAt` timestamp back.
 * Returns a result with `error: 'File not found'` when the id is unknown.
 */
export async function analyzeFile(fileId: string): Promise<AnalysisResult> {
  const record = await prisma.projectFile.findUnique({
    where: { id: fileId },
    select: {
      id: true,
      objectKey: true,
      bucket: true,
      mimeType: true,
      fileName: true,
    },
  })
  if (record === null) {
    return {
      fileId,
      pageCount: null,
      textPreview: null,
      detectedLang: null,
      langConfidence: null,
      error: 'File not found',
    }
  }
  const analysis = await analyzeFileContent(
    record.objectKey,
    record.bucket,
    record.mimeType,
    record.fileName,
    record.id
  )
  // Persist whatever was extracted and stamp the file as analyzed.
  await prisma.projectFile.update({
    where: { id: fileId },
    data: {
      pageCount: analysis.pageCount,
      textPreview: analysis.textPreview,
      detectedLang: analysis.detectedLang,
      langConfidence: analysis.langConfidence,
      analyzedAt: new Date(),
    },
  })
  return analysis
}
/**
 * Analyze a single file by ID after waiting `delayMs` milliseconds.
 *
 * The delay accounts for presigned-URL uploads, where the object may not
 * yet exist in storage when the DB record is created.
 */
export async function analyzeFileDelayed(
  fileId: string,
  delayMs = 3000
): Promise<AnalysisResult> {
  const sleep = (ms: number) =>
    new Promise<void>((resolve) => setTimeout(resolve, ms))
  await sleep(delayMs)
  return analyzeFile(fileId)
}
/**
 * Analyze every file belonging to a project, in batches of BATCH_SIZE.
 *
 * Non-parseable files are stamped `analyzedAt` with no analysis data so
 * they are not re-processed. Returns counts of analyzed/failed files and
 * the total number of files considered.
 */
export async function analyzeProjectFiles(
  projectId: string
): Promise<{ analyzed: number; failed: number; total: number }> {
  const files = await prisma.projectFile.findMany({
    where: { projectId },
    select: {
      id: true,
      objectKey: true,
      bucket: true,
      mimeType: true,
      fileName: true,
    },
  })
  const counts = { analyzed: 0, failed: 0 }
  // Process in fixed-size batches to bound concurrency.
  for (let offset = 0; offset < files.length; offset += BATCH_SIZE) {
    const outcomes = await Promise.allSettled(
      files.slice(offset, offset + BATCH_SIZE).map(async (file) => {
        if (!isParseableMimeType(file.mimeType)) {
          // Mark non-parseable files as analyzed with no data
          await prisma.projectFile.update({
            where: { id: file.id },
            data: { analyzedAt: new Date() },
          })
          return 'skipped'
        }
        const analysis = await analyzeFileContent(
          file.objectKey,
          file.bucket,
          file.mimeType,
          file.fileName,
          file.id
        )
        await prisma.projectFile.update({
          where: { id: file.id },
          data: {
            pageCount: analysis.pageCount,
            textPreview: analysis.textPreview,
            detectedLang: analysis.detectedLang,
            langConfidence: analysis.langConfidence,
            analyzedAt: new Date(),
          },
        })
        return analysis.error ? 'failed' : 'analyzed'
      })
    )
    for (const outcome of outcomes) {
      if (outcome.status === 'rejected') {
        counts.failed++
      } else if (outcome.value === 'analyzed') {
        counts.analyzed++
      } else if (outcome.value === 'failed') {
        counts.failed++
      }
      // 'skipped' is intentionally not counted here (matches return shape).
    }
  }
  return { analyzed: counts.analyzed, failed: counts.failed, total: files.length }
}
/**
 * Retroactive batch analysis: analyze every file without an `analyzedAt`
 * timestamp, newest first, in batches of BATCH_SIZE to avoid memory issues.
 *
 * Non-parseable files are stamped as analyzed and counted as skipped.
 * Logs progress after each batch and returns aggregate counts.
 */
export async function analyzeAllUnanalyzed(): Promise<{
  analyzed: number
  failed: number
  skipped: number
  total: number
}> {
  const pending = await prisma.projectFile.findMany({
    where: { analyzedAt: null },
    select: {
      id: true,
      objectKey: true,
      bucket: true,
      mimeType: true,
      fileName: true,
    },
    orderBy: { createdAt: 'desc' },
  })
  const tally: Record<'analyzed' | 'failed' | 'skipped', number> = {
    analyzed: 0,
    failed: 0,
    skipped: 0,
  }
  for (let start = 0; start < pending.length; start += BATCH_SIZE) {
    const batch = pending.slice(start, start + BATCH_SIZE)
    const settled = await Promise.allSettled(
      batch.map(async (file) => {
        if (!isParseableMimeType(file.mimeType)) {
          // Stamp non-parseable files so they are not picked up again.
          await prisma.projectFile.update({
            where: { id: file.id },
            data: { analyzedAt: new Date() },
          })
          return 'skipped' as const
        }
        const analysis = await analyzeFileContent(
          file.objectKey,
          file.bucket,
          file.mimeType,
          file.fileName,
          file.id
        )
        await prisma.projectFile.update({
          where: { id: file.id },
          data: {
            pageCount: analysis.pageCount,
            textPreview: analysis.textPreview,
            detectedLang: analysis.detectedLang,
            langConfidence: analysis.langConfidence,
            analyzedAt: new Date(),
          },
        })
        return analysis.error ? ('failed' as const) : ('analyzed' as const)
      })
    )
    for (const entry of settled) {
      if (entry.status === 'fulfilled') tally[entry.value]++
      else tally.failed++
    }
    console.log(
      `[DocAnalyzer] Batch progress: ${start + batch.length}/${pending.length} (${tally.analyzed} analyzed, ${tally.skipped} skipped, ${tally.failed} failed)`
    )
  }
  return { ...tally, total: pending.length }
}
/**
 * Check whether automatic file analysis is enabled via SystemSettings.
 *
 * Defaults to enabled: only an explicit stored value of 'false' disables
 * auto-analysis, and DB lookup failures fall back to enabled.
 */
export async function isAutoAnalysisEnabled(): Promise<boolean> {
  try {
    const row = await prisma.systemSettings.findUnique({
      where: { key: 'file_analysis_auto_enabled' },
    })
    return row?.value !== 'false'
  } catch {
    // Best-effort: treat lookup failure as "enabled".
    return true
  }
}