Files
MOPC-Portal/src/server/services/document-analyzer.ts

368 lines
9.9 KiB
TypeScript
Raw Normal View History

/**
* Document Analyzer Service
*
* Extracts metadata from uploaded files:
* - Page count (PDFs)
* - Text preview (first ~2000 chars)
* - Language detection via franc
*
* Runs optionally on upload (controlled by SystemSettings) and
* retroactively via admin endpoint.
*/
import { getStorageProvider } from '@/lib/storage'
import { isParseableMimeType } from './file-content-extractor'
import { prisma } from '@/lib/prisma'
// Maximum number of characters of extracted text kept as the stored preview.
const TEXT_PREVIEW_LIMIT = 2000
// Number of files analyzed concurrently in one batch by the bulk operations.
const BATCH_SIZE = 10
// ─── Types ──────────────────────────────────────────────────────────────────
/**
 * Outcome of analyzing a single file.
 *
 * Every metadata field is `null` until the analyzer manages to fill it in,
 * so a result carrying `error` may still hold partially populated fields.
 */
export type AnalysisResult = {
/** ID of the file this result belongs to, echoed back from the caller. */
fileId: string
/** Page count reported by the PDF parser; `null` for non-PDF files. */
pageCount: number | null
/** Leading portion of the extracted text (at most TEXT_PREVIEW_LIMIT characters); `null` when the text is empty or whitespace-only. */
textPreview: string | null
/** Language code from language detection (`'und'` when undetermined); `null` when the trimmed text is shorter than 20 characters. */
detectedLang: string | null
/** Detection confidence in the range 0-1, rounded to two decimals; `null` when detection did not run. */
langConfidence: number | null
/** Human-readable reason when the file was not found, not parseable, or analysis threw. */
error?: string
}
// ─── Language Detection ──────────────────────────────────────────────────────
/**
 * Detect the language of `text` using franc.
 *
 * Returns an ISO 639-3 code plus a confidence in the range 0-1 (two decimals).
 * The first entry returned by `francAll` is treated as the best candidate and
 * its score is used as the confidence: 1.0 is the best possible match and
 * lower values are weaker matches.
 *
 * NOTE(review): franc appears to normalise scores relative to the best
 * candidate, so the top score may always be 1 — confirm that this value is a
 * meaningful confidence before relying on it for thresholds.
 *
 * @param text - Raw extracted text; may be empty.
 * @returns `{ lang: 'und', confidence: 0 }` when the trimmed text is shorter
 *   than 20 characters or franc cannot determine a language.
 */
async function detectLanguage(
  text: string
): Promise<{ lang: string; confidence: number }> {
  // Trim first so leading whitespace (common in PDF text output) does not
  // consume the detection sample.
  const trimmed = (text ?? '').trim()
  if (trimmed.length < 20) {
    return { lang: 'und', confidence: 0 }
  }

  // A bounded sample keeps detection cheap on very large documents.
  const sample = trimmed.slice(0, 5000)

  // Loaded lazily so the dependency is only pulled in when detection runs.
  const { francAll } = await import('franc')
  const candidates = francAll(sample, { minLength: 10 })
  const best = candidates?.[0]
  if (!best || best[0] === 'und') {
    return { lang: 'und', confidence: 0 }
  }

  const [topLang, topScore] = best
  // Clamp defensively, then round to two decimals for storage.
  const clamped = Math.max(0, Math.min(1, topScore))
  return { lang: topLang, confidence: Math.round(clamped * 100) / 100 }
}
// ─── Core Analysis ──────────────────────────────────────────────────────────
/**
 * Cut `text` down to at most `limit` UTF-16 code units without leaving a
 * dangling high surrogate at the end (which would otherwise be half of a
 * character such as an emoji or a rare CJK ideograph).
 */
function truncatePreview(text: string, limit: number): string {
  if (text.length <= limit) {
    return text
  }
  const lastUnit = text.charCodeAt(limit - 1)
  const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff
  return text.slice(0, endsOnHighSurrogate ? limit - 1 : limit)
}

/**
 * Analyze a single file: extract page count, text preview, and detect language.
 *
 * Downloads the object from storage, parses it (PDF via pdf-parse, anything
 * else decoded as UTF-8 text), and returns the collected metadata. Errors
 * raised while downloading, parsing, or detecting the language are caught,
 * logged with the file name, and reported through the `error` field; fields
 * filled in before the failure are kept.
 *
 * @param objectKey - Storage key of the object to download.
 * @param bucket - Bucket the object lives in.
 *   NOTE(review): currently not passed to `storage.getObject` — confirm the
 *   storage provider resolves the bucket on its own.
 * @param mimeType - Mime type used to pick the parser; non-parseable types
 *   short-circuit with an error result and no download.
 * @param fileName - Only used for log output.
 * @param fileId - Echoed back in the result.
 * @returns The analysis result; `error` is set when analysis was skipped or failed.
 */
export async function analyzeFileContent(
  objectKey: string,
  bucket: string,
  mimeType: string,
  fileName: string,
  fileId: string
): Promise<AnalysisResult> {
  const result: AnalysisResult = {
    fileId,
    pageCount: null,
    textPreview: null,
    detectedLang: null,
    langConfidence: null,
  }

  if (!isParseableMimeType(mimeType)) {
    return { ...result, error: 'Unsupported mime type for analysis' }
  }

  try {
    const storage = await getStorageProvider()
    const buffer = await storage.getObject(objectKey)

    let text: string
    if (mimeType === 'application/pdf') {
      // The parser may be exposed as the module itself or under `default`,
      // depending on how the module system loads it.
      const pdfParseModule = await import('pdf-parse')
      const pdfParse =
        typeof pdfParseModule === 'function'
          ? pdfParseModule
          : (pdfParseModule as any).default ?? pdfParseModule
      const pdf = await pdfParse(buffer)
      text = pdf.text || ''
      result.pageCount = pdf.numpages ?? null
    } else {
      // Text-based files (plain text, CSV, markdown, HTML, RTF)
      text = buffer.toString('utf-8')
    }

    const trimmedLength = text.trim().length

    // Text preview: taken from the untrimmed text, skipped when there is
    // nothing but whitespace.
    if (trimmedLength > 0) {
      result.textPreview = truncatePreview(text, TEXT_PREVIEW_LIMIT)
    }

    // Language detection needs a minimum amount of text to be meaningful.
    if (trimmedLength >= 20) {
      const langResult = await detectLanguage(text)
      result.detectedLang = langResult.lang
      result.langConfidence = langResult.confidence
    }

    return result
  } catch (error) {
    const message = error instanceof Error ? error.message : null
    console.warn(
      `[DocAnalyzer] Failed to analyze ${fileName}:`,
      message ?? error
    )
    return {
      ...result,
      error: message ?? 'Analysis failed',
    }
  }
}
// ─── DB-Integrated Operations ───────────────────────────────────────────────
/**
 * Look up a project file by ID, run the content analysis on it, and write the
 * outcome back to its database row.
 *
 * The row is updated (including `analyzedAt`) whether or not the analysis
 * reported an error. When no row exists for `fileId`, nothing is written and
 * an all-null result with `error: 'File not found'` is returned.
 *
 * @param fileId - ID of the project file to analyze.
 * @returns The analysis result that was persisted.
 */
export async function analyzeFile(fileId: string): Promise<AnalysisResult> {
  const record = await prisma.projectFile.findUnique({
    where: { id: fileId },
    select: {
      id: true,
      objectKey: true,
      bucket: true,
      mimeType: true,
      fileName: true,
    },
  })

  if (record == null) {
    const notFound: AnalysisResult = {
      fileId,
      pageCount: null,
      textPreview: null,
      detectedLang: null,
      langConfidence: null,
      error: 'File not found',
    }
    return notFound
  }

  const { objectKey, bucket, mimeType, fileName, id } = record
  const outcome = await analyzeFileContent(
    objectKey,
    bucket,
    mimeType,
    fileName,
    id
  )

  // Store whatever the analyzer produced and stamp the row as analyzed.
  const { pageCount, textPreview, detectedLang, langConfidence } = outcome
  await prisma.projectFile.update({
    where: { id: fileId },
    data: {
      pageCount,
      textPreview,
      detectedLang,
      langConfidence,
      analyzedAt: new Date(),
    },
  })

  return outcome
}
/**
 * Wait for `delayMs` milliseconds, then analyze the file with the given ID.
 *
 * Intended for post-upload use: with presigned URL uploads the DB record can
 * exist before the object has landed in storage, so the analysis is deferred.
 *
 * @param fileId - ID of the project file to analyze.
 * @param delayMs - How long to wait before analyzing (default 3000 ms).
 * @returns The result of `analyzeFile` for this ID.
 */
export async function analyzeFileDelayed(
  fileId: string,
  delayMs = 3000
): Promise<AnalysisResult> {
  const pause = new Promise((wake) => {
    setTimeout(wake, delayMs)
  })
  await pause
  return analyzeFile(fileId)
}
/** Columns of a project file row that the bulk analysis helpers read. */
type AnalyzableFile = {
  id: string
  objectKey: string
  bucket: string
  mimeType: string
  fileName: string
}

/** How a single file ended up after one bulk-analysis pass. */
type FileOutcome = 'analyzed' | 'failed' | 'skipped'

/**
 * Analyze one file and persist the outcome to its row.
 * Non-parseable files are only stamped with `analyzedAt` and reported as skipped.
 */
async function analyzeAndPersist(file: AnalyzableFile): Promise<FileOutcome> {
  if (!isParseableMimeType(file.mimeType)) {
    // Mark non-parseable files as analyzed with no data
    await prisma.projectFile.update({
      where: { id: file.id },
      data: { analyzedAt: new Date() },
    })
    return 'skipped'
  }

  const result = await analyzeFileContent(
    file.objectKey,
    file.bucket,
    file.mimeType,
    file.fileName,
    file.id
  )
  await prisma.projectFile.update({
    where: { id: file.id },
    data: {
      pageCount: result.pageCount,
      textPreview: result.textPreview,
      detectedLang: result.detectedLang,
      langConfidence: result.langConfidence,
      analyzedAt: new Date(),
    },
  })
  return result.error ? 'failed' : 'analyzed'
}

/**
 * Run `analyzeAndPersist` concurrently over one batch and tally the outcomes.
 * A rejected entry (for example a failed DB update) counts as failed and its
 * reason is logged instead of being dropped.
 */
async function analyzeBatch(
  batch: readonly AnalyzableFile[]
): Promise<Record<FileOutcome, number>> {
  const counts: Record<FileOutcome, number> = {
    analyzed: 0,
    failed: 0,
    skipped: 0,
  }
  const settled = await Promise.allSettled(
    batch.map((file) => analyzeAndPersist(file))
  )
  settled.forEach((entry, index) => {
    if (entry.status === 'fulfilled') {
      counts[entry.value] += 1
      return
    }
    counts.failed += 1
    const reason: unknown = entry.reason
    console.warn(
      `[DocAnalyzer] Failed to process ${batch[index]?.fileName ?? 'unknown file'}:`,
      reason instanceof Error ? reason.message : reason
    )
  })
  return counts
}

/**
 * Analyze all files for a specific project.
 *
 * Files are processed in batches of BATCH_SIZE. Non-parseable files are
 * stamped as analyzed but appear in neither `analyzed` nor `failed`, so the
 * two counts can add up to less than `total`.
 *
 * @param projectId - Project whose files should be analyzed.
 * @returns Counts of analyzed and failed files plus the total number of files found.
 */
export async function analyzeProjectFiles(
  projectId: string
): Promise<{ analyzed: number; failed: number; total: number }> {
  const files = await prisma.projectFile.findMany({
    where: { projectId },
    select: {
      id: true,
      objectKey: true,
      bucket: true,
      mimeType: true,
      fileName: true,
    },
  })

  let analyzed = 0
  let failed = 0

  // Process in batches
  for (let i = 0; i < files.length; i += BATCH_SIZE) {
    const counts = await analyzeBatch(files.slice(i, i + BATCH_SIZE))
    analyzed += counts.analyzed
    failed += counts.failed
  }

  return { analyzed, failed, total: files.length }
}

/**
 * Retroactive batch analysis: analyze all files that haven't been analyzed yet
 * (rows whose `analyzedAt` is null), newest first.
 *
 * Files are processed in batches of BATCH_SIZE and a progress line is logged
 * after every batch.
 *
 * @returns Counts of analyzed, failed, and skipped files plus the total
 *   number of files that were pending.
 */
export async function analyzeAllUnanalyzed(): Promise<{
  analyzed: number
  failed: number
  skipped: number
  total: number
}> {
  const files = await prisma.projectFile.findMany({
    where: { analyzedAt: null },
    select: {
      id: true,
      objectKey: true,
      bucket: true,
      mimeType: true,
      fileName: true,
    },
    orderBy: { createdAt: 'desc' },
  })

  let analyzed = 0
  let failed = 0
  let skipped = 0

  for (let i = 0; i < files.length; i += BATCH_SIZE) {
    const batch = files.slice(i, i + BATCH_SIZE)
    const counts = await analyzeBatch(batch)
    analyzed += counts.analyzed
    failed += counts.failed
    skipped += counts.skipped
    console.log(
      `[DocAnalyzer] Batch progress: ${i + batch.length}/${files.length} (${analyzed} analyzed, ${skipped} skipped, ${failed} failed)`
    )
  }

  return { analyzed, failed, skipped, total: files.length }
}
/**
 * Report whether files should be analyzed automatically, based on the
 * `file_analysis_auto_enabled` row in SystemSettings.
 *
 * Auto-analysis counts as enabled unless the stored value is exactly the
 * string `'false'`; a missing row or a failed lookup also yields `true`.
 */
export async function isAutoAnalysisEnabled(): Promise<boolean> {
  let enabled = true
  try {
    const row = await prisma.systemSettings.findUnique({
      where: { key: 'file_analysis_auto_enabled' },
    })
    // Only an explicit 'false' turns the feature off.
    enabled = row?.value !== 'false'
  } catch {
    // Lookup failed: keep the enabled-by-default answer.
  }
  return enabled
}