/**
 * Document Analyzer Service
 *
 * Extracts metadata from uploaded files:
 * - Page count (PDFs)
 * - Text preview (first ~2000 chars)
 * - Language detection via franc
 *
 * Runs optionally on upload (controlled by SystemSettings) and
 * retroactively via admin endpoint.
 */
import { prisma } from '@/lib/prisma'
import { getStorageProvider } from '@/lib/storage'

import { isParseableMimeType } from './file-content-extractor'
|
||
|
|
|
||
|
|
const TEXT_PREVIEW_LIMIT = 2000
|
||
|
|
const BATCH_SIZE = 10
// ─── Types ──────────────────────────────────────────────────────────────────
export type AnalysisResult = {
|
||
|
|
fileId: string
|
||
|
|
pageCount: number | null
|
||
|
|
textPreview: string | null
|
||
|
|
detectedLang: string | null
|
||
|
|
langConfidence: number | null
|
||
|
|
error?: string
|
||
|
|
}
// ─── Language Detection ──────────────────────────────────────────────────────
/**
|
||
|
|
* Detect language using franc. Returns ISO 639-3 code and confidence.
|
||
|
|
* franc returns a distance-based score where lower = better match.
|
||
|
|
* We convert to 0-1 confidence where 1 = perfect match.
|
||
|
|
*/
|
||
|
|
async function detectLanguage(
|
||
|
|
text: string
|
||
|
|
): Promise<{ lang: string; confidence: number }> {
|
||
|
|
if (!text || text.trim().length < 20) {
|
||
|
|
return { lang: 'und', confidence: 0 }
|
||
|
|
}
|
||
|
|
|
||
|
|
// Use a reasonable sample for detection (first 5000 chars)
|
||
|
|
const sample = text.slice(0, 5000)
|
||
|
|
|
||
|
|
const { francAll } = await import('franc')
|
||
|
|
const results = francAll(sample, { minLength: 10 })
|
||
|
|
|
||
|
|
if (!results || results.length === 0 || results[0][0] === 'und') {
|
||
|
|
return { lang: 'und', confidence: 0 }
|
||
|
|
}
|
||
|
|
|
||
|
|
const topLang = results[0][0]
|
||
|
|
const topScore = results[0][1] // 1.0 = best match, 0.0 = worst
|
||
|
|
|
||
|
|
// franc scores: 1.0 is best match, scale drops from there
|
||
|
|
// Convert to a 0-1 confidence
|
||
|
|
const confidence = Math.max(0, Math.min(1, topScore))
|
||
|
|
|
||
|
|
return { lang: topLang, confidence: Math.round(confidence * 100) / 100 }
|
||
|
|
}
// ─── Core Analysis ──────────────────────────────────────────────────────────
/**
|
||
|
|
* Analyze a single file: extract page count, text preview, and detect language.
|
||
|
|
* Downloads the file from storage, parses it, and returns results.
|
||
|
|
*/
|
||
|
|
export async function analyzeFileContent(
|
||
|
|
objectKey: string,
|
||
|
|
bucket: string,
|
||
|
|
mimeType: string,
|
||
|
|
fileName: string,
|
||
|
|
fileId: string
|
||
|
|
): Promise<AnalysisResult> {
|
||
|
|
const result: AnalysisResult = {
|
||
|
|
fileId,
|
||
|
|
pageCount: null,
|
||
|
|
textPreview: null,
|
||
|
|
detectedLang: null,
|
||
|
|
langConfidence: null,
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!isParseableMimeType(mimeType)) {
|
||
|
|
return { ...result, error: 'Unsupported mime type for analysis' }
|
||
|
|
}
|
||
|
|
|
||
|
|
try {
|
||
|
|
const storage = await getStorageProvider()
|
||
|
|
const buffer = await storage.getObject(objectKey)
|
||
|
|
|
||
|
|
let text = ''
|
||
|
|
let pageCount: number | null = null
|
||
|
|
|
||
|
|
if (mimeType === 'application/pdf') {
|
||
|
|
const pdfParseModule = await import('pdf-parse')
|
||
|
|
const pdfParse =
|
||
|
|
typeof pdfParseModule === 'function'
|
||
|
|
? pdfParseModule
|
||
|
|
: (pdfParseModule as any).default ?? pdfParseModule
|
||
|
|
const pdf = await pdfParse(buffer)
|
||
|
|
text = pdf.text || ''
|
||
|
|
pageCount = pdf.numpages ?? null
|
||
|
|
} else {
|
||
|
|
// Text-based files (plain text, CSV, markdown, HTML, RTF)
|
||
|
|
text = buffer.toString('utf-8')
|
||
|
|
}
|
||
|
|
|
||
|
|
result.pageCount = pageCount
|
||
|
|
|
||
|
|
// Text preview
|
||
|
|
if (text.trim()) {
|
||
|
|
result.textPreview =
|
||
|
|
text.length > TEXT_PREVIEW_LIMIT
|
||
|
|
? text.slice(0, TEXT_PREVIEW_LIMIT)
|
||
|
|
: text
|
||
|
|
}
|
||
|
|
|
||
|
|
// Language detection
|
||
|
|
if (text.trim().length >= 20) {
|
||
|
|
const langResult = await detectLanguage(text)
|
||
|
|
result.detectedLang = langResult.lang
|
||
|
|
result.langConfidence = langResult.confidence
|
||
|
|
}
|
||
|
|
|
||
|
|
return result
|
||
|
|
} catch (error) {
|
||
|
|
console.warn(
|
||
|
|
`[DocAnalyzer] Failed to analyze ${fileName}:`,
|
||
|
|
error instanceof Error ? error.message : error
|
||
|
|
)
|
||
|
|
return {
|
||
|
|
...result,
|
||
|
|
error: error instanceof Error ? error.message : 'Analysis failed',
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
// ─── DB-Integrated Operations ───────────────────────────────────────────────
/**
|
||
|
|
* Analyze a single file by ID and persist results to DB.
|
||
|
|
*/
|
||
|
|
export async function analyzeFile(fileId: string): Promise<AnalysisResult> {
|
||
|
|
const file = await prisma.projectFile.findUnique({
|
||
|
|
where: { id: fileId },
|
||
|
|
select: {
|
||
|
|
id: true,
|
||
|
|
objectKey: true,
|
||
|
|
bucket: true,
|
||
|
|
mimeType: true,
|
||
|
|
fileName: true,
|
||
|
|
},
|
||
|
|
})
|
||
|
|
|
||
|
|
if (!file) {
|
||
|
|
return {
|
||
|
|
fileId,
|
||
|
|
pageCount: null,
|
||
|
|
textPreview: null,
|
||
|
|
detectedLang: null,
|
||
|
|
langConfidence: null,
|
||
|
|
error: 'File not found',
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
const result = await analyzeFileContent(
|
||
|
|
file.objectKey,
|
||
|
|
file.bucket,
|
||
|
|
file.mimeType,
|
||
|
|
file.fileName,
|
||
|
|
file.id
|
||
|
|
)
|
||
|
|
|
||
|
|
// Persist results
|
||
|
|
await prisma.projectFile.update({
|
||
|
|
where: { id: fileId },
|
||
|
|
data: {
|
||
|
|
pageCount: result.pageCount,
|
||
|
|
textPreview: result.textPreview,
|
||
|
|
detectedLang: result.detectedLang,
|
||
|
|
langConfidence: result.langConfidence,
|
||
|
|
analyzedAt: new Date(),
|
||
|
|
},
|
||
|
|
})
|
||
|
|
|
||
|
|
return result
|
||
|
|
}
/**
|
||
|
|
* Analyze a single file by ID with a delay (for post-upload use).
|
||
|
|
* The delay accounts for presigned URL uploads where the file
|
||
|
|
* may not be in storage yet when the DB record is created.
|
||
|
|
*/
|
||
|
|
export async function analyzeFileDelayed(
|
||
|
|
fileId: string,
|
||
|
|
delayMs = 3000
|
||
|
|
): Promise<AnalysisResult> {
|
||
|
|
await new Promise((resolve) => setTimeout(resolve, delayMs))
|
||
|
|
return analyzeFile(fileId)
|
||
|
|
}
/**
|
||
|
|
* Analyze all files for a specific project.
|
||
|
|
*/
|
||
|
|
export async function analyzeProjectFiles(
|
||
|
|
projectId: string
|
||
|
|
): Promise<{ analyzed: number; failed: number; total: number }> {
|
||
|
|
const files = await prisma.projectFile.findMany({
|
||
|
|
where: { projectId },
|
||
|
|
select: {
|
||
|
|
id: true,
|
||
|
|
objectKey: true,
|
||
|
|
bucket: true,
|
||
|
|
mimeType: true,
|
||
|
|
fileName: true,
|
||
|
|
},
|
||
|
|
})
|
||
|
|
|
||
|
|
let analyzed = 0
|
||
|
|
let failed = 0
|
||
|
|
|
||
|
|
// Process in batches
|
||
|
|
for (let i = 0; i < files.length; i += BATCH_SIZE) {
|
||
|
|
const batch = files.slice(i, i + BATCH_SIZE)
|
||
|
|
const results = await Promise.allSettled(
|
||
|
|
batch.map(async (file) => {
|
||
|
|
if (!isParseableMimeType(file.mimeType)) {
|
||
|
|
// Mark non-parseable files as analyzed with no data
|
||
|
|
await prisma.projectFile.update({
|
||
|
|
where: { id: file.id },
|
||
|
|
data: { analyzedAt: new Date() },
|
||
|
|
})
|
||
|
|
return 'skipped'
|
||
|
|
}
|
||
|
|
|
||
|
|
const result = await analyzeFileContent(
|
||
|
|
file.objectKey,
|
||
|
|
file.bucket,
|
||
|
|
file.mimeType,
|
||
|
|
file.fileName,
|
||
|
|
file.id
|
||
|
|
)
|
||
|
|
|
||
|
|
await prisma.projectFile.update({
|
||
|
|
where: { id: file.id },
|
||
|
|
data: {
|
||
|
|
pageCount: result.pageCount,
|
||
|
|
textPreview: result.textPreview,
|
||
|
|
detectedLang: result.detectedLang,
|
||
|
|
langConfidence: result.langConfidence,
|
||
|
|
analyzedAt: new Date(),
|
||
|
|
},
|
||
|
|
})
|
||
|
|
|
||
|
|
return result.error ? 'failed' : 'analyzed'
|
||
|
|
})
|
||
|
|
)
|
||
|
|
|
||
|
|
for (const r of results) {
|
||
|
|
if (r.status === 'fulfilled') {
|
||
|
|
if (r.value === 'analyzed') analyzed++
|
||
|
|
else if (r.value === 'failed') failed++
|
||
|
|
} else {
|
||
|
|
failed++
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return { analyzed, failed, total: files.length }
|
||
|
|
}
/**
|
||
|
|
* Retroactive batch analysis: analyze all files that haven't been analyzed yet.
|
||
|
|
* Returns counts. Processes in batches to avoid memory issues.
|
||
|
|
*/
|
||
|
|
export async function analyzeAllUnanalyzed(): Promise<{
|
||
|
|
analyzed: number
|
||
|
|
failed: number
|
||
|
|
skipped: number
|
||
|
|
total: number
|
||
|
|
}> {
|
||
|
|
const files = await prisma.projectFile.findMany({
|
||
|
|
where: { analyzedAt: null },
|
||
|
|
select: {
|
||
|
|
id: true,
|
||
|
|
objectKey: true,
|
||
|
|
bucket: true,
|
||
|
|
mimeType: true,
|
||
|
|
fileName: true,
|
||
|
|
},
|
||
|
|
orderBy: { createdAt: 'desc' },
|
||
|
|
})
|
||
|
|
|
||
|
|
let analyzed = 0
|
||
|
|
let failed = 0
|
||
|
|
let skipped = 0
|
||
|
|
|
||
|
|
for (let i = 0; i < files.length; i += BATCH_SIZE) {
|
||
|
|
const batch = files.slice(i, i + BATCH_SIZE)
|
||
|
|
const results = await Promise.allSettled(
|
||
|
|
batch.map(async (file) => {
|
||
|
|
if (!isParseableMimeType(file.mimeType)) {
|
||
|
|
await prisma.projectFile.update({
|
||
|
|
where: { id: file.id },
|
||
|
|
data: { analyzedAt: new Date() },
|
||
|
|
})
|
||
|
|
return 'skipped'
|
||
|
|
}
|
||
|
|
|
||
|
|
const result = await analyzeFileContent(
|
||
|
|
file.objectKey,
|
||
|
|
file.bucket,
|
||
|
|
file.mimeType,
|
||
|
|
file.fileName,
|
||
|
|
file.id
|
||
|
|
)
|
||
|
|
|
||
|
|
await prisma.projectFile.update({
|
||
|
|
where: { id: file.id },
|
||
|
|
data: {
|
||
|
|
pageCount: result.pageCount,
|
||
|
|
textPreview: result.textPreview,
|
||
|
|
detectedLang: result.detectedLang,
|
||
|
|
langConfidence: result.langConfidence,
|
||
|
|
analyzedAt: new Date(),
|
||
|
|
},
|
||
|
|
})
|
||
|
|
|
||
|
|
return result.error ? 'failed' : 'analyzed'
|
||
|
|
})
|
||
|
|
)
|
||
|
|
|
||
|
|
for (const r of results) {
|
||
|
|
if (r.status === 'fulfilled') {
|
||
|
|
if (r.value === 'analyzed') analyzed++
|
||
|
|
else if (r.value === 'failed') failed++
|
||
|
|
else if (r.value === 'skipped') skipped++
|
||
|
|
} else {
|
||
|
|
failed++
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
console.log(
|
||
|
|
`[DocAnalyzer] Batch progress: ${i + batch.length}/${files.length} (${analyzed} analyzed, ${skipped} skipped, ${failed} failed)`
|
||
|
|
)
|
||
|
|
}
|
||
|
|
|
||
|
|
return { analyzed, failed, skipped, total: files.length }
|
||
|
|
}
/**
|
||
|
|
* Check if auto-analysis is enabled via SystemSettings.
|
||
|
|
*/
|
||
|
|
export async function isAutoAnalysisEnabled(): Promise<boolean> {
|
||
|
|
try {
|
||
|
|
const setting = await prisma.systemSettings.findUnique({
|
||
|
|
where: { key: 'file_analysis_auto_enabled' },
|
||
|
|
})
|
||
|
|
// Default to true if setting doesn't exist
|
||
|
|
return setting?.value !== 'false'
|
||
|
|
} catch {
|
||
|
|
return true
|
||
|
|
}
|
||
|
|
}