Files
MOPC-Portal/src/server/services/document-analyzer.ts
Matt 3e70de3a5a Add Anthropic API, test environment, remove locale settings
Feature 1: Anthropic API Integration
- Add @anthropic-ai/sdk with adapter wrapping OpenAI-shaped interface
- Support Claude models (opus, sonnet, haiku) with extended thinking
- Auto-reset model on provider switch, JSON retry logic
- Add Claude model pricing to ai-usage tracker
- Update AI settings form with Anthropic provider option

Feature 2: Remove Locale Settings UI
- Strip Localization tab from admin settings
- Remove i18n settings from router inferCategory and getFeatureFlags
- Keep franc document language detection intact

Feature 3: Test Environment with Role Impersonation
- Add isTest field to User, Program, Project, Competition models
- Test environment service: create/teardown with realistic dummy data
- JWT-based impersonation for test users (@test.local emails)
- Impersonation banner with quick-switch between test roles
- Test environment panel in admin settings (SUPER_ADMIN only)
- Email redirect: @test.local emails routed to admin with [TEST] prefix
- Complete data isolation: 45+ isTest:false filters across platform
  - All global queries on User/Project/Program/Competition
  - AI services blocked from processing test data
  - Cron jobs skip test rounds/users
  - Analytics/exports exclude test data
  - Admin layout/pickers hide test programs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-21 17:28:07 +01:00

404 lines
11 KiB
TypeScript

/**
* Document Analyzer Service
*
* Extracts metadata from uploaded files:
* - Page count (PDFs)
* - Text preview (first ~2000 chars)
* - Language detection via franc
*
* Supports: PDF (via unpdf), Word .docx (via mammoth), plain text files.
* Runs optionally on upload (controlled by SystemSettings) and
* retroactively via admin endpoint.
*/
import { getStorageProvider } from '@/lib/storage'
import { prisma } from '@/lib/prisma'
const TEXT_PREVIEW_LIMIT = 2000
const BATCH_SIZE = 10
const ANALYZABLE_MIME_TYPES = [
'application/pdf',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // .docx
'application/msword', // .doc (limited support)
'text/plain',
'text/csv',
'text/markdown',
'text/html',
'application/rtf',
]
function isAnalyzableMimeType(mimeType: string): boolean {
return ANALYZABLE_MIME_TYPES.some((t) => mimeType.startsWith(t))
}
/**
* Strip null bytes from extracted text — PostgreSQL rejects \x00 in UTF-8 text columns.
*/
function sanitizeText(text: string): string {
return text.replace(/\0/g, '')
}
// ─── Types ──────────────────────────────────────────────────────────────────
export type AnalysisResult = {
fileId: string
pageCount: number | null
textPreview: string | null
detectedLang: string | null
langConfidence: number | null
error?: string
}
// ─── Language Detection ──────────────────────────────────────────────────────
/**
* Detect language using franc. Returns ISO 639-3 code and confidence.
*/
async function detectLanguage(
text: string
): Promise<{ lang: string; confidence: number }> {
if (!text || text.trim().length < 20) {
return { lang: 'und', confidence: 0 }
}
const sample = text.slice(0, 5000)
const { francAll } = await import('franc')
const results = francAll(sample, { minLength: 10 })
if (!results || results.length === 0 || results[0][0] === 'und') {
return { lang: 'und', confidence: 0 }
}
const topLang = results[0][0]
const topScore = results[0][1]
const confidence = Math.max(0, Math.min(1, topScore))
return { lang: topLang, confidence: Math.round(confidence * 100) / 100 }
}
// ─── Parsers ─────────────────────────────────────────────────────────────────
async function parsePdf(
buffer: Buffer
): Promise<{ text: string; pageCount: number }> {
const { extractText, getDocumentProxy } = await import('unpdf')
const pdf = await getDocumentProxy(new Uint8Array(buffer))
const { totalPages, text } = await extractText(pdf, { mergePages: true })
return { text: text as string, pageCount: totalPages }
}
async function parseDocx(
buffer: Buffer
): Promise<{ text: string }> {
const mammoth = await import('mammoth')
const result = await mammoth.extractRawText({ buffer })
return { text: result.value }
}
// ─── Core Analysis ──────────────────────────────────────────────────────────
/**
* Analyze a single file: extract page count, text preview, and detect language.
* Downloads the file from storage, parses it, and returns results.
*/
export async function analyzeFileContent(
objectKey: string,
bucket: string,
mimeType: string,
fileName: string,
fileId: string
): Promise<AnalysisResult> {
const result: AnalysisResult = {
fileId,
pageCount: null,
textPreview: null,
detectedLang: null,
langConfidence: null,
}
if (!isAnalyzableMimeType(mimeType)) {
return { ...result, error: 'Unsupported mime type for analysis' }
}
try {
const storage = await getStorageProvider()
const buffer = await storage.getObject(objectKey)
let text = ''
let pageCount: number | null = null
if (mimeType === 'application/pdf') {
const parsed = await parsePdf(buffer)
text = sanitizeText(parsed.text)
pageCount = parsed.pageCount
} else if (
mimeType ===
'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
mimeType === 'application/msword'
) {
const parsed = await parseDocx(buffer)
text = sanitizeText(parsed.text)
} else {
// Text-based files (plain text, CSV, markdown, HTML, RTF)
text = sanitizeText(buffer.toString('utf-8'))
}
result.pageCount = pageCount
// Text preview
if (text.trim()) {
result.textPreview =
text.length > TEXT_PREVIEW_LIMIT
? text.slice(0, TEXT_PREVIEW_LIMIT)
: text
}
// Language detection
if (text.trim().length >= 20) {
const langResult = await detectLanguage(text)
result.detectedLang = langResult.lang
result.langConfidence = langResult.confidence
}
return result
} catch (error) {
console.warn(
`[DocAnalyzer] Failed to analyze ${fileName}:`,
error instanceof Error ? error.message : error
)
return {
...result,
error: error instanceof Error ? error.message : 'Analysis failed',
}
}
}
// ─── DB-Integrated Operations ───────────────────────────────────────────────
/**
* Analyze a single file by ID and persist results to DB.
*/
export async function analyzeFile(fileId: string): Promise<AnalysisResult> {
const file = await prisma.projectFile.findUnique({
where: { id: fileId },
select: {
id: true,
objectKey: true,
bucket: true,
mimeType: true,
fileName: true,
},
})
if (!file) {
return {
fileId,
pageCount: null,
textPreview: null,
detectedLang: null,
langConfidence: null,
error: 'File not found',
}
}
const result = await analyzeFileContent(
file.objectKey,
file.bucket,
file.mimeType,
file.fileName,
file.id
)
// Persist results
await prisma.projectFile.update({
where: { id: fileId },
data: {
pageCount: result.pageCount,
textPreview: result.textPreview,
detectedLang: result.detectedLang,
langConfidence: result.langConfidence,
analyzedAt: new Date(),
},
})
return result
}
/**
* Analyze a single file by ID with a delay (for post-upload use).
* The delay accounts for presigned URL uploads where the file
* may not be in storage yet when the DB record is created.
*/
export async function analyzeFileDelayed(
fileId: string,
delayMs = 3000
): Promise<AnalysisResult> {
await new Promise((resolve) => setTimeout(resolve, delayMs))
return analyzeFile(fileId)
}
/**
* Analyze all files for a specific project.
*/
export async function analyzeProjectFiles(
projectId: string
): Promise<{ analyzed: number; failed: number; total: number }> {
const files = await prisma.projectFile.findMany({
where: { projectId },
select: {
id: true,
objectKey: true,
bucket: true,
mimeType: true,
fileName: true,
},
})
let analyzed = 0
let failed = 0
// Process in batches
for (let i = 0; i < files.length; i += BATCH_SIZE) {
const batch = files.slice(i, i + BATCH_SIZE)
const results = await Promise.allSettled(
batch.map(async (file) => {
if (!isAnalyzableMimeType(file.mimeType)) {
// Mark non-analyzable files as analyzed with no data
await prisma.projectFile.update({
where: { id: file.id },
data: { analyzedAt: new Date() },
})
return 'skipped'
}
const result = await analyzeFileContent(
file.objectKey,
file.bucket,
file.mimeType,
file.fileName,
file.id
)
await prisma.projectFile.update({
where: { id: file.id },
data: {
pageCount: result.pageCount,
textPreview: result.textPreview,
detectedLang: result.detectedLang,
langConfidence: result.langConfidence,
analyzedAt: new Date(),
},
})
return result.error ? 'failed' : 'analyzed'
})
)
for (const r of results) {
if (r.status === 'fulfilled') {
if (r.value === 'analyzed') analyzed++
else if (r.value === 'failed') failed++
} else {
failed++
}
}
}
return { analyzed, failed, total: files.length }
}
/**
* Retroactive batch analysis: analyze all files that haven't been analyzed yet.
* Returns counts. Processes in batches to avoid memory issues.
*/
export async function analyzeAllUnanalyzed(): Promise<{
analyzed: number
failed: number
skipped: number
total: number
}> {
const files = await prisma.projectFile.findMany({
where: { analyzedAt: null, project: { isTest: false } },
select: {
id: true,
objectKey: true,
bucket: true,
mimeType: true,
fileName: true,
},
orderBy: { createdAt: 'desc' },
})
let analyzed = 0
let failed = 0
let skipped = 0
for (let i = 0; i < files.length; i += BATCH_SIZE) {
const batch = files.slice(i, i + BATCH_SIZE)
const results = await Promise.allSettled(
batch.map(async (file) => {
if (!isAnalyzableMimeType(file.mimeType)) {
await prisma.projectFile.update({
where: { id: file.id },
data: { analyzedAt: new Date() },
})
return 'skipped'
}
const result = await analyzeFileContent(
file.objectKey,
file.bucket,
file.mimeType,
file.fileName,
file.id
)
await prisma.projectFile.update({
where: { id: file.id },
data: {
pageCount: result.pageCount,
textPreview: result.textPreview,
detectedLang: result.detectedLang,
langConfidence: result.langConfidence,
analyzedAt: new Date(),
},
})
return result.error ? 'failed' : 'analyzed'
})
)
for (const r of results) {
if (r.status === 'fulfilled') {
if (r.value === 'analyzed') analyzed++
else if (r.value === 'failed') failed++
else if (r.value === 'skipped') skipped++
} else {
failed++
}
}
console.log(
`[DocAnalyzer] Batch progress: ${i + batch.length}/${files.length} (${analyzed} analyzed, ${skipped} skipped, ${failed} failed)`
)
}
return { analyzed, failed, skipped, total: files.length }
}
/**
* Check if auto-analysis is enabled via SystemSettings.
*/
export async function isAutoAnalysisEnabled(): Promise<boolean> {
try {
const setting = await prisma.systemSettings.findUnique({
where: { key: 'file_analysis_auto_enabled' },
})
return setting?.value !== 'false'
} catch {
return true
}
}