Feature 1: Anthropic API Integration - Add @anthropic-ai/sdk with adapter wrapping OpenAI-shaped interface - Support Claude models (opus, sonnet, haiku) with extended thinking - Auto-reset model on provider switch, JSON retry logic - Add Claude model pricing to ai-usage tracker - Update AI settings form with Anthropic provider option Feature 2: Remove Locale Settings UI - Strip Localization tab from admin settings - Remove i18n settings from router inferCategory and getFeatureFlags - Keep franc document language detection intact Feature 3: Test Environment with Role Impersonation - Add isTest field to User, Program, Project, Competition models - Test environment service: create/teardown with realistic dummy data - JWT-based impersonation for test users (@test.local emails) - Impersonation banner with quick-switch between test roles - Test environment panel in admin settings (SUPER_ADMIN only) - Email redirect: @test.local emails routed to admin with [TEST] prefix - Complete data isolation: 45+ isTest:false filters across platform - All global queries on User/Project/Program/Competition - AI services blocked from processing test data - Cron jobs skip test rounds/users - Analytics/exports exclude test data - Admin layout/pickers hide test programs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
404 lines
11 KiB
TypeScript
404 lines
11 KiB
TypeScript
/**
 * Document Analyzer Service
 *
 * Extracts metadata from uploaded files:
 * - Page count (PDFs)
 * - Text preview (first ~2000 chars)
 * - Language detection via franc
 *
 * Supports: PDF (via unpdf), Word .docx (via mammoth), plain text files.
 * Runs optionally on upload (controlled by SystemSettings) and
 * retroactively via admin endpoint.
 */
|
|
|
|
import { getStorageProvider } from '@/lib/storage'
|
|
import { prisma } from '@/lib/prisma'
|
|
|
|
const TEXT_PREVIEW_LIMIT = 2000
|
|
const BATCH_SIZE = 10
|
|
|
|
const ANALYZABLE_MIME_TYPES = [
|
|
'application/pdf',
|
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // .docx
|
|
'application/msword', // .doc (limited support)
|
|
'text/plain',
|
|
'text/csv',
|
|
'text/markdown',
|
|
'text/html',
|
|
'application/rtf',
|
|
]
|
|
|
|
function isAnalyzableMimeType(mimeType: string): boolean {
|
|
return ANALYZABLE_MIME_TYPES.some((t) => mimeType.startsWith(t))
|
|
}
|
|
|
|
/**
|
|
* Strip null bytes from extracted text — PostgreSQL rejects \x00 in UTF-8 text columns.
|
|
*/
|
|
function sanitizeText(text: string): string {
|
|
return text.replace(/\0/g, '')
|
|
}
|
|
|
|
// ─── Types ──────────────────────────────────────────────────────────────────
|
|
|
|
export type AnalysisResult = {
|
|
fileId: string
|
|
pageCount: number | null
|
|
textPreview: string | null
|
|
detectedLang: string | null
|
|
langConfidence: number | null
|
|
error?: string
|
|
}
|
|
|
|
// ─── Language Detection ──────────────────────────────────────────────────────
|
|
|
|
/**
|
|
* Detect language using franc. Returns ISO 639-3 code and confidence.
|
|
*/
|
|
async function detectLanguage(
|
|
text: string
|
|
): Promise<{ lang: string; confidence: number }> {
|
|
if (!text || text.trim().length < 20) {
|
|
return { lang: 'und', confidence: 0 }
|
|
}
|
|
|
|
const sample = text.slice(0, 5000)
|
|
|
|
const { francAll } = await import('franc')
|
|
const results = francAll(sample, { minLength: 10 })
|
|
|
|
if (!results || results.length === 0 || results[0][0] === 'und') {
|
|
return { lang: 'und', confidence: 0 }
|
|
}
|
|
|
|
const topLang = results[0][0]
|
|
const topScore = results[0][1]
|
|
const confidence = Math.max(0, Math.min(1, topScore))
|
|
|
|
return { lang: topLang, confidence: Math.round(confidence * 100) / 100 }
|
|
}
|
|
|
|
// ─── Parsers ─────────────────────────────────────────────────────────────────
|
|
|
|
async function parsePdf(
|
|
buffer: Buffer
|
|
): Promise<{ text: string; pageCount: number }> {
|
|
const { extractText, getDocumentProxy } = await import('unpdf')
|
|
const pdf = await getDocumentProxy(new Uint8Array(buffer))
|
|
const { totalPages, text } = await extractText(pdf, { mergePages: true })
|
|
return { text: text as string, pageCount: totalPages }
|
|
}
|
|
|
|
async function parseDocx(
|
|
buffer: Buffer
|
|
): Promise<{ text: string }> {
|
|
const mammoth = await import('mammoth')
|
|
const result = await mammoth.extractRawText({ buffer })
|
|
return { text: result.value }
|
|
}
|
|
|
|
// ─── Core Analysis ──────────────────────────────────────────────────────────
|
|
|
|
/**
|
|
* Analyze a single file: extract page count, text preview, and detect language.
|
|
* Downloads the file from storage, parses it, and returns results.
|
|
*/
|
|
export async function analyzeFileContent(
|
|
objectKey: string,
|
|
bucket: string,
|
|
mimeType: string,
|
|
fileName: string,
|
|
fileId: string
|
|
): Promise<AnalysisResult> {
|
|
const result: AnalysisResult = {
|
|
fileId,
|
|
pageCount: null,
|
|
textPreview: null,
|
|
detectedLang: null,
|
|
langConfidence: null,
|
|
}
|
|
|
|
if (!isAnalyzableMimeType(mimeType)) {
|
|
return { ...result, error: 'Unsupported mime type for analysis' }
|
|
}
|
|
|
|
try {
|
|
const storage = await getStorageProvider()
|
|
const buffer = await storage.getObject(objectKey)
|
|
|
|
let text = ''
|
|
let pageCount: number | null = null
|
|
|
|
if (mimeType === 'application/pdf') {
|
|
const parsed = await parsePdf(buffer)
|
|
text = sanitizeText(parsed.text)
|
|
pageCount = parsed.pageCount
|
|
} else if (
|
|
mimeType ===
|
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
|
|
mimeType === 'application/msword'
|
|
) {
|
|
const parsed = await parseDocx(buffer)
|
|
text = sanitizeText(parsed.text)
|
|
} else {
|
|
// Text-based files (plain text, CSV, markdown, HTML, RTF)
|
|
text = sanitizeText(buffer.toString('utf-8'))
|
|
}
|
|
|
|
result.pageCount = pageCount
|
|
|
|
// Text preview
|
|
if (text.trim()) {
|
|
result.textPreview =
|
|
text.length > TEXT_PREVIEW_LIMIT
|
|
? text.slice(0, TEXT_PREVIEW_LIMIT)
|
|
: text
|
|
}
|
|
|
|
// Language detection
|
|
if (text.trim().length >= 20) {
|
|
const langResult = await detectLanguage(text)
|
|
result.detectedLang = langResult.lang
|
|
result.langConfidence = langResult.confidence
|
|
}
|
|
|
|
return result
|
|
} catch (error) {
|
|
console.warn(
|
|
`[DocAnalyzer] Failed to analyze ${fileName}:`,
|
|
error instanceof Error ? error.message : error
|
|
)
|
|
return {
|
|
...result,
|
|
error: error instanceof Error ? error.message : 'Analysis failed',
|
|
}
|
|
}
|
|
}
|
|
|
|
// ─── DB-Integrated Operations ───────────────────────────────────────────────
|
|
|
|
/**
|
|
* Analyze a single file by ID and persist results to DB.
|
|
*/
|
|
export async function analyzeFile(fileId: string): Promise<AnalysisResult> {
|
|
const file = await prisma.projectFile.findUnique({
|
|
where: { id: fileId },
|
|
select: {
|
|
id: true,
|
|
objectKey: true,
|
|
bucket: true,
|
|
mimeType: true,
|
|
fileName: true,
|
|
},
|
|
})
|
|
|
|
if (!file) {
|
|
return {
|
|
fileId,
|
|
pageCount: null,
|
|
textPreview: null,
|
|
detectedLang: null,
|
|
langConfidence: null,
|
|
error: 'File not found',
|
|
}
|
|
}
|
|
|
|
const result = await analyzeFileContent(
|
|
file.objectKey,
|
|
file.bucket,
|
|
file.mimeType,
|
|
file.fileName,
|
|
file.id
|
|
)
|
|
|
|
// Persist results
|
|
await prisma.projectFile.update({
|
|
where: { id: fileId },
|
|
data: {
|
|
pageCount: result.pageCount,
|
|
textPreview: result.textPreview,
|
|
detectedLang: result.detectedLang,
|
|
langConfidence: result.langConfidence,
|
|
analyzedAt: new Date(),
|
|
},
|
|
})
|
|
|
|
return result
|
|
}
|
|
|
|
/**
|
|
* Analyze a single file by ID with a delay (for post-upload use).
|
|
* The delay accounts for presigned URL uploads where the file
|
|
* may not be in storage yet when the DB record is created.
|
|
*/
|
|
export async function analyzeFileDelayed(
|
|
fileId: string,
|
|
delayMs = 3000
|
|
): Promise<AnalysisResult> {
|
|
await new Promise((resolve) => setTimeout(resolve, delayMs))
|
|
return analyzeFile(fileId)
|
|
}
|
|
|
|
/**
|
|
* Analyze all files for a specific project.
|
|
*/
|
|
export async function analyzeProjectFiles(
|
|
projectId: string
|
|
): Promise<{ analyzed: number; failed: number; total: number }> {
|
|
const files = await prisma.projectFile.findMany({
|
|
where: { projectId },
|
|
select: {
|
|
id: true,
|
|
objectKey: true,
|
|
bucket: true,
|
|
mimeType: true,
|
|
fileName: true,
|
|
},
|
|
})
|
|
|
|
let analyzed = 0
|
|
let failed = 0
|
|
|
|
// Process in batches
|
|
for (let i = 0; i < files.length; i += BATCH_SIZE) {
|
|
const batch = files.slice(i, i + BATCH_SIZE)
|
|
const results = await Promise.allSettled(
|
|
batch.map(async (file) => {
|
|
if (!isAnalyzableMimeType(file.mimeType)) {
|
|
// Mark non-analyzable files as analyzed with no data
|
|
await prisma.projectFile.update({
|
|
where: { id: file.id },
|
|
data: { analyzedAt: new Date() },
|
|
})
|
|
return 'skipped'
|
|
}
|
|
|
|
const result = await analyzeFileContent(
|
|
file.objectKey,
|
|
file.bucket,
|
|
file.mimeType,
|
|
file.fileName,
|
|
file.id
|
|
)
|
|
|
|
await prisma.projectFile.update({
|
|
where: { id: file.id },
|
|
data: {
|
|
pageCount: result.pageCount,
|
|
textPreview: result.textPreview,
|
|
detectedLang: result.detectedLang,
|
|
langConfidence: result.langConfidence,
|
|
analyzedAt: new Date(),
|
|
},
|
|
})
|
|
|
|
return result.error ? 'failed' : 'analyzed'
|
|
})
|
|
)
|
|
|
|
for (const r of results) {
|
|
if (r.status === 'fulfilled') {
|
|
if (r.value === 'analyzed') analyzed++
|
|
else if (r.value === 'failed') failed++
|
|
} else {
|
|
failed++
|
|
}
|
|
}
|
|
}
|
|
|
|
return { analyzed, failed, total: files.length }
|
|
}
|
|
|
|
/**
|
|
* Retroactive batch analysis: analyze all files that haven't been analyzed yet.
|
|
* Returns counts. Processes in batches to avoid memory issues.
|
|
*/
|
|
export async function analyzeAllUnanalyzed(): Promise<{
|
|
analyzed: number
|
|
failed: number
|
|
skipped: number
|
|
total: number
|
|
}> {
|
|
const files = await prisma.projectFile.findMany({
|
|
where: { analyzedAt: null, project: { isTest: false } },
|
|
select: {
|
|
id: true,
|
|
objectKey: true,
|
|
bucket: true,
|
|
mimeType: true,
|
|
fileName: true,
|
|
},
|
|
orderBy: { createdAt: 'desc' },
|
|
})
|
|
|
|
let analyzed = 0
|
|
let failed = 0
|
|
let skipped = 0
|
|
|
|
for (let i = 0; i < files.length; i += BATCH_SIZE) {
|
|
const batch = files.slice(i, i + BATCH_SIZE)
|
|
const results = await Promise.allSettled(
|
|
batch.map(async (file) => {
|
|
if (!isAnalyzableMimeType(file.mimeType)) {
|
|
await prisma.projectFile.update({
|
|
where: { id: file.id },
|
|
data: { analyzedAt: new Date() },
|
|
})
|
|
return 'skipped'
|
|
}
|
|
|
|
const result = await analyzeFileContent(
|
|
file.objectKey,
|
|
file.bucket,
|
|
file.mimeType,
|
|
file.fileName,
|
|
file.id
|
|
)
|
|
|
|
await prisma.projectFile.update({
|
|
where: { id: file.id },
|
|
data: {
|
|
pageCount: result.pageCount,
|
|
textPreview: result.textPreview,
|
|
detectedLang: result.detectedLang,
|
|
langConfidence: result.langConfidence,
|
|
analyzedAt: new Date(),
|
|
},
|
|
})
|
|
|
|
return result.error ? 'failed' : 'analyzed'
|
|
})
|
|
)
|
|
|
|
for (const r of results) {
|
|
if (r.status === 'fulfilled') {
|
|
if (r.value === 'analyzed') analyzed++
|
|
else if (r.value === 'failed') failed++
|
|
else if (r.value === 'skipped') skipped++
|
|
} else {
|
|
failed++
|
|
}
|
|
}
|
|
|
|
console.log(
|
|
`[DocAnalyzer] Batch progress: ${i + batch.length}/${files.length} (${analyzed} analyzed, ${skipped} skipped, ${failed} failed)`
|
|
)
|
|
}
|
|
|
|
return { analyzed, failed, skipped, total: files.length }
|
|
}
|
|
|
|
/**
|
|
* Check if auto-analysis is enabled via SystemSettings.
|
|
*/
|
|
export async function isAutoAnalysisEnabled(): Promise<boolean> {
|
|
try {
|
|
const setting = await prisma.systemSettings.findUnique({
|
|
where: { key: 'file_analysis_auto_enabled' },
|
|
})
|
|
return setting?.value !== 'false'
|
|
} catch {
|
|
return true
|
|
}
|
|
}
|