Feature 1: Anthropic API Integration - Add @anthropic-ai/sdk with adapter wrapping OpenAI-shaped interface - Support Claude models (opus, sonnet, haiku) with extended thinking - Auto-reset model on provider switch, JSON retry logic - Add Claude model pricing to ai-usage tracker - Update AI settings form with Anthropic provider option Feature 2: Remove Locale Settings UI - Strip Localization tab from admin settings - Remove i18n settings from router inferCategory and getFeatureFlags - Keep franc document language detection intact Feature 3: Test Environment with Role Impersonation - Add isTest field to User, Program, Project, Competition models - Test environment service: create/teardown with realistic dummy data - JWT-based impersonation for test users (@test.local emails) - Impersonation banner with quick-switch between test roles - Test environment panel in admin settings (SUPER_ADMIN only) - Email redirect: @test.local emails routed to admin with [TEST] prefix - Complete data isolation: 45+ isTest:false filters across platform - All global queries on User/Project/Program/Competition - AI services blocked from processing test data - Cron jobs skip test rounds/users - Analytics/exports exclude test data - Admin layout/pickers hide test programs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
404 lines
11 KiB
TypeScript
404 lines
11 KiB
TypeScript
/**
 * Document Analyzer Service
 *
 * Extracts metadata from uploaded files:
 * - Page count (PDFs)
 * - Text preview (first ~2000 chars)
 * - Language detection via franc
 *
 * Supports: PDF (via unpdf), Word .docx (via mammoth), plain text files.
 * Runs optionally on upload (controlled by SystemSettings) and
 * retroactively via admin endpoint.
 */
|
|
|
|
import { getStorageProvider } from '@/lib/storage'
|
|
import { prisma } from '@/lib/prisma'
|
|
|
|
const TEXT_PREVIEW_LIMIT = 2000
|
|
const BATCH_SIZE = 10
|
|
|
|
const ANALYZABLE_MIME_TYPES = [
|
|
'application/pdf',
|
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // .docx
|
|
'application/msword', // .doc (limited support)
|
|
'text/plain',
|
|
'text/csv',
|
|
'text/markdown',
|
|
'text/html',
|
|
'application/rtf',
|
|
]
|
|
|
|
function isAnalyzableMimeType(mimeType: string): boolean {
|
|
return ANALYZABLE_MIME_TYPES.some((t) => mimeType.startsWith(t))
|
|
}
|
|
|
|
/**
|
|
* Strip null bytes from extracted text — PostgreSQL rejects \x00 in UTF-8 text columns.
|
|
*/
|
|
function sanitizeText(text: string): string {
|
|
return text.replace(/\0/g, '')
|
|
}
|
|
|
|
// ─── Types ──────────────────────────────────────────────────────────────────
|
|
|
|
export type AnalysisResult = {
|
|
fileId: string
|
|
pageCount: number | null
|
|
textPreview: string | null
|
|
detectedLang: string | null
|
|
langConfidence: number | null
|
|
error?: string
|
|
}
|
|
|
|
// ─── Language Detection ──────────────────────────────────────────────────────
|
|
|
|
/**
|
|
* Detect language using franc. Returns ISO 639-3 code and confidence.
|
|
*/
|
|
async function detectLanguage(
|
|
text: string
|
|
): Promise<{ lang: string; confidence: number }> {
|
|
if (!text || text.trim().length < 20) {
|
|
return { lang: 'und', confidence: 0 }
|
|
}
|
|
|
|
const sample = text.slice(0, 5000)
|
|
|
|
const { francAll } = await import('franc')
|
|
const results = francAll(sample, { minLength: 10 })
|
|
|
|
if (!results || results.length === 0 || results[0][0] === 'und') {
|
|
return { lang: 'und', confidence: 0 }
|
|
}
|
|
|
|
const topLang = results[0][0]
|
|
const topScore = results[0][1]
|
|
const confidence = Math.max(0, Math.min(1, topScore))
|
|
|
|
return { lang: topLang, confidence: Math.round(confidence * 100) / 100 }
|
|
}
|
|
|
|
// ─── Parsers ─────────────────────────────────────────────────────────────────
|
|
|
|
async function parsePdf(
|
|
buffer: Buffer
|
|
): Promise<{ text: string; pageCount: number }> {
|
|
const { extractText, getDocumentProxy } = await import('unpdf')
|
|
const pdf = await getDocumentProxy(new Uint8Array(buffer))
|
|
const { totalPages, text } = await extractText(pdf, { mergePages: true })
|
|
return { text: text as string, pageCount: totalPages }
|
|
}
|
|
|
|
async function parseDocx(
|
|
buffer: Buffer
|
|
): Promise<{ text: string }> {
|
|
const mammoth = await import('mammoth')
|
|
const result = await mammoth.extractRawText({ buffer })
|
|
return { text: result.value }
|
|
}
|
|
|
|
// ─── Core Analysis ──────────────────────────────────────────────────────────
|
|
|
|
/**
|
|
* Analyze a single file: extract page count, text preview, and detect language.
|
|
* Downloads the file from storage, parses it, and returns results.
|
|
*/
|
|
export async function analyzeFileContent(
|
|
objectKey: string,
|
|
bucket: string,
|
|
mimeType: string,
|
|
fileName: string,
|
|
fileId: string
|
|
): Promise<AnalysisResult> {
|
|
const result: AnalysisResult = {
|
|
fileId,
|
|
pageCount: null,
|
|
textPreview: null,
|
|
detectedLang: null,
|
|
langConfidence: null,
|
|
}
|
|
|
|
if (!isAnalyzableMimeType(mimeType)) {
|
|
return { ...result, error: 'Unsupported mime type for analysis' }
|
|
}
|
|
|
|
try {
|
|
const storage = await getStorageProvider()
|
|
const buffer = await storage.getObject(objectKey)
|
|
|
|
let text = ''
|
|
let pageCount: number | null = null
|
|
|
|
if (mimeType === 'application/pdf') {
|
|
const parsed = await parsePdf(buffer)
|
|
text = sanitizeText(parsed.text)
|
|
pageCount = parsed.pageCount
|
|
} else if (
|
|
mimeType ===
|
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
|
|
mimeType === 'application/msword'
|
|
) {
|
|
const parsed = await parseDocx(buffer)
|
|
text = sanitizeText(parsed.text)
|
|
} else {
|
|
// Text-based files (plain text, CSV, markdown, HTML, RTF)
|
|
text = sanitizeText(buffer.toString('utf-8'))
|
|
}
|
|
|
|
result.pageCount = pageCount
|
|
|
|
// Text preview
|
|
if (text.trim()) {
|
|
result.textPreview =
|
|
text.length > TEXT_PREVIEW_LIMIT
|
|
? text.slice(0, TEXT_PREVIEW_LIMIT)
|
|
: text
|
|
}
|
|
|
|
// Language detection
|
|
if (text.trim().length >= 20) {
|
|
const langResult = await detectLanguage(text)
|
|
result.detectedLang = langResult.lang
|
|
result.langConfidence = langResult.confidence
|
|
}
|
|
|
|
return result
|
|
} catch (error) {
|
|
console.warn(
|
|
`[DocAnalyzer] Failed to analyze ${fileName}:`,
|
|
error instanceof Error ? error.message : error
|
|
)
|
|
return {
|
|
...result,
|
|
error: error instanceof Error ? error.message : 'Analysis failed',
|
|
}
|
|
}
|
|
}
|
|
|
|
// ─── DB-Integrated Operations ───────────────────────────────────────────────
|
|
|
|
/**
|
|
* Analyze a single file by ID and persist results to DB.
|
|
*/
|
|
export async function analyzeFile(fileId: string): Promise<AnalysisResult> {
|
|
const file = await prisma.projectFile.findUnique({
|
|
where: { id: fileId },
|
|
select: {
|
|
id: true,
|
|
objectKey: true,
|
|
bucket: true,
|
|
mimeType: true,
|
|
fileName: true,
|
|
},
|
|
})
|
|
|
|
if (!file) {
|
|
return {
|
|
fileId,
|
|
pageCount: null,
|
|
textPreview: null,
|
|
detectedLang: null,
|
|
langConfidence: null,
|
|
error: 'File not found',
|
|
}
|
|
}
|
|
|
|
const result = await analyzeFileContent(
|
|
file.objectKey,
|
|
file.bucket,
|
|
file.mimeType,
|
|
file.fileName,
|
|
file.id
|
|
)
|
|
|
|
// Persist results
|
|
await prisma.projectFile.update({
|
|
where: { id: fileId },
|
|
data: {
|
|
pageCount: result.pageCount,
|
|
textPreview: result.textPreview,
|
|
detectedLang: result.detectedLang,
|
|
langConfidence: result.langConfidence,
|
|
analyzedAt: new Date(),
|
|
},
|
|
})
|
|
|
|
return result
|
|
}
|
|
|
|
/**
|
|
* Analyze a single file by ID with a delay (for post-upload use).
|
|
* The delay accounts for presigned URL uploads where the file
|
|
* may not be in storage yet when the DB record is created.
|
|
*/
|
|
export async function analyzeFileDelayed(
|
|
fileId: string,
|
|
delayMs = 3000
|
|
): Promise<AnalysisResult> {
|
|
await new Promise((resolve) => setTimeout(resolve, delayMs))
|
|
return analyzeFile(fileId)
|
|
}
|
|
|
|
/**
|
|
* Analyze all files for a specific project.
|
|
*/
|
|
export async function analyzeProjectFiles(
|
|
projectId: string
|
|
): Promise<{ analyzed: number; failed: number; total: number }> {
|
|
const files = await prisma.projectFile.findMany({
|
|
where: { projectId },
|
|
select: {
|
|
id: true,
|
|
objectKey: true,
|
|
bucket: true,
|
|
mimeType: true,
|
|
fileName: true,
|
|
},
|
|
})
|
|
|
|
let analyzed = 0
|
|
let failed = 0
|
|
|
|
// Process in batches
|
|
for (let i = 0; i < files.length; i += BATCH_SIZE) {
|
|
const batch = files.slice(i, i + BATCH_SIZE)
|
|
const results = await Promise.allSettled(
|
|
batch.map(async (file) => {
|
|
if (!isAnalyzableMimeType(file.mimeType)) {
|
|
// Mark non-analyzable files as analyzed with no data
|
|
await prisma.projectFile.update({
|
|
where: { id: file.id },
|
|
data: { analyzedAt: new Date() },
|
|
})
|
|
return 'skipped'
|
|
}
|
|
|
|
const result = await analyzeFileContent(
|
|
file.objectKey,
|
|
file.bucket,
|
|
file.mimeType,
|
|
file.fileName,
|
|
file.id
|
|
)
|
|
|
|
await prisma.projectFile.update({
|
|
where: { id: file.id },
|
|
data: {
|
|
pageCount: result.pageCount,
|
|
textPreview: result.textPreview,
|
|
detectedLang: result.detectedLang,
|
|
langConfidence: result.langConfidence,
|
|
analyzedAt: new Date(),
|
|
},
|
|
})
|
|
|
|
return result.error ? 'failed' : 'analyzed'
|
|
})
|
|
)
|
|
|
|
for (const r of results) {
|
|
if (r.status === 'fulfilled') {
|
|
if (r.value === 'analyzed') analyzed++
|
|
else if (r.value === 'failed') failed++
|
|
} else {
|
|
failed++
|
|
}
|
|
}
|
|
}
|
|
|
|
return { analyzed, failed, total: files.length }
|
|
}
|
|
|
|
/**
|
|
* Retroactive batch analysis: analyze all files that haven't been analyzed yet.
|
|
* Returns counts. Processes in batches to avoid memory issues.
|
|
*/
|
|
export async function analyzeAllUnanalyzed(): Promise<{
|
|
analyzed: number
|
|
failed: number
|
|
skipped: number
|
|
total: number
|
|
}> {
|
|
const files = await prisma.projectFile.findMany({
|
|
where: { analyzedAt: null, project: { isTest: false } },
|
|
select: {
|
|
id: true,
|
|
objectKey: true,
|
|
bucket: true,
|
|
mimeType: true,
|
|
fileName: true,
|
|
},
|
|
orderBy: { createdAt: 'desc' },
|
|
})
|
|
|
|
let analyzed = 0
|
|
let failed = 0
|
|
let skipped = 0
|
|
|
|
for (let i = 0; i < files.length; i += BATCH_SIZE) {
|
|
const batch = files.slice(i, i + BATCH_SIZE)
|
|
const results = await Promise.allSettled(
|
|
batch.map(async (file) => {
|
|
if (!isAnalyzableMimeType(file.mimeType)) {
|
|
await prisma.projectFile.update({
|
|
where: { id: file.id },
|
|
data: { analyzedAt: new Date() },
|
|
})
|
|
return 'skipped'
|
|
}
|
|
|
|
const result = await analyzeFileContent(
|
|
file.objectKey,
|
|
file.bucket,
|
|
file.mimeType,
|
|
file.fileName,
|
|
file.id
|
|
)
|
|
|
|
await prisma.projectFile.update({
|
|
where: { id: file.id },
|
|
data: {
|
|
pageCount: result.pageCount,
|
|
textPreview: result.textPreview,
|
|
detectedLang: result.detectedLang,
|
|
langConfidence: result.langConfidence,
|
|
analyzedAt: new Date(),
|
|
},
|
|
})
|
|
|
|
return result.error ? 'failed' : 'analyzed'
|
|
})
|
|
)
|
|
|
|
for (const r of results) {
|
|
if (r.status === 'fulfilled') {
|
|
if (r.value === 'analyzed') analyzed++
|
|
else if (r.value === 'failed') failed++
|
|
else if (r.value === 'skipped') skipped++
|
|
} else {
|
|
failed++
|
|
}
|
|
}
|
|
|
|
console.log(
|
|
`[DocAnalyzer] Batch progress: ${i + batch.length}/${files.length} (${analyzed} analyzed, ${skipped} skipped, ${failed} failed)`
|
|
)
|
|
}
|
|
|
|
return { analyzed, failed, skipped, total: files.length }
|
|
}
|
|
|
|
/**
|
|
* Check if auto-analysis is enabled via SystemSettings.
|
|
*/
|
|
export async function isAutoAnalysisEnabled(): Promise<boolean> {
|
|
try {
|
|
const setting = await prisma.systemSettings.findUnique({
|
|
where: { key: 'file_analysis_auto_enabled' },
|
|
})
|
|
return setting?.value !== 'false'
|
|
} catch {
|
|
return true
|
|
}
|
|
}
|