Files
MOPC-Portal/src/server/services/file-content-extractor.ts
Matt d80043c4aa
Some checks failed
Build and Push Docker Image / build (push) Has been cancelled
Strip null bytes from extracted text to fix PostgreSQL UTF-8 errors
Some PDFs contain \x00 null bytes in their text which PostgreSQL rejects
with "invalid byte sequence for encoding UTF8: 0x00". Sanitize extracted
text in both document-analyzer and file-content-extractor services.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 11:34:05 +01:00

121 lines
3.5 KiB
TypeScript

/**
 * File Content Extractor
 *
 * Downloads files from storage and extracts text content for AI analysis.
 * Supports PDF, Word (.docx/.doc), and text-based files. Used when round
 * config has aiParseFiles=true.
 *
 * Limits:
 * - Max 50KB of extracted text per file (to stay within AI token limits)
 * - Only PDF, Word, and text-based files are parsed
 * - Extraction failures are non-fatal (file is skipped)
 */
import { getStorageProvider } from '@/lib/storage'
// Cap on extracted text per file so prompts stay within AI token limits.
const MAX_TEXT_PER_FILE = 50_000 // ~50KB of text per file

// Mime-type prefixes we know how to turn into plain text. Matching is by
// prefix (see isParseableMimeType), so parameterized types such as
// 'text/plain; charset=utf-8' still qualify.
const PARSEABLE_MIME_TYPES = [
  'application/pdf',
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  'application/msword',
  'text/plain',
  'text/csv',
  'text/markdown',
  'text/html',
  'application/rtf',
]

/** Result of attempting text extraction on one stored file. */
export type ExtractedFileContent = {
  fileId: string
  fileName: string
  /** Extracted (possibly truncated) text, or null when skipped/failed. */
  content: string | null
  error?: string
}

/**
 * Check if a file's mime type supports content extraction.
 * Prefix matching is deliberate so mime types carrying parameters
 * (e.g. 'text/csv; charset=utf-8') are still accepted.
 */
export function isParseableMimeType(mimeType: string): boolean {
  for (const prefix of PARSEABLE_MIME_TYPES) {
    if (mimeType.startsWith(prefix)) {
      return true
    }
  }
  return false
}
/**
 * Extract text content from a single file stored in MinIO/S3.
 *
 * Supported formats: PDF (via unpdf), Word .docx/.doc (via mammoth), and
 * text-based files (decoded as UTF-8). Extracted text is sanitized — null
 * bytes are stripped because PostgreSQL rejects \x00 with "invalid byte
 * sequence for encoding UTF8" — and truncated to MAX_TEXT_PER_FILE chars.
 *
 * Returns null content if the file type is unsupported or extraction fails;
 * failures are non-fatal and reported via the `error` field.
 *
 * @param objectKey - Storage key of the object to download
 * @param mimeType  - Mime type used to pick the extraction strategy
 * @param fileName  - Original file name (for logging / result metadata)
 * @param fileId    - Database id of the file (echoed in the result)
 */
export async function extractFileContent(
  objectKey: string,
  mimeType: string,
  fileName: string,
  fileId: string,
): Promise<ExtractedFileContent> {
  if (!isParseableMimeType(mimeType)) {
    return { fileId, fileName, content: null, error: 'Unsupported mime type' }
  }
  try {
    const storage = await getStorageProvider()
    const buffer = await storage.getObject(objectKey)
    let text: string
    if (mimeType === 'application/pdf') {
      // Lazy-load the PDF parser; it is only needed on this code path.
      const { extractText, getDocumentProxy } = await import('unpdf')
      const pdf = await getDocumentProxy(new Uint8Array(buffer))
      const result = await extractText(pdf, { mergePages: true })
      text = result.text as string
    } else if (
      mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
      mimeType === 'application/msword'
    ) {
      const mammoth = await import('mammoth')
      const result = await mammoth.extractRawText({ buffer })
      text = result.value
    } else {
      // Text-based files
      text = buffer.toString('utf-8')
    }
    // Sanitize once for all branches: PostgreSQL rejects \x00 in text columns.
    text = text.replace(/\0/g, '')
    // Truncate to limit
    if (text.length > MAX_TEXT_PER_FILE) {
      let cut = MAX_TEXT_PER_FILE
      // Don't slice through a surrogate pair: a lone high surrogate is also
      // invalid UTF-8 and would trigger the same PostgreSQL encoding error.
      const lastUnit = text.charCodeAt(cut - 1)
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        cut -= 1
      }
      text = text.slice(0, cut) + '\n[... content truncated ...]'
    }
    return { fileId, fileName, content: text }
  } catch (error) {
    console.warn(`[FileExtractor] Failed to extract content from ${fileName}:`, error)
    return {
      fileId,
      fileName,
      content: null,
      error: error instanceof Error ? error.message : 'Extraction failed',
    }
  }
}
/**
 * Extract content from multiple files in parallel.
 * Non-fatal: files that fail extraction are returned with null content.
 */
export async function extractMultipleFileContents(
  files: Array<{
    id: string
    fileName: string
    mimeType: string
    objectKey: string
  }>,
): Promise<ExtractedFileContent[]> {
  // Drop anything we can't parse up front so indexes line up with results.
  const candidates = files.filter((file) => isParseableMimeType(file.mimeType))
  if (candidates.length === 0) {
    return []
  }
  const settled = await Promise.allSettled(
    candidates.map((file) =>
      extractFileContent(file.objectKey, file.mimeType, file.fileName, file.id),
    ),
  )
  return settled.map((outcome, index) => {
    if (outcome.status === 'fulfilled') {
      return outcome.value
    }
    // extractFileContent catches internally, so this branch is defensive.
    const { id, fileName } = candidates[index]
    return { fileId: id, fileName, content: null, error: 'Promise rejected' }
  })
}