src/server/services/file-content-extractor.ts

/**
 * File Content Extractor
 *
 * Downloads files from storage and extracts text content for AI analysis.
 * Supports PDF and plain text files. Used when round config has aiParseFiles=true.
 *
 * Limits:
 * - Max 50KB of extracted text per file (to stay within AI token limits)
 * - Only PDF and text-based files are parsed
 * - Extraction failures are non-fatal (file is skipped)
 */

import { getStorageProvider } from '@/lib/storage'

/**
 * Upper bound on how much extracted text is kept per file.
 * Compared against the string length of the extracted text (roughly 50KB).
 */
const MAX_TEXT_PER_FILE = 50_000

/**
 * Mime types for which content extraction is attempted.
 * Order is not significant; entries are matched as prefixes of the
 * reported mime type.
 */
const PARSEABLE_MIME_TYPES: string[] = [
  // Run through the PDF parser
  'application/pdf',
  // Everything below is read verbatim as UTF-8 text
  'text/plain',
  'text/csv',
  'text/markdown',
  'text/html',
  'application/rtf',
]

/**
 * Outcome of attempting to extract text from one file.
 * Failures are reported through `content: null` plus `error` rather than
 * by throwing.
 */
export type ExtractedFileContent = {
  /** Identifier of the file, echoed back from the caller's input. */
  fileId: string
  /** Display name of the file, echoed back from the caller's input. */
  fileName: string
  /** Extracted (possibly truncated) text, or null when the file was unsupported or extraction failed. */
  content: string | null
  /** Reason extraction produced no content; set alongside `content: null`. */
  error?: string
}

/**
 * Tell whether text extraction is attempted for the given mime type.
 *
 * The check is a case-sensitive prefix match against the supported list,
 * so a value carrying parameters (e.g. `text/plain; charset=utf-8`) is
 * accepted as well.
 *
 * @param mimeType - Mime type as reported for the file.
 * @returns true when the value begins with one of the supported types.
 */
export function isParseableMimeType(mimeType: string): boolean {
  for (const supported of PARSEABLE_MIME_TYPES) {
    if (mimeType.startsWith(supported)) {
      return true
    }
  }
  return false
}

/**
 * Extract text content from a single file held by the storage provider.
 *
 * Extraction is best-effort: any failure while downloading or parsing is
 * logged with `console.warn` and reported through the returned object
 * (`content: null` plus an `error` message) instead of being thrown.
 *
 * @param objectKey - Key handed to the storage provider's `getObject`.
 * @param mimeType - Reported mime type; decides whether and how the file is parsed.
 * @param fileName - File name, used in the warning log and echoed in the result.
 * @param fileId - File identifier, echoed in the result.
 * @returns The extracted text (truncated to `MAX_TEXT_PER_FILE` characters plus
 *   a truncation marker when longer), or null content with an error message.
 */
export async function extractFileContent(
  objectKey: string,
  mimeType: string,
  fileName: string,
  fileId: string,
): Promise<ExtractedFileContent> {
  if (!isParseableMimeType(mimeType)) {
    return { fileId, fileName, content: null, error: 'Unsupported mime type' }
  }

  try {
    const storage = await getStorageProvider()
    const buffer = await storage.getObject(objectKey)

    let text: string

    // Select the parser with the same prefix rule the parseability check uses.
    // A strict equality test here would let a PDF reported with parameters
    // (e.g. "application/pdf; name=deck.pdf") fall into the text branch and be
    // decoded as raw UTF-8, yielding binary garbage.
    if (mimeType.startsWith('application/pdf')) {
      // Dynamic import to avoid loading pdf-parse when not needed
      const pdfParseModule = await import('pdf-parse')
      // Depending on module interop the parser is either the module itself or its default export
      const pdfParse = typeof pdfParseModule === 'function' ? pdfParseModule : (pdfParseModule as any).default ?? pdfParseModule
      const pdf = await pdfParse(buffer)
      text = pdf.text
    } else {
      // Text-based files are read verbatim
      text = buffer.toString('utf-8')
    }

    // Truncate to limit
    if (text.length > MAX_TEXT_PER_FILE) {
      let cut = MAX_TEXT_PER_FILE
      // Do not cut between the two halves of a surrogate pair: if the last
      // kept code unit is a high surrogate, drop it too.
      const lastUnit = text.charCodeAt(cut - 1)
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        cut -= 1
      }
      text = text.slice(0, cut) + '\n[... content truncated ...]'
    }

    return { fileId, fileName, content: text }
  } catch (error) {
    console.warn(`[FileExtractor] Failed to extract content from ${fileName}:`, error)
    return {
      fileId,
      fileName,
      content: null,
      error: error instanceof Error ? error.message : 'Extraction failed',
    }
  }
}

/**
 * Run content extraction for a batch of files concurrently.
 *
 * Files whose mime type is not parseable are dropped up front and do not
 * appear in the result. For the remaining files the result array follows the
 * input order; a file whose extraction promise rejects is reported with null
 * content and the error text 'Promise rejected'.
 *
 * @param files - Files to process (id, name, mime type and storage key).
 * @returns One entry per parseable file; an empty array when none qualify.
 */
export async function extractMultipleFileContents(
  files: Array<{
    id: string
    fileName: string
    mimeType: string
    objectKey: string
  }>,
): Promise<ExtractedFileContent[]> {
  const candidates = files.filter((file) => isParseableMimeType(file.mimeType))
  if (candidates.length === 0) {
    return []
  }

  // Start every extraction before awaiting so they run in parallel.
  const pending = candidates.map((file) =>
    extractFileContent(file.objectKey, file.mimeType, file.fileName, file.id),
  )
  const settled = await Promise.allSettled(pending)

  const extracted: ExtractedFileContent[] = []
  settled.forEach((outcome, index) => {
    if (outcome.status === 'fulfilled') {
      extracted.push(outcome.value)
      return
    }
    // Settled results keep input order, so the index maps back to the source file.
    extracted.push({
      fileId: candidates[index].id,
      fileName: candidates[index].fileName,
      content: null,
      error: 'Promise rejected',
    })
  })
  return extracted
}
AI category-aware evaluation: per-round config, file parsing, shortlist, advance flow - Per-juror cap mode (HARD/SOFT/NONE) in add-member dialog and members table - Jury invite flow: create user + add to group + send invitation from dialog - Per-round config: notifyOnAdvance, aiParseFiles, startupAdvanceCount, conceptAdvanceCount - Moved notify-on-advance from competition-level to per-round setting - AI filtering: round-tagged files with newest-first sorting, optional file content extraction - File content extractor service (pdf-parse for PDF, utf-8 for text files) - AI shortlist runs independently per category (STARTUP / BUSINESS_CONCEPT) - generateAIRecommendations tRPC endpoint with per-round config integration - AI recommendations UI: trigger button, confirmation dialog, per-category results display - Category-aware advance dialog: select/deselect projects by category with target caps - STAGE_ACTIVE bug fix in assignment router Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-16 10:09:52 +01:00			`/**`
			`* File Content Extractor`
			`*`
			`* Downloads files from storage and extracts text content for AI analysis.`
			`* Supports PDF and plain text files. Used when round config has aiParseFiles=true.`
			`*`
			`* Limits:`
			`* - Max 50KB of extracted text per file (to stay within AI token limits)`
			`* - Only PDF and text-based files are parsed`
			`* - Extraction failures are non-fatal (file is skipped)`
			`*/`

			`import { getStorageProvider } from '@/lib/storage'`

			`const MAX_TEXT_PER_FILE = 50_000 // ~50KB of text per file`
			`const PARSEABLE_MIME_TYPES = [`
			`'application/pdf',`
			`'text/plain',`
			`'text/csv',`
			`'text/markdown',`
			`'text/html',`
			`'application/rtf',`
			`]`

			`export type ExtractedFileContent = {`
			`fileId: string`
			`fileName: string`
			`content: string \| null`
			`error?: string`
			`}`

			`/**`
			`* Check if a file's mime type supports content extraction`
			`*/`
			`export function isParseableMimeType(mimeType: string): boolean {`
			`return PARSEABLE_MIME_TYPES.some((t) => mimeType.startsWith(t))`
			`}`

			`/**`
			`* Extract text content from a single file stored in MinIO/S3.`
			`* Returns null content if file type is unsupported or extraction fails.`
			`*/`
			`export async function extractFileContent(`
			`objectKey: string,`
			`mimeType: string,`
			`fileName: string,`
			`fileId: string,`
			`): Promise<ExtractedFileContent> {`
			`if (!isParseableMimeType(mimeType)) {`
			`return { fileId, fileName, content: null, error: 'Unsupported mime type' }`
			`}`

			`try {`
			`const storage = await getStorageProvider()`
			`const buffer = await storage.getObject(objectKey)`

			`let text: string`

			`if (mimeType === 'application/pdf') {`
			`// Dynamic import to avoid loading pdf-parse when not needed`
			`const pdfParseModule = await import('pdf-parse')`
			`const pdfParse = typeof pdfParseModule === 'function' ? pdfParseModule : (pdfParseModule as any).default ?? pdfParseModule`
			`const pdf = await pdfParse(buffer)`
			`text = pdf.text`
			`} else {`
			`// Text-based files`
			`text = buffer.toString('utf-8')`
			`}`

			`// Truncate to limit`
			`if (text.length > MAX_TEXT_PER_FILE) {`
			`text = text.slice(0, MAX_TEXT_PER_FILE) + '\n[... content truncated ...]'`
			`}`

			`return { fileId, fileName, content: text }`
			`} catch (error) {`
			console.warn(`[FileExtractor] Failed to extract content from ${fileName}:`, error)
			`return {`
			`fileId,`
			`fileName,`
			`content: null,`
			`error: error instanceof Error ? error.message : 'Extraction failed',`
			`}`
			`}`
			`}`

			`/**`
			`* Extract content from multiple files in parallel.`
			`* Non-fatal: files that fail extraction are returned with null content.`
			`*/`
			`export async function extractMultipleFileContents(`
			`files: Array<{`
			`id: string`
			`fileName: string`
			`mimeType: string`
			`objectKey: string`
			`}>,`
			`): Promise<ExtractedFileContent[]> {`
			`const parseableFiles = files.filter((f) => isParseableMimeType(f.mimeType))`

			`if (parseableFiles.length === 0) return []`

			`const results = await Promise.allSettled(`
			`parseableFiles.map((f) => extractFileContent(f.objectKey, f.mimeType, f.fileName, f.id)),`
			`)`

			`return results.map((r, i) =>`
			`r.status === 'fulfilled'`
			`? r.value`
			`: { fileId: parseableFiles[i].id, fileName: parseableFiles[i].fileName, content: null, error: 'Promise rejected' },`
			`)`
			`}`