001project_wildgrowth/backend/scripts/test-chunking-file.ts

#!/usr/bin/env npx ts-node
/**
 * 本地测试脚本：解析文档并分块
 * 用法: npx ts-node scripts/test-chunking-file.ts <文件路径>
 * 支持: .docx, .pdf, .epub
 */

import fs from 'fs';
import path from 'path';
import pdfParse from 'pdf-parse';
import mammoth from 'mammoth';
import EPub from 'epub';
import { structureChunkingService } from '../src/services/structureChunkingService';

/**
 * Extracts the text of every chapter in an EPUB file.
 *
 * Chapters are read one at a time in `flow` order. Reading is best effort:
 * a chapter that reports an error, yields nothing, or is blank after
 * trimming is skipped rather than failing the whole document.
 *
 * @param filePath Path to the `.epub` file; the EPub library opens it itself.
 * @returns The non-empty chapter texts, trimmed and joined by a blank line.
 *          Rejects when the library emits `error` while parsing the container.
 */
function parseEpub(filePath: string): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    const epub = new EPub(filePath);

    // Never rejects: any per-chapter failure is reported as an empty string.
    const readChapter = (chapterId: string): Promise<string> =>
      new Promise<string>((done) => {
        try {
          epub.getChapter(chapterId, (err: unknown, text?: string) => done(err ? '' : text ?? ''));
        } catch {
          // A synchronous throw from the library is treated like any other unreadable chapter.
          done('');
        }
      });

    const collectChapters = async (): Promise<string> => {
      const chapters: string[] = [];
      for (const item of epub.flow || []) {
        const text = (await readChapter(item.id)).trim();
        if (text) chapters.push(text);
      }
      return chapters.join('\n\n');
    };

    // Route both outcomes of the async collection into the outer promise so
    // it can never be left pending by a failure inside the 'end' handler.
    epub.on('end', () => {
      collectChapters().then(resolve, reject);
    });
    epub.on('error', reject);
    epub.parse();
  });
}

/**
 * Parses a document into plain text based on its file extension.
 *
 * The extension is checked before anything is read from disk, so an
 * unsupported format is reported as such even if the path is unreadable,
 * and the file is only loaded into memory for the formats that need a buffer.
 *
 * @param filePath Path to a `.docx`, `.pdf` or `.epub` file (extension is case-insensitive).
 * @returns The text extracted by the matching parser.
 * @throws Error when the extension is not one of the supported formats;
 *         file-system and parser errors propagate unchanged.
 */
async function parseFile(filePath: string): Promise<string> {
  const ext = path.extname(filePath).toLowerCase();

  switch (ext) {
    case '.docx': {
      const result = await mammoth.extractRawText({ buffer: fs.readFileSync(filePath) });
      return result.value;
    }
    case '.pdf': {
      const data = await pdfParse(fs.readFileSync(filePath));
      return data.text;
    }
    case '.epub':
      // EPub reads from the path directly; no buffer is needed here.
      return parseEpub(filePath);
    default:
      throw new Error(`不支持的格式: ${ext}`);
  }
}

/** Minimal shape of a chunk that the report reads: a title and its text. */
type ReportedChunk = { title: string; content: string };

/** Prints a section heading: a blank line, a rule, the title, and another rule. */
function printSection(title: string): void {
  console.log('\n' + '─'.repeat(60));
  console.log(title);
  console.log('─'.repeat(60));
}

/**
 * Runs heuristic sanity checks over a chunking result.
 *
 * @param chunks Chunks produced by the chunking service.
 * @param totalCharacters Total character count reported alongside the chunks.
 * @returns One warning message per triggered check, in a fixed order;
 *          an empty array when nothing looks suspicious or there are no chunks.
 */
function detectChunkIssues(chunks: readonly ReportedChunk[], totalCharacters: number): string[] {
  const issues: string[] = [];
  if (chunks.length === 0) return issues;

  // Suspicious chunk counts.
  if (chunks.length > 50) {
    issues.push(`⚠️ 分块数量较多 (${chunks.length})，可能存在误匹配`);
  }
  if (chunks.length === 1 && totalCharacters > 5000) {
    issues.push(`⚠️ 只有1个分块但内容很长，可能未正确识别结构`);
  }

  // Size outliers: under 100 characters, or more than five times the average.
  const sizes = chunks.map((c) => c.content.length);
  const avgSize = sizes.reduce((a, b) => a + b, 0) / sizes.length;
  const tooSmall = sizes.filter((s) => s < 100).length;
  const tooLarge = sizes.filter((s) => s > avgSize * 5).length;

  if (tooSmall > 0) {
    issues.push(`⚠️ ${tooSmall} 个分块内容过短 (<100字符)，可能是误匹配`);
  }
  if (tooLarge > 0) {
    issues.push(`⚠️ ${tooLarge} 个分块内容过长，分块可能不均匀`);
  }

  // Titles shorter than three characters.
  const shortTitles = chunks.filter((c) => c.title.length < 3);
  if (shortTitles.length > 0) {
    issues.push(`⚠️ ${shortTitles.length} 个分块标题过短`);
  }

  // Duplicate titles.
  const titleSet = new Set(chunks.map((c) => c.title));
  if (titleSet.size < chunks.length) {
    issues.push(`⚠️ 存在重复标题，可能是目录或列表被误匹配`);
  }

  return issues;
}

/**
 * Script entry point: parses the file named by the first CLI argument, runs
 * the chunking service on its text, and prints a report (summary, chunk list,
 * heuristic warnings, and the first ten titles).
 *
 * Exits with status 1 when no path is given, the file does not exist, or
 * parsing/chunking throws.
 */
async function main() {
  const filePath = process.argv[2];
  if (!filePath) {
    console.log('用法: npx ts-node scripts/test-chunking-file.ts <文件路径>');
    console.log('支持: .docx, .pdf, .epub');
    process.exit(1);
  }

  if (!fs.existsSync(filePath)) {
    console.error(`文件不存在: ${filePath}`);
    process.exit(1);
  }

  console.log('═'.repeat(60));
  console.log(`📄 文件: ${path.basename(filePath)}`);
  console.log('═'.repeat(60));

  try {
    // 1. Parse the document into plain text.
    console.log('\n⏳ 解析文档...');
    const text = await parseFile(filePath);
    console.log(`✅ 解析完成: ${text.length.toLocaleString()} 字符`);

    // 2. Chunk the text (LLM-enhanced variant, per the progress message below).
    console.log('\n⏳ 执行分块（LLM 增强版）...');
    const result = await structureChunkingService.parseAsync(text);

    // 3. Summary.
    printSection('📊 分块结果');
    console.log(`  识别模式: ${result.pattern || '(无结构)'}`);
    console.log(`  分块数量: ${result.chunks.length}`);
    console.log(`  总字符数: ${result.totalCharacters.toLocaleString()}`);

    if (result.chunks.length > 0) {
      const previewLength = 60;

      printSection('📋 分块列表');
      result.chunks.forEach((chunk, i) => {
        const flattened = chunk.content.replace(/\s+/g, ' ');
        // Only add the ellipsis when the preview was actually cut short.
        const preview =
          flattened.length > previewLength
            ? `${flattened.substring(0, previewLength)}...`
            : flattened;
        console.log(`\n[${i + 1}] ${chunk.title}`);
        console.log(`    字符: ${chunk.content.length.toLocaleString()}`);
        console.log(`    预览: ${preview}`);
      });

      // 4. Heuristic problem detection.
      printSection('🔍 潜在问题检测');
      const issues = detectChunkIssues(result.chunks, result.totalCharacters);
      if (issues.length === 0) {
        console.log('  ✅ 未发现明显问题');
      } else {
        issues.forEach((issue) => console.log(`  ${issue}`));
      }

      // 5. Full titles of the first ten chunks.
      const titleLimit = 10;
      printSection('📝 前10个分块标题（完整）');
      result.chunks.slice(0, titleLimit).forEach((chunk, i) => {
        console.log(`  ${i + 1}. ${chunk.title}`);
      });
      if (result.chunks.length > titleLimit) {
        console.log(`  ... 还有 ${result.chunks.length - titleLimit} 个`);
      }
    }

    console.log('\n' + '═'.repeat(60));
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances are guaranteed a message.
    const message = error instanceof Error ? error.message : String(error);
    console.error(`❌ 处理失败: ${message}`);
    process.exit(1);
  }
}

// Start the script. main() reports its own failures and sets the exit code,
// so the returned promise is intentionally not awaited.
void main();