#!/usr/bin/env npx ts-node
/**
 * Local test script: parse a document and split it into chunks.
 *
 * Usage: npx ts-node scripts/test-chunking-file.ts <file path>
 * Supported formats: .docx, .pdf, .epub
 */

import fs from 'fs';
import path from 'path';
import pdfParse from 'pdf-parse';
import mammoth from 'mammoth';
import EPub from 'epub';
import { structureChunkingService } from '../src/services/structureChunkingService';

/** Minimal shape of a chunk needed by the heuristic checks below. */
interface ChunkLike {
  title: string;
  content: string;
}

/**
 * Reads every chapter listed in an EPUB's flow and joins the non-empty ones
 * with a blank line.
 *
 * Chapters that cannot be read are skipped (best effort) so that a single bad
 * chapter does not abort the whole run.
 *
 * @param filePath Path to the .epub file.
 * @returns The concatenated chapter text, as delivered by `getChapter`.
 */
function parseEpub(filePath: string): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    const epub = new EPub(filePath);

    // Chapters are read one at a time to keep them in flow order.
    const collectChapters = async (): Promise<string> => {
      const chapters: string[] = [];
      for (const item of epub.flow || []) {
        try {
          const text = await new Promise<string>((res) => {
            // A callback error is treated as an empty chapter.
            epub.getChapter(item.id, (err: unknown, t: string) => res(err ? '' : t || ''));
          });
          const trimmed = text.trim();
          if (trimmed) chapters.push(trimmed);
        } catch (err: unknown) {
          // Only reachable if getChapter throws synchronously; keep going.
          const reason = err instanceof Error ? err.message : String(err);
          console.warn(`⚠️ 章节读取失败,已跳过: ${item.id} (${reason})`);
        }
      }
      return chapters.join('\n\n');
    };

    epub.on('end', () => {
      // Forward unexpected failures so the outer promise cannot hang forever.
      collectChapters().then(resolve, reject);
    });
    epub.on('error', reject);
    epub.parse();
  });
}

/**
 * Extracts text from a document, choosing the parser by the lowercased file
 * extension.
 *
 * @param filePath Path to a .docx, .pdf or .epub file.
 * @returns The extracted text.
 * @throws Error if the extension is not one of the supported formats.
 */
async function parseFile(filePath: string): Promise<string> {
  const ext = path.extname(filePath).toLowerCase();

  switch (ext) {
    case '.docx': {
      const result = await mammoth.extractRawText({ buffer: fs.readFileSync(filePath) });
      return result.value;
    }
    case '.pdf': {
      const data = await pdfParse(fs.readFileSync(filePath));
      return data.text;
    }
    case '.epub':
      // The EPUB parser opens the file itself from the path.
      return parseEpub(filePath);
    default:
      throw new Error(`不支持的格式: ${ext}`);
  }
}

/**
 * Runs heuristic sanity checks over a chunking result.
 *
 * @param chunks The produced chunks.
 * @param totalCharacters Total character count reported for the document.
 * @returns Human-readable warnings; empty when nothing looks suspicious.
 */
function detectIssues(chunks: readonly ChunkLike[], totalCharacters: number): string[] {
  const issues: string[] = [];
  if (chunks.length === 0) return issues;

  // Abnormal chunk counts.
  if (chunks.length > 50) {
    issues.push(`⚠️ 分块数量较多 (${chunks.length}),可能存在误匹配`);
  }
  if (chunks.length === 1 && totalCharacters > 5000) {
    issues.push(`⚠️ 只有1个分块但内容很长,可能未正确识别结构`);
  }

  // Chunk size spread: flag chunks under 100 characters or over 5x the average.
  const sizes = chunks.map((c) => c.content.length);
  const avgSize = sizes.reduce((a, b) => a + b, 0) / sizes.length;
  const tooSmall = sizes.filter((s) => s < 100).length;
  const tooLarge = sizes.filter((s) => s > avgSize * 5).length;
  if (tooSmall > 0) {
    issues.push(`⚠️ ${tooSmall} 个分块内容过短 (<100字符),可能是误匹配`);
  }
  if (tooLarge > 0) {
    issues.push(`⚠️ ${tooLarge} 个分块内容过长,分块可能不均匀`);
  }

  // Titles shorter than 3 characters.
  const shortTitleCount = chunks.filter((c) => c.title.length < 3).length;
  if (shortTitleCount > 0) {
    issues.push(`⚠️ ${shortTitleCount} 个分块标题过短`);
  }

  // Duplicate titles.
  const uniqueTitles = new Set(chunks.map((c) => c.title));
  if (uniqueTitles.size < chunks.length) {
    issues.push(`⚠️ 存在重复标题,可能是目录或列表被误匹配`);
  }

  return issues;
}

/**
 * Script entry point: parses the file given as the first CLI argument, chunks
 * it with `structureChunkingService.parseAsync`, and prints a report.
 *
 * Exits with status 1 when the argument is missing, the file does not exist,
 * or processing fails.
 */
async function main(): Promise<void> {
  const filePath = process.argv[2];

  if (!filePath) {
    console.log('用法: npx ts-node scripts/test-chunking-file.ts <文件路径>');
    console.log('支持: .docx, .pdf, .epub');
    process.exit(1);
  }

  if (!fs.existsSync(filePath)) {
    console.error(`文件不存在: ${filePath}`);
    process.exit(1);
  }

  console.log('═'.repeat(60));
  console.log(`📄 文件: ${path.basename(filePath)}`);
  console.log('═'.repeat(60));

  try {
    // 1. Parse the document.
    console.log('\n⏳ 解析文档...');
    const text = await parseFile(filePath);
    console.log(`✅ 解析完成: ${text.length.toLocaleString()} 字符`);

    // 2. Chunk it (LLM-enhanced variant).
    console.log('\n⏳ 执行分块(LLM 增强版)...');
    const result = await structureChunkingService.parseAsync(text);

    // 3. Print the summary.
    console.log('\n' + '─'.repeat(60));
    console.log('📊 分块结果');
    console.log('─'.repeat(60));
    console.log(` 识别模式: ${result.pattern || '(无结构)'}`);
    console.log(` 分块数量: ${result.chunks.length}`);
    console.log(` 总字符数: ${result.totalCharacters.toLocaleString()}`);

    if (result.chunks.length > 0) {
      console.log('\n' + '─'.repeat(60));
      console.log('📋 分块列表');
      console.log('─'.repeat(60));

      result.chunks.forEach((chunk, i) => {
        // Collapse whitespace so the preview stays on one line.
        const preview = chunk.content.replace(/\s+/g, ' ').substring(0, 60);
        console.log(`\n[${i + 1}] ${chunk.title}`);
        console.log(` 字符: ${chunk.content.length.toLocaleString()}`);
        console.log(` 预览: ${preview}...`);
      });

      // 4. Heuristic problem detection.
      console.log('\n' + '─'.repeat(60));
      console.log('🔍 潜在问题检测');
      console.log('─'.repeat(60));

      const issues = detectIssues(result.chunks, result.totalCharacters);
      if (issues.length === 0) {
        console.log(' ✅ 未发现明显问题');
      } else {
        issues.forEach((issue) => console.log(` ${issue}`));
      }

      // 5. Show the full titles of the first 10 chunks.
      console.log('\n' + '─'.repeat(60));
      console.log('📝 前10个分块标题(完整)');
      console.log('─'.repeat(60));
      result.chunks.slice(0, 10).forEach((chunk, i) => {
        console.log(` ${i + 1}. ${chunk.title}`);
      });
      if (result.chunks.length > 10) {
        console.log(` ... 还有 ${result.chunks.length - 10} 个`);
      }
    }

    console.log('\n' + '═'.repeat(60));
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances carry a message.
    const message = error instanceof Error ? error.message : String(error);
    console.error(`❌ 处理失败: ${message}`);
    process.exit(1);
  }
}

void main();