001project_wildgrowth/backend/scripts/test-chunking-file.ts

166 lines
5.3 KiB
TypeScript
Raw Normal View History

2026-02-11 15:26:03 +08:00
#!/usr/bin/env npx ts-node
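/**
 * Manual test script: parses a document file and runs structure chunking on its text.
 *
 * Usage: npx ts-node scripts/test-chunking-file.ts <file path>
 * Supported formats: .docx, .pdf, .epub
 */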
import fs from 'fs';
import path from 'path';
import pdfParse from 'pdf-parse';
import mammoth from 'mammoth';
import EPub from 'epub';
import { structureChunkingService } from '../src/services/structureChunkingService';
/**
 * Extract the text content of a document so it can be handed to the chunker.
 *
 * The parser is selected from the file extension (case-insensitive):
 * - `.docx`: file is read into a buffer and passed to `mammoth.extractRawText`
 * - `.pdf`:  file is read into a buffer and passed to `pdf-parse`
 * - `.epub`: file is opened by path with `EPub`; see {@link parseEpub}
 *
 * The extension is validated before the file is touched, so an unsupported
 * format is reported as such even when the path cannot be read.
 *
 * @param filePath Path of the document to parse.
 * @returns The extracted text.
 * @throws Error when the extension is not supported, or when reading or
 *         parsing the file fails.
 */
async function parseFile(filePath: string): Promise<string> {
  const ext = path.extname(filePath).toLowerCase();

  switch (ext) {
    case '.docx': {
      const result = await mammoth.extractRawText({ buffer: fs.readFileSync(filePath) });
      return result.value;
    }
    case '.pdf': {
      const data = await pdfParse(fs.readFileSync(filePath));
      return data.text;
    }
    case '.epub':
      // EPub opens the file by path itself, so no buffer is read here.
      return parseEpub(filePath);
    default:
      throw new Error(`不支持的格式: ${ext}`);
  }
}

/**
 * Read every entry of an EPUB's `flow` and join the non-empty chapter texts
 * with a blank line between them.
 *
 * Chapters that fail to load are skipped (best effort); any other failure
 * while collecting chapters rejects the returned promise instead of leaving
 * it pending.
 *
 * @param filePath Path of the `.epub` file.
 * @returns The concatenated chapter texts as returned by `epub.getChapter`.
 */
function parseEpub(filePath: string): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    const epub = new EPub(filePath);

    // Wraps the callback-style getChapter; a failed chapter yields ''.
    const readChapter = (chapterId: string): Promise<string> =>
      new Promise<string>((done) => {
        epub.getChapter(chapterId, (err: unknown, chapterText?: string) =>
          done(err ? '' : chapterText || ''),
        );
      });

    // Chapters are read one at a time, in flow order.
    const collectChapters = async (): Promise<string> => {
      const chapters: string[] = [];
      for (const item of epub.flow || []) {
        const text = (await readChapter(item.id)).trim();
        if (text) chapters.push(text);
      }
      return chapters.join('\n\n');
    };

    epub.on('error', reject);
    epub.on('end', () => {
      // Forward failures to the outer promise so the caller never hangs.
      collectChapters().then(resolve, reject);
    });
    epub.parse();
  });
}
/**
 * Collect heuristic warnings about a chunking result.
 *
 * Checks performed:
 * - more than 50 chunks
 * - exactly one chunk while the document has more than 5000 characters
 * - chunks shorter than 100 characters
 * - chunks longer than 5x the average chunk length
 * - titles shorter than 3 characters
 * - duplicate titles
 *
 * @param chunks Chunks produced by the chunking service.
 * @param totalCharacters Total character count reported for the document.
 * @returns One warning message per detected problem; empty when nothing looks
 *          wrong or when there are no chunks.
 */
function detectChunkIssues(
  chunks: ReadonlyArray<{ title: string; content: string }>,
  totalCharacters: number,
): string[] {
  const issues: string[] = [];
  // Nothing to analyse; also keeps the average below from dividing by zero.
  if (chunks.length === 0) return issues;

  // Abnormal chunk counts.
  if (chunks.length > 50) {
    issues.push(`⚠️ 分块数量较多 (${chunks.length}),可能存在误匹配`);
  }
  if (chunks.length === 1 && totalCharacters > 5000) {
    issues.push(`⚠️ 只有1个分块但内容很长可能未正确识别结构`);
  }

  // Chunk size spread.
  const sizes = chunks.map((c) => c.content.length);
  const avgSize = sizes.reduce((a, b) => a + b, 0) / sizes.length;
  const tooSmall = sizes.filter((s) => s < 100).length;
  const tooLarge = sizes.filter((s) => s > avgSize * 5).length;
  if (tooSmall > 0) {
    issues.push(`⚠️ ${tooSmall} 个分块内容过短 (<100字符),可能是误匹配`);
  }
  if (tooLarge > 0) {
    issues.push(`⚠️ ${tooLarge} 个分块内容过长,分块可能不均匀`);
  }

  // Suspiciously short titles.
  const shortTitles = chunks.filter((c) => c.title.length < 3);
  if (shortTitles.length > 0) {
    issues.push(`⚠️ ${shortTitles.length} 个分块标题过短`);
  }

  // Duplicate titles.
  const titleSet = new Set(chunks.map((c) => c.title));
  if (titleSet.size < chunks.length) {
    issues.push(`⚠️ 存在重复标题,可能是目录或列表被误匹配`);
  }

  return issues;
}

/**
 * CLI entry point: parse the file named by the first command-line argument,
 * run `structureChunkingService.parseAsync` on its text and print a report
 * (summary, per-chunk preview, heuristic warnings, first 10 titles).
 *
 * Exits the process with code 1 when the argument is missing, the file does
 * not exist, or parsing/chunking throws.
 */
async function main(): Promise<void> {
  const filePath = process.argv[2];
  if (!filePath) {
    console.log('用法: npx ts-node scripts/test-chunking-file.ts <文件路径>');
    console.log('支持: .docx, .pdf, .epub');
    process.exit(1);
  }
  if (!fs.existsSync(filePath)) {
    console.error(`文件不存在: ${filePath}`);
    process.exit(1);
  }

  console.log('═'.repeat(60));
  console.log(`📄 文件: ${path.basename(filePath)}`);
  console.log('═'.repeat(60));

  try {
    // 1. Parse the document into text.
    console.log('\n⏳ 解析文档...');
    const text = await parseFile(filePath);
    console.log(`✅ 解析完成: ${text.length.toLocaleString()} 字符`);

    // 2. Chunk the text (LLM-enhanced variant).
    console.log('\n⏳ 执行分块 (LLM 增强版)...');
    const result = await structureChunkingService.parseAsync(text);

    // 3. Print the summary.
    console.log('\n' + '─'.repeat(60));
    console.log('📊 分块结果');
    console.log('─'.repeat(60));
    console.log(` 识别模式: ${result.pattern || '(无结构)'}`);
    console.log(` 分块数量: ${result.chunks.length}`);
    console.log(` 总字符数: ${result.totalCharacters.toLocaleString()}`);

    if (result.chunks.length > 0) {
      console.log('\n' + '─'.repeat(60));
      console.log('📋 分块列表');
      console.log('─'.repeat(60));
      result.chunks.forEach((chunk, i) => {
        // Collapse whitespace so the preview fits on one line; add the
        // ellipsis only when the content was actually cut off.
        const flattened = chunk.content.replace(/\s+/g, ' ');
        const preview = flattened.length > 60 ? `${flattened.substring(0, 60)}...` : flattened;
        console.log(`\n[${i + 1}] ${chunk.title}`);
        console.log(` 字符: ${chunk.content.length.toLocaleString()}`);
        console.log(` 预览: ${preview}`);
      });

      // 4. Heuristic problem detection.
      console.log('\n' + '─'.repeat(60));
      console.log('🔍 潜在问题检测');
      console.log('─'.repeat(60));
      const issues = detectChunkIssues(result.chunks, result.totalCharacters);
      if (issues.length === 0) {
        console.log(' ✅ 未发现明显问题');
      } else {
        issues.forEach((issue) => console.log(` ${issue}`));
      }

      // 5. Full titles of the first 10 chunks.
      console.log('\n' + '─'.repeat(60));
      console.log('📝 前10个分块标题完整');
      console.log('─'.repeat(60));
      result.chunks.slice(0, 10).forEach((chunk, i) => {
        console.log(` ${i + 1}. ${chunk.title}`);
      });
      if (result.chunks.length > 10) {
        console.log(` ... 还有 ${result.chunks.length - 10}`);
      }
    }

    console.log('\n' + '═'.repeat(60));
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances carry a message.
    const message = error instanceof Error ? error.message : String(error);
    console.error(`❌ 处理失败: ${message}`);
    process.exit(1);
  }
}

// Script entry point; main() handles its own failures and sets the exit code.
void main();