166 lines
5.3 KiB
TypeScript
166 lines
5.3 KiB
TypeScript
|
|
#!/usr/bin/env npx ts-node
|
|||
|
|
/**
|
|||
|
|
* 本地测试脚本:解析文档并分块
|
|||
|
|
* 用法: npx ts-node scripts/test-chunking-file.ts <文件路径>
|
|||
|
|
* 支持: .docx, .pdf, .epub
|
|||
|
|
*/
|
|||
|
|
|
|||
|
|
import fs from 'fs';
|
|||
|
|
import path from 'path';
|
|||
|
|
import pdfParse from 'pdf-parse';
|
|||
|
|
import mammoth from 'mammoth';
|
|||
|
|
import EPub from 'epub';
|
|||
|
|
import { structureChunkingService } from '../src/services/structureChunkingService';
|
|||
|
|
|
|||
|
|
/**
 * Reads every chapter of an EPUB in reading order (`epub.flow`) and joins the
 * non-empty, trimmed chapter texts with blank lines.
 *
 * Chapter extraction is best effort: a chapter that cannot be read is skipped
 * with a warning instead of aborting the whole document. Failures of the EPUB
 * parser itself (the `error` event) reject the returned promise.
 *
 * @param filePath Path of the `.epub` file; `EPub` opens it by path.
 * @returns The concatenated chapter text.
 */
function parseEpub(filePath: string): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    const epub = new EPub(filePath);

    // Promise wrapper around the callback-style chapter getter.
    const readChapter = (chapterId: string): Promise<string> =>
      new Promise<string>((res, rej) => {
        epub.getChapter(chapterId, (err: unknown, text?: string) => {
          if (err) {
            rej(err);
          } else {
            res(text ?? '');
          }
        });
      });

    const collectChapters = async (): Promise<string> => {
      const chapters: string[] = [];
      for (const item of epub.flow || []) {
        try {
          const text = (await readChapter(item.id)).trim();
          if (text) chapters.push(text);
        } catch (err: unknown) {
          // Keep going: one unreadable chapter should not discard the rest.
          const reason = err instanceof Error ? err.message : String(err);
          console.warn(`⚠️ 跳过无法读取的章节 ${item.id}: ${reason}`);
        }
      }
      return chapters.join('\n\n');
    };

    // Forward any unexpected failure to `reject` so the outer promise can
    // never be left pending by an exception inside the event listener.
    epub.on('end', () => {
      collectChapters().then(resolve, reject);
    });
    epub.on('error', reject);
    epub.parse();
  });
}

/**
 * Extracts the plain text of a document so it can be passed to the chunker.
 *
 * The format is chosen by the lower-cased file extension:
 * - `.docx`: file contents are handed to `mammoth.extractRawText`
 * - `.pdf`: file contents are handed to `pdfParse`
 * - `.epub`: opened by path, chapters joined with blank lines
 *
 * The file is read only for the formats that need an in-memory buffer, so an
 * unsupported extension is rejected before any file I/O takes place.
 *
 * @param filePath Path of the document to parse.
 * @returns The extracted text.
 * @throws Error `不支持的格式: <ext>` when the extension is not one of the above.
 */
async function parseFile(filePath: string): Promise<string> {
  const ext = path.extname(filePath).toLowerCase();

  switch (ext) {
    case '.docx': {
      const buffer = await fs.promises.readFile(filePath);
      const result = await mammoth.extractRawText({ buffer });
      return result.value;
    }
    case '.pdf': {
      const buffer = await fs.promises.readFile(filePath);
      const data = await pdfParse(buffer);
      return data.text;
    }
    case '.epub':
      return parseEpub(filePath);
    default:
      throw new Error(`不支持的格式: ${ext}`);
  }
}
|
|||
|
|
|
|||
|
|
/** Minimal view of a chunk used by this script's reporting helpers. */
interface ChunkSummary {
  title: string;
  content: string;
}

/** Prints a section heading framed by two 60-column light rules, preceded by a blank line. */
function printSection(heading: string): void {
  const rule = '─'.repeat(60);
  console.log('\n' + rule);
  console.log(heading);
  console.log(rule);
}

/**
 * Builds a one-line preview: whitespace runs collapse to a single space and the
 * text is cut at `maxLength` characters. The `...` suffix is added only when
 * something was actually cut off, so short chunks are not shown as truncated.
 */
function previewOf(content: string, maxLength = 60): string {
  const flattened = content.replace(/\s+/g, ' ');
  return flattened.length > maxLength ? `${flattened.substring(0, maxLength)}...` : flattened;
}

/**
 * Runs heuristic sanity checks over a chunking result.
 *
 * Checks: too many chunks (> 50), a single chunk holding more than 5000
 * characters, chunks shorter than 100 characters, chunks more than 5x the
 * average size, titles shorter than 3 characters, and duplicated titles.
 *
 * @param chunks Chunks produced by the chunker.
 * @param totalCharacters Total character count reported for the document.
 * @returns One warning message per detected problem; empty when none apply.
 */
function detectChunkingIssues(chunks: readonly ChunkSummary[], totalCharacters: number): string[] {
  const issues: string[] = [];
  if (chunks.length === 0) return issues; // nothing to measure; also avoids 0/0 below

  // Suspicious chunk counts.
  if (chunks.length > 50) {
    issues.push(`⚠️ 分块数量较多 (${chunks.length}),可能存在误匹配`);
  }
  if (chunks.length === 1 && totalCharacters > 5000) {
    issues.push(`⚠️ 只有1个分块但内容很长,可能未正确识别结构`);
  }

  // Size distribution.
  const sizes = chunks.map((c) => c.content.length);
  const avgSize = sizes.reduce((sum, size) => sum + size, 0) / sizes.length;
  const tooSmall = sizes.filter((size) => size < 100).length;
  const tooLarge = sizes.filter((size) => size > avgSize * 5).length;
  if (tooSmall > 0) {
    issues.push(`⚠️ ${tooSmall} 个分块内容过短 (<100字符),可能是误匹配`);
  }
  if (tooLarge > 0) {
    issues.push(`⚠️ ${tooLarge} 个分块内容过长,分块可能不均匀`);
  }

  // Title anomalies.
  const shortTitleCount = chunks.filter((c) => c.title.length < 3).length;
  if (shortTitleCount > 0) {
    issues.push(`⚠️ ${shortTitleCount} 个分块标题过短`);
  }
  const distinctTitles = new Set(chunks.map((c) => c.title));
  if (distinctTitles.size < chunks.length) {
    issues.push(`⚠️ 存在重复标题,可能是目录或列表被误匹配`);
  }

  return issues;
}

/**
 * CLI entry point: parses the document named by the first command-line
 * argument, chunks it with `structureChunkingService.parseAsync`, and prints a
 * report (summary, per-chunk list, heuristic warnings, first chunk titles).
 *
 * Exits with status 1 when no path is given, when the file does not exist, or
 * when parsing/chunking fails.
 */
async function main(): Promise<void> {
  const filePath = process.argv[2];
  if (!filePath) {
    console.log('用法: npx ts-node scripts/test-chunking-file.ts <文件路径>');
    console.log('支持: .docx, .pdf, .epub');
    process.exit(1);
  }

  if (!fs.existsSync(filePath)) {
    console.error(`文件不存在: ${filePath}`);
    process.exit(1);
  }

  const heavyRule = '═'.repeat(60);
  console.log(heavyRule);
  console.log(`📄 文件: ${path.basename(filePath)}`);
  console.log(heavyRule);

  try {
    // 1. Parse the document into plain text.
    console.log('\n⏳ 解析文档...');
    const text = await parseFile(filePath);
    console.log(`✅ 解析完成: ${text.length.toLocaleString()} 字符`);

    // 2. Chunk the text (LLM-enhanced variant).
    console.log('\n⏳ 执行分块(LLM 增强版)...');
    const result = await structureChunkingService.parseAsync(text);

    // 3. Summary.
    printSection('📊 分块结果');
    console.log(` 识别模式: ${result.pattern || '(无结构)'}`);
    console.log(` 分块数量: ${result.chunks.length}`);
    console.log(` 总字符数: ${result.totalCharacters.toLocaleString()}`);

    if (result.chunks.length > 0) {
      printSection('📋 分块列表');
      result.chunks.forEach((chunk, i) => {
        console.log(`\n[${i + 1}] ${chunk.title}`);
        console.log(` 字符: ${chunk.content.length.toLocaleString()}`);
        console.log(` 预览: ${previewOf(chunk.content)}`);
      });

      // 4. Heuristic problem detection.
      printSection('🔍 潜在问题检测');
      const issues = detectChunkingIssues(result.chunks, result.totalCharacters);
      if (issues.length === 0) {
        console.log(' ✅ 未发现明显问题');
      } else {
        issues.forEach((issue) => console.log(` ${issue}`));
      }

      // 5. Full titles of the first chunks (up to `titleLimit`).
      const titleLimit = 10;
      printSection(`📝 前${titleLimit}个分块标题(完整)`);
      result.chunks.slice(0, titleLimit).forEach((chunk, i) => {
        console.log(` ${i + 1}. ${chunk.title}`);
      });
      if (result.chunks.length > titleLimit) {
        console.log(` ... 还有 ${result.chunks.length - titleLimit} 个`);
      }
    }

    console.log('\n' + heavyRule);
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances carry a `.message`.
    const message = error instanceof Error ? error.message : String(error);
    console.error(`❌ 处理失败: ${message}`);
    process.exit(1);
  }
}
|
|||
|
|
|
|||
|
|
// Run the CLI. The promise is handled explicitly so that a rejection escaping
// `main` is reported and turned into a non-zero exit status instead of being
// left as a floating promise.
main().catch((error: unknown) => {
  const message = error instanceof Error ? error.message : String(error);
  console.error(`❌ 处理失败: ${message}`);
  process.exit(1);
});
|