面向 PDF/Word/邮件的非结构化文档接入原子 Skill,适用于通用行业数据接入场景。
本 Skill 支持多种非结构化文档输入格式,核心数据来源包括 PDF 文档、Word 文档(如 .docx、.doc)以及邮件。
说明:本 Skill 不包含文档采集功能,需要用户提供非结构化文档文件。建议文档格式规范,以便进行准确的文档接入。
本 Skill 提供全面的非结构化文档接入能力,涵盖文本提取、元数据提取、内容索引等接入功能。以下为一次接入任务的配置与结果示例:
{
  "source_info": {
    "source_type": "file_system",
    "source_path": "/documents",
    "document_count": 100
  },
  "ingestion_config": {
    "supported_formats": ["pdf", "docx", "doc"],
    "extract_text": true,
    "extract_metadata": true,
    "index_content": true
  },
  "ingestion_results": {
    "total_documents": 100,
    "successful_documents": 95,
    "failed_documents": 5,
    "ingestion_time": "2024-03-15T10:00:00",
    "duration": "300s"
  },
  "document_samples": [
    {
      "document_id": "DOC001",
      "filename": "annual_report.pdf",
      "file_size": 5120000,
      "file_type": "pdf",
      "page_count": 200,
      "text_length": 50000,
      "metadata": {
        "title": "2024年度报告",
        "author": "示例公司",
        "created_date": "2024-03-01",
        "modified_date": "2024-03-10"
      },
      "extraction_status": "success",
      "indexed": true
    }
  ],
  "statistics": {
    "documents_processed": 100,
    "documents_indexed": 95,
    "total_text_extracted": 5000000,
    "average_processing_time": "3s",
    "success_rate": 0.95
  }
}