- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
75 lines
2.3 KiB
Python
75 lines
2.3 KiB
Python
"""Utils - PDF parsing pipeline with Mistral OCR and LLM structuring.

Version 2.0: intelligent pipeline with LLM-based metadata extraction,
TOC extraction, section classification, semantic chunking and validation.
"""

# Public helpers are imported here so that they can be re-exported from the
# package root; the import order is kept as-is in case any submodule relies
# on it.
from .mistral_client import create_client, get_api_key, estimate_ocr_cost
from .pdf_uploader import upload_pdf
from .ocr_processor import run_ocr, serialize_ocr_response
from .markdown_builder import build_markdown
from .image_extractor import extract_images, create_image_writer
from .hierarchy_parser import build_hierarchy
from .llm_structurer import structure_with_llm, LLMStructureError

# New LLM v2 modules
from .llm_metadata import extract_metadata
from .llm_toc import extract_toc
from .llm_classifier import classify_sections, filter_indexable_sections
from .llm_cleaner import clean_chunk, clean_page_markers, is_chunk_valid
from .llm_chunker import (
    chunk_section_with_llm,
    simple_chunk_by_paragraphs,
    extract_concepts_from_chunk,
    extract_paragraph_number,
)
from .llm_validator import (
    validate_document,
    apply_corrections,
    enrich_chunks_with_concepts,
)

# Pipeline
from .pdf_pipeline import process_pdf, process_pdf_v2, process_pdf_bytes
from .weaviate_ingest import ingest_document, delete_document_chunks

# Explicit public API of the package: every name listed here is what
# ``from <package> import *`` exposes. Entries are grouped by the submodule
# they are imported from.
__all__ = [
    # Mistral client
    "create_client",
    "get_api_key",
    "estimate_ocr_cost",
    # Upload
    "upload_pdf",
    # OCR
    "run_ocr",
    "serialize_ocr_response",
    # Markdown
    "build_markdown",
    # Images
    "extract_images",
    "create_image_writer",
    # Hierarchy
    "build_hierarchy",
    # LLM legacy
    "structure_with_llm",
    "LLMStructureError",
    # LLM v2 - Metadata
    "extract_metadata",
    # LLM v2 - TOC
    "extract_toc",
    # LLM v2 - Classification
    "classify_sections",
    "filter_indexable_sections",
    # LLM v2 - Cleaning
    "clean_chunk",
    "clean_page_markers",
    "is_chunk_valid",
    # LLM v2 - Chunking
    "chunk_section_with_llm",
    "simple_chunk_by_paragraphs",
    "extract_concepts_from_chunk",
    "extract_paragraph_number",
    # LLM v2 - Validation
    "validate_document",
    "apply_corrections",
    "enrich_chunks_with_concepts",
    # Pipeline
    "process_pdf",
    "process_pdf_v2",
    "process_pdf_bytes",
    # Weaviate
    "ingest_document",
    "delete_document_chunks",
]