"""PDF Processing Pipeline for Philosophical and Academic Texts. This module provides a comprehensive 10-step pipeline for processing PDF documents through OCR, intelligent LLM-based extraction, and vector database ingestion. It is the main orchestration layer for the Library RAG application. Pipeline V2 Architecture (10 Steps): 1. **OCR Processing**: Extract text and images via Mistral OCR API (~0.003 EUR/page) 2. **Markdown Building**: Convert OCR output to structured Markdown 3. **Image Extraction**: Save document images to disk (optional base64 embedding) 4. **Metadata Extraction**: LLM-based extraction of title, author, year, language 5. **TOC Extraction**: Hierarchical table of contents via LLM or OCR annotations 6. **Section Classification**: Classify sections (main_content, exposition, etc.) 7. **Semantic Chunking**: Split content into argumentative units via LLM 8. **Chunk Cleaning**: Remove OCR artifacts and validate chunk quality 9. **Validation & Enrichment**: LLM validation + concept extraction 10. 
**Weaviate Ingestion**: Vectorize and store chunks for semantic search

LLM Provider Options:
    - **Ollama** (local): Free, slower, good for development/testing
    - **Mistral API** (remote): Fast, paid, recommended for production

Key Parameters:
    - ``skip_ocr``: Reuse existing markdown (avoids OCR cost on re-processing)
    - ``use_llm``: Enable/disable LLM processing steps
    - ``use_semantic_chunking``: Use LLM for intelligent chunking (slower but precise)
    - ``use_ocr_annotations``: Use OCR annotations for TOC (3x cost, more reliable)
    - ``ingest_to_weaviate``: Insert chunks into Weaviate vector database

Example:
    Basic usage with default settings (Ollama local)::

        from pathlib import Path
        from utils.pdf_pipeline import process_pdf

        result = process_pdf(
            Path("input/descartes_meditations.pdf"),
            output_dir=Path("output"),
            use_llm=True,
            llm_provider="ollama",
        )
        if result["success"]:
            print(f"Processed: {result['document_name']}")
            print(f"Chunks: {result['chunks_count']}")
            print(f"Cost: {result.get('cost_total', 0):.4f} EUR")

    Production usage with Mistral API::

        result = process_pdf(
            Path("input/document.pdf"),
            llm_provider="mistral",
            use_semantic_chunking=True,  # Better quality chunks
            use_ocr_annotations=True,    # More reliable TOC
            ingest_to_weaviate=True,
        )

    Re-processing without OCR cost::

        result = process_pdf(
            Path("input/document.pdf"),
            skip_ocr=True,  # Reuse existing markdown
            use_llm=True,
        )

Cost Tracking:
    The pipeline tracks costs for both OCR and LLM operations:

    - OCR costs: ~0.003 EUR/page (standard), ~0.009 EUR/page (with annotations)
    - LLM costs: Variable when using Mistral API, free with Ollama

    Costs are returned in the result dict::

        result["cost_ocr"]    # OCR cost in euros
        result["cost_llm"]    # LLM cost in euros (Mistral API only)
        result["cost_total"]  # Total cost

Output Files:
    For each processed document, the pipeline creates::

        output/<document_name>/
            <document_name>.md             # Structured markdown
            <document_name>_ocr.json       # Raw OCR response
            <document_name>_chunks.json    # Processed chunks + metadata
            <document_name>_weaviate.json  # Weaviate ingestion results
images/ # Extracted images (if not embedded) See Also: - :mod:`utils.mistral_client`: OCR API client - :mod:`utils.llm_metadata`: Metadata extraction - :mod:`utils.llm_toc`: TOC extraction - :mod:`utils.llm_classifier`: Section classification - :mod:`utils.llm_chunker`: Semantic chunking - :mod:`utils.llm_cleaner`: Chunk cleaning - :mod:`utils.llm_validator`: Document validation - :mod:`utils.weaviate_ingest`: Vector database ingestion - :mod:`utils.types`: Type definitions for all data structures Note: This module supports both V1 (legacy) and V2 (current) pipelines. V1 is kept for backwards compatibility but V2 is recommended. Version: 2.0 """ from __future__ import annotations import json import logging from pathlib import Path from typing import Any, Callable, Dict, List, Literal, Optional, TYPE_CHECKING, Union, cast # Import Mistral types for type checking only (avoids runtime import issues) if TYPE_CHECKING: from mistralai import Mistral as MistralClient from .types import ( ChunkData, ChunksDataExport, ClassifiedSection, DocumentHierarchy, FlatChunk, FlatSection, HierarchyNode, LegacyMetadata, LLMCostStats, LLMProvider, Metadata, OptionalProgressCallback, ParsedDocument, PipelineResult, SemanticChunk, SimpleTOCEntry, TOCEntry, TOCExtractionResult, V1PipelineResult, V2PipelineFiles, V2PipelineResult, ValidationResult, WeaviateIngestResult, ) from .mistral_client import create_client, estimate_ocr_cost from .ocr_processor import run_ocr, serialize_ocr_response from .markdown_builder import build_markdown from .image_extractor import create_image_writer from .hierarchy_parser import build_hierarchy, flatten_hierarchy from .llm_structurer import structure_with_llm, LLMStructureError, LLMStructuredResult, reset_llm_cost, get_llm_cost # Nouveaux modules LLM v2 from .llm_metadata import extract_metadata from .llm_toc import extract_toc from .llm_classifier import classify_sections, filter_indexable_sections from .llm_cleaner import clean_chunk, is_chunk_valid from 
.llm_chunker import chunk_section_with_llm, simple_chunk_by_paragraphs from .llm_validator import validate_document, apply_corrections, enrich_chunks_with_concepts from .weaviate_ingest import ingest_document # Logger logger = logging.getLogger(__name__) def extract_document_metadata_legacy( hierarchy: DocumentHierarchy, chunks: List[Dict[str, Any]], doc_name: str, ) -> LegacyMetadata: """Extrait les métadonnées du document depuis la hiérarchie et les chunks (méthode legacy). Args: hierarchy: Structure hiérarchique du document (DocumentHierarchy). chunks: Liste des chunks aplatis. doc_name: Nom du document. Returns: Dictionnaire de métadonnées au format LegacyMetadata. """ import re metadata: LegacyMetadata = { "title": None, "author": None, "work": doc_name, "chunks_count": len(chunks), "toc": [], } sections: List[HierarchyNode] = hierarchy.get("sections", []) if sections: first_section: HierarchyNode = sections[0] metadata["title"] = first_section.get("title") def _extract_toc_entries( sections: List[HierarchyNode], level: int = 0, ) -> List[SimpleTOCEntry]: """Extract simple TOC entries from hierarchy nodes.""" toc_items: List[SimpleTOCEntry] = [] for section in sections: section_level: int = section.get("level", 1) if section_level <= 2: toc_items.append({ "title": section.get("title") or "Sans titre", "level": section_level, }) children: List[HierarchyNode] = section.get("children", []) if children: toc_items.extend(_extract_toc_entries(children, level + 1)) return toc_items metadata["toc"] = _extract_toc_entries(sections) preamble: str = hierarchy.get("preamble", "") author_patterns: List[str] = [ r"(?:par|by|auteur|author)[:\s]+([A-ZÀ-Ü][a-zà-ü]+(?:\s+[A-ZÀ-Ü][a-zà-ü]+)*)", r"([A-ZÀ-Ü][a-zà-ü]+(?:\s+[A-ZÀ-Ü][a-zà-ü]+)*)\s*[-–—]\s*(?:auteur|author)", ] for pattern in author_patterns: match = re.search(pattern, preamble, re.IGNORECASE) if match: metadata["author"] = match.group(1).strip() break return metadata def process_pdf_v2( pdf_path: Path, output_dir: 
Path = Path("output"), *, document_name: Optional[str] = None, api_key: Optional[str] = None, use_llm: bool = True, llm_provider: LLMProvider = "ollama", llm_model: Optional[str] = None, llm_base_url: Optional[str] = None, llm_temperature: float = 0.2, embed_images: bool = False, ingest_to_weaviate: bool = True, clean_chunks: bool = True, extract_concepts: bool = True, validate_output: bool = True, skip_ocr: bool = False, use_ocr_annotations: bool = False, max_toc_pages: int = 8, use_semantic_chunking: bool = False, progress_callback: OptionalProgressCallback = None, ) -> V2PipelineResult: """Process a PDF document through the intelligent V2 pipeline with LLM extraction. This is the main entry point for processing philosophical and academic PDF documents. The pipeline performs OCR, LLM-based metadata/TOC extraction, semantic chunking, and optional Weaviate ingestion for semantic search. The 10 processing steps are: 1. OCR via Mistral API (unless skip_ocr=True) 2. Markdown construction from OCR output 3. Image extraction to disk (or base64 embedding) 4. Metadata extraction via LLM (title, author, year, language) 5. TOC extraction via LLM or OCR annotations 6. Section classification via LLM (main_content, exposition, etc.) 7. Semantic chunking via LLM (argumentative units) 8. Chunk cleaning (remove OCR artifacts, validate quality) 9. Validation and concept enrichment via LLM 10. Weaviate ingestion (vectorization and storage) Args: pdf_path: Path to the PDF file to process. Must exist and be readable. output_dir: Base output directory. A subdirectory named after the document will be created. Defaults to "output". document_name: Override the document name (derived from filename if None). api_key: Mistral API key for OCR. If None, uses MISTRAL_API_KEY env var. use_llm: Enable LLM processing steps (metadata, TOC, classification, chunking, validation). Set to False for basic processing only. llm_provider: LLM provider to use. 
"ollama" for local (free, slower) or "mistral" for API (fast, paid). Defaults to "ollama". llm_model: Specific model name. If None, uses provider default (e.g., "qwen2.5:7b" for Ollama, "mistral-small" for Mistral). llm_base_url: Base URL for Ollama server. Defaults to "http://localhost:11434". llm_temperature: LLM sampling temperature. Lower values (0.1-0.3) produce more deterministic output. Defaults to 0.2. embed_images: If True, embed images as base64 in markdown. If False, save images to disk. Defaults to False. ingest_to_weaviate: Insert processed chunks into Weaviate vector database for semantic search. Requires Weaviate to be running. Defaults to True. clean_chunks: Apply chunk cleaning to remove OCR artifacts and validate minimum quality thresholds. Defaults to True. extract_concepts: Extract philosophical/academic concepts from chunks using LLM. Adds concept keywords for search. Defaults to True. validate_output: Run final LLM validation on the processed document. Checks metadata accuracy and suggests corrections. Defaults to True. skip_ocr: Skip OCR step and reuse existing markdown file from previous run. Useful for re-processing without incurring OCR costs. Defaults to False. use_ocr_annotations: Use OCR with annotations for more reliable TOC extraction. Costs approximately 3x more than standard OCR. Defaults to False. max_toc_pages: Maximum number of pages to analyze for TOC extraction. Mistral API limits to 8 pages maximum. Defaults to 8. use_semantic_chunking: Use LLM-based semantic chunking instead of basic paragraph splitting. Slower but produces higher quality argumentative units. Defaults to False. progress_callback: Optional callback function for progress updates. Signature: ``callback(step_id: str, status: str, detail: str | None)``. step_id values: ocr, markdown, metadata, toc, classify, chunking, cleaning, validation, weaviate. status values: active, completed, error, skipped. 
Returns: V2PipelineResult dictionary containing: - success (bool): Whether processing completed successfully. - document_name (str): Name of the processed document. - output_dir (str): Path to the output directory. - files (dict): Paths to generated files (markdown, chunks, ocr, weaviate). - pages (int): Number of pages in the PDF. - cost_ocr (float): OCR cost in euros. - cost_llm (float): LLM cost in euros (Mistral API only). - cost_total (float): Total processing cost. - llm_stats (dict | None): Detailed LLM usage statistics. - metadata (dict): Extracted document metadata. - toc (list): Hierarchical table of contents. - chunks_count (int): Number of chunks generated. - validation (dict | None): Validation results if enabled. - weaviate_ingest (dict | None): Weaviate ingestion results. - pipeline_version (str): Always "2.0" for this pipeline. - error (str): Error message if success is False. Raises: FileNotFoundError: If pdf_path does not exist and skip_ocr is False. ValueError: If skip_ocr is True but no cached markdown exists. Example: Basic processing with Ollama:: from pathlib import Path from utils.pdf_pipeline import process_pdf_v2 result = process_pdf_v2( Path("input/plato_republic.pdf"), use_llm=True, llm_provider="ollama", ) if result["success"]: print(f"Processed {result['chunks_count']} chunks") print(f"Title: {result['metadata'].get('title')}") Production processing with Mistral API:: result = process_pdf_v2( Path("input/kant_critique.pdf"), llm_provider="mistral", use_semantic_chunking=True, use_ocr_annotations=True, ) print(f"Total cost: {result['cost_total']:.4f} EUR") Note: Cost implications: - OCR: ~0.003 EUR/page (standard), ~0.009 EUR/page (with annotations) - LLM (Mistral API): Variable based on token usage - LLM (Ollama): Free (local processing) Use ``skip_ocr=True`` when re-processing to avoid OCR costs. The function will reuse the existing markdown file. 
""" pdf_path = Path(pdf_path).resolve() # Detect file type by extension file_ext: str = pdf_path.suffix.lower() is_markdown_file: bool = file_ext == ".md" doc_name: str = document_name or pdf_path.stem if doc_name.lower().endswith(".pdf"): doc_name = doc_name[:-4] elif doc_name.lower().endswith(".md"): doc_name = doc_name[:-3] doc_output_dir: Path = Path(output_dir).resolve() / doc_name doc_output_dir.mkdir(parents=True, exist_ok=True) # Chemins des fichiers de sortie md_path: Path = doc_output_dir / f"{doc_name}.md" chunks_path: Path = doc_output_dir / f"{doc_name}_chunks.json" ocr_path: Path = doc_output_dir / f"{doc_name}_ocr.json" images_dir: Path = doc_output_dir / "images" logger.info(f"[V2] Traitement de : {pdf_path}") logger.info(f"[V2] Sortie dans : {doc_output_dir}") # Helper pour émettre la progression def emit_progress(step: str, status: str, detail: Optional[str] = None) -> None: if progress_callback: try: progress_callback(step, status, detail) except Exception: pass try: # ═══════════════════════════════════════════════════════════════════ # ÉTAPE 1-4 : OCR et Markdown (sauf si skip_ocr) # ═══════════════════════════════════════════════════════════════════ nb_pages: int = 0 cost: float = 0.0 # Coût OCR (0 si skip_ocr) # Réinitialiser le compteur de coût LLM pour ce document if llm_provider == "mistral": reset_llm_cost() if is_markdown_file: # Mode Markdown natif : charger directement le fichier .md emit_progress("ocr", "active", "Chargement Markdown...") if not pdf_path.is_file(): emit_progress("ocr", "error", "Fichier Markdown introuvable") return { "success": False, "error": f"Fichier Markdown introuvable : {pdf_path}", } logger.info("[1-4/10] 📝 Chargement direct du fichier Markdown (pas d'OCR)") markdown_text: str = pdf_path.read_text(encoding="utf-8") # Copier le contenu vers le répertoire de sortie md_path.write_text(markdown_text, encoding="utf-8") # Estimer le nombre de "pages" à partir des en-têtes H1 h1_count = markdown_text.count("\n# ") + 
(1 if markdown_text.startswith("# ") else 0) nb_pages = max(h1_count, 1) # Au moins 1 "page" emit_progress("ocr", "skipped", f"Markdown (0.00€)") emit_progress("markdown", "completed", f"{nb_pages} sections détectées") logger.info(f"Fichier Markdown chargé directement ({nb_pages} sections H1 détectées)") elif skip_ocr: # Mode test : réutiliser le markdown existant emit_progress("ocr", "active", "Vérification du cache...") if not md_path.exists(): emit_progress("ocr", "error", "Fichier non trouvé") return { "success": False, "error": f"Mode skip_ocr activé mais fichier non trouvé : {md_path}", } logger.info("[1-4/10] ⚡ Skip OCR - Réutilisation du markdown existant") markdown_text = md_path.read_text(encoding="utf-8") # Essayer de récupérer le nombre de pages depuis l'OCR existant if ocr_path.exists(): try: ocr_data = json.loads(ocr_path.read_text(encoding="utf-8")) nb_pages = len(ocr_data.get("pages", [])) except Exception: nb_pages = markdown_text.count("