Add Library RAG project and cleanup root directory

- Add complete Library RAG application (Flask + MCP server)
- PDF processing pipeline with OCR and LLM extraction
- Weaviate vector database integration (BGE-M3 embeddings)
- Flask web interface with search and document management
- MCP server for Claude Desktop integration
- Comprehensive test suite (134 tests)
- Clean up root directory
- Remove obsolete documentation files
- Remove backup and temporary files
- Update autonomous agent configuration
- Update prompts
- Enhance initializer bis prompt with better instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-30 11:57:12 +01:00
parent 48470236da
commit d2f7165120
84 changed files with 26517 additions and 2 deletions
--- a/generations/library_rag/utils/toc_extractor.py
+++ b/generations/library_rag/utils/toc_extractor.py
@@ -0,0 +1,260 @@
+"""Table of Contents (TOC) extraction using Mistral OCR with annotations.
+
+This module is the **primary entry point** for TOC extraction in the Library RAG
+pipeline. It provides intelligent routing between two extraction strategies:
+
+1. **Visual (bbox) Analysis** (default, recommended): Uses bounding box coordinates
+   to detect indentation and hierarchy based on horizontal positioning.
+2. **Semantic (annotation) Analysis**: Uses Mistral's document_annotation_format
+   for structured metadata and TOC extraction.
+
+The visual approach is more reliable for philosophical texts with complex
+hierarchies (parts, chapters, sections, subsections).
+
+Extraction Strategies:
+    ┌─────────────────────────────────────────────────────────────┐
+    │  extract_toc_from_annotations(use_visual_bbox=True)         │
+    │         ↓ (default)                                         │
+    │  toc_extractor_visual.py → X-coordinate based hierarchy     │
+    │                                                             │
+    │  extract_toc_from_annotations(use_visual_bbox=False)        │
+    │         ↓                                                   │
+    │  DocumentMetadata Pydantic schema → Structured extraction   │
+    └─────────────────────────────────────────────────────────────┘
+
+Cost Considerations:
+    - Annotated OCR: ~0.003€/page (3x standard OCR cost)
+    - Only first N pages are processed (default: 8)
+    - Total cost: max_toc_pages × 0.003€
+
+Output Structure:
+    {
+        "success": bool,
+        "metadata": {...},           # Document metadata
+        "toc": [...],               # Hierarchical TOC (nested children)
+        "toc_flat": [...],          # Flat list with levels
+        "cost_ocr_annotated": float
+    }
+
+Example:
+    >>> from pathlib import Path
+    >>> from utils.toc_extractor import extract_toc_from_annotations
+    >>>
+    >>> # Extract TOC using visual analysis (recommended)
+    >>> result = extract_toc_from_annotations(
+    ...     pdf_path=Path("input/philosophy_book.pdf"),
+    ...     max_toc_pages=8,
+    ...     use_visual_bbox=True  # default
+    ... )
+    >>> if result["success"]:
+    ...     for entry in result["toc"]:
+    ...         print(f"{entry['title']} (p.{entry['page']})")
+
+Functions:
+    - extract_toc_from_annotations(): Main entry point with strategy routing
+    - build_hierarchical_toc(): Converts flat TOC entries to nested structure
+    - map_toc_to_content(): Associates TOC entries with document content
+
+See Also:
+    - utils.toc_extractor_visual: Visual/bbox-based extraction (default)
+    - utils.toc_extractor_markdown: Markdown indentation-based extraction
+    - utils.llm_toc: LLM-based TOC extraction (alternative approach)
+"""
+
+import json
+import logging
+from typing import Any, Dict, List, Optional, Union, cast
+from pathlib import Path
+
+from .ocr_schemas import DocumentMetadata, TocEntry
+from .ocr_processor import run_ocr_with_annotations
+from .mistral_client import create_client
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+# Named dict subclass (not a typing.TypedDict) for hierarchical TOC nodes
+class TOCNode(Dict[str, Any]):
+    """Dict subclass naming the TOC node shape: title, page, level, type,
+    children. Keys are not enforced; it behaves exactly like ``dict``."""
+
+
+def extract_toc_from_annotations(
+    pdf_path: Path,
+    api_key: Optional[str] = None,
+    max_toc_pages: int = 8,
+    use_visual_bbox: bool = True,  # visual (bbox) analysis is the default strategy
+) -> Dict[str, Any]:
+    """Extract a structured TOC through OCR, routing between two strategies.
+
+    With ``use_visual_bbox`` set, the call is handed over unchanged to
+    ``extract_toc_with_visual_analysis``. Otherwise the leading pages go
+    through annotated OCR with ``DocumentMetadata`` as annotation format,
+    billed 3€ per 1000 annotated pages according to the original author.
+
+    Args:
+        pdf_path: Path of the PDF file.
+        api_key: Mistral API key (optional), forwarded to the chosen route.
+        max_toc_pages: Number of leading pages to annotate (default 8).
+        use_visual_bbox: When True, use the bounding-box based extractor.
+
+    Returns:
+        Annotation path: a dict with ``success``, ``metadata``, ``toc``
+        (nested nodes), ``toc_flat`` (one ``model_dump()`` per entry) and
+        ``cost_ocr_annotated`` in euros, or ``{"success": False, "error":
+        str}`` on failure. Visual path: whatever that extractor returns.
+    """
+    # Default route: the bbox-based extractor does the whole job.
+    if use_visual_bbox:
+        logger.info("Utilisation de l'analyse visuelle (bbox) pour extraction TOC")
+        # Function-level import kept where the original had it (reason not visible here).
+        from .toc_extractor_visual import extract_toc_with_visual_analysis
+        visual_result = extract_toc_with_visual_analysis(pdf_path, api_key, max_toc_pages)
+        return cast(Dict[str, Any], visual_result)
+
+    # Annotation route: a client and the raw PDF bytes are required first.
+    try:
+        mistral_client = create_client(api_key)
+        raw_pdf = pdf_path.read_bytes()
+    except Exception as err:
+        logger.error(f"Erreur initialisation client/lecture PDF : {err}")
+        return {"success": False, "error": f"Initialisation échouée : {str(err)}"}
+
+    # Phase 1: annotate the leading pages to extract the TOC and the metadata.
+    logger.info(f"Extraction TOC avec annotations sur {max_toc_pages} premières pages")
+
+    try:
+        annotated_response = run_ocr_with_annotations(
+            client=mistral_client,
+            file_bytes=raw_pdf,
+            filename=pdf_path.name,
+            include_images=False,  # images are not needed for a TOC
+            document_annotation_format=DocumentMetadata,
+            pages=list(range(max_toc_pages)),  # indices 0 .. max_toc_pages - 1
+        )
+    except Exception as err:
+        logger.error(f"Erreur appel OCR avec annotations : {err}")
+        return {"success": False, "error": f"Appel OCR échoué : {str(err)}"}
+
+    # getattr with a default tolerates a response without that attribute.
+    doc_annotation = getattr(annotated_response, "document_annotation", None)
+
+    if not doc_annotation:
+        return {"success": False, "error": "Aucune annotation retournée par l'API"}
+
+    # A string annotation is decoded as JSON; anything else is used as-is.
+    try:
+        payload = (
+            json.loads(doc_annotation) if isinstance(doc_annotation, str) else doc_annotation
+        )
+    except Exception as err:
+        logger.error(f"Erreur parsing annotations : {err}")
+        return {"success": False, "error": f"Parsing annotations échoué : {str(err)}"}
+
+    # Validate with Pydantic, then assemble the result in a fixed key order.
+    try:
+        document = DocumentMetadata(**payload)
+        flat_entries = document.toc.entries
+        logger.info(f"TOC extraite : {len(flat_entries)} entrées")
+
+        nested_toc = build_hierarchical_toc(flat_entries)
+
+        # NOTE(review): the cost is derived from max_toc_pages, not from the
+        # number of pages actually annotated — confirm this is intended.
+        result: Dict[str, Any] = {"success": True}
+        result["metadata"] = document.model_dump()
+        result["toc"] = nested_toc
+        result["toc_flat"] = [item.model_dump() for item in flat_entries]
+        result["cost_ocr_annotated"] = max_toc_pages * 0.003  # 3€ per 1000 pages
+        return result
+    except Exception as err:
+        logger.error(f"Erreur validation annotations : {err}")
+        return {"success": False, "error": f"Validation Pydantic échouée : {str(err)}"}
+
+
+def build_hierarchical_toc(entries: List[TocEntry]) -> List[Dict[str, Any]]:
+    """Nest a flat, ordered list of TOC entries according to their levels.
+
+    Each entry becomes a child of the closest preceding entry whose level is
+    strictly smaller; entries with no such predecessor are top-level nodes.
+
+    Args:
+        entries: Entries in document order, each exposing ``title``,
+            ``page_number``, ``level`` and ``entry_type.value``.
+
+    Returns:
+        Top-level nodes shaped ``{title, page, level, type, children}``, the
+        ``children`` list holding nodes of the same shape. Empty input gives [].
+    """
+    roots: List[Dict[str, Any]] = []
+    # None and empty inputs both short-circuit to an empty list.
+    if not entries:
+        return roots
+
+    # Chain of currently open ancestors, outermost first.
+    ancestors: List[Dict[str, Any]] = []
+
+    for item in entries:
+        node: Dict[str, Any] = dict(
+            title=item.title,
+            page=item.page_number,
+            level=item.level,
+            type=item.entry_type.value,
+            children=[],
+        )
+
+        # Close every open node that is not strictly shallower than this one.
+        while ancestors and item.level <= ancestors[-1]["level"]:
+            ancestors.pop()
+
+        # Attach under the innermost open ancestor, or at the top level.
+        siblings: List[Dict[str, Any]] = ancestors[-1]["children"] if ancestors else roots
+        siblings.append(node)
+
+        # This node may in turn adopt the entries that follow it.
+        ancestors.append(node)
+
+    return roots
+
+
+def map_toc_to_content(
+    toc_entries: List[TocEntry],
+    all_pages_markdown: str,
+) -> Dict[str, str]:
+    """Map each TOC entry title to the text of the pages its section covers.
+
+    The markdown is split on ``<!-- Page `` markers, so chunk ``k`` is the
+    text after the k-th marker (chunk 0 is whatever precedes the first one).
+    Chunks are addressed by position: ``page_number`` is used as an index.
+    An entry spans from its own page up to, but excluding, the next entry's
+    page; the last entry runs to the end. Every entry keeps at least its own
+    start page, so two entries starting on one page both get that page.
+
+    Args:
+        toc_entries: Entries in reading order, exposing ``title`` and
+            ``page_number``.
+        all_pages_markdown: Full document markdown with page markers.
+
+    Returns:
+        ``{title: text}``; entries sharing a title overwrite one another.
+    """
+    # Clean each chunk once: drop the marker remainder ("N -->") and whitespace.
+    page_texts: List[str] = [
+        chunk.split("-->", 1)[-1].strip()
+        for chunk in all_pages_markdown.split("<!-- Page ")
+    ]
+    page_count = len(page_texts)
+
+    content_map: Dict[str, str] = {}
+    for position, entry in enumerate(toc_entries):
+        # Clamp at 0: a negative index would silently wrap to the last pages.
+        first = max(entry.page_number, 0)
+        if position + 1 < len(toc_entries):
+            # Never stop before the start page, even when the next entry
+            # begins on the same page or carries a smaller page number.
+            last_exclusive = max(toc_entries[position + 1].page_number, first + 1)
+        else:
+            last_exclusive = page_count
+        content_map[entry.title] = "\n\n".join(page_texts[first:last_exclusive])
+
+    return content_map