Add Library RAG project and cleanup root directory
- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
260
generations/library_rag/utils/toc_extractor.py
Normal file
260
generations/library_rag/utils/toc_extractor.py
Normal file
@@ -0,0 +1,260 @@
|
||||
"""Table of Contents (TOC) extraction using Mistral OCR with annotations.
|
||||
|
||||
This module is the **primary entry point** for TOC extraction in the Library RAG
|
||||
pipeline. It provides intelligent routing between two extraction strategies:
|
||||
|
||||
1. **Visual (bbox) Analysis** (default, recommended): Uses bounding box coordinates
|
||||
to detect indentation and hierarchy based on horizontal positioning.
|
||||
2. **Semantic (annotation) Analysis**: Uses Mistral's document_annotation_format
|
||||
for structured metadata and TOC extraction.
|
||||
|
||||
The visual approach is more reliable for philosophical texts with complex
|
||||
hierarchies (parts, chapters, sections, subsections).
|
||||
|
||||
Extraction Strategies:
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ extract_toc_from_annotations(use_visual_bbox=True) │
|
||||
│ ↓ (default) │
|
||||
│ toc_extractor_visual.py → X-coordinate based hierarchy │
|
||||
│ │
|
||||
│ extract_toc_from_annotations(use_visual_bbox=False) │
|
||||
│ ↓ │
|
||||
│ DocumentMetadata Pydantic schema → Structured extraction │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
|
||||
Cost Considerations:
|
||||
- Annotated OCR: ~0.003€/page (3x standard OCR cost)
|
||||
- Only first N pages are processed (default: 8)
|
||||
- Total cost: max_toc_pages × 0.003€
|
||||
|
||||
Output Structure:
|
||||
{
|
||||
"success": bool,
|
||||
"metadata": {...}, # Document metadata
|
||||
"toc": [...], # Hierarchical TOC (nested children)
|
||||
"toc_flat": [...], # Flat list with levels
|
||||
"cost_ocr_annotated": float
|
||||
}
|
||||
|
||||
Example:
|
||||
>>> from pathlib import Path
|
||||
>>> from utils.toc_extractor import extract_toc_from_annotations
|
||||
>>>
|
||||
>>> # Extract TOC using visual analysis (recommended)
|
||||
>>> result = extract_toc_from_annotations(
|
||||
... pdf_path=Path("input/philosophy_book.pdf"),
|
||||
... max_toc_pages=8,
|
||||
... use_visual_bbox=True # default
|
||||
... )
|
||||
>>> if result["success"]:
|
||||
... for entry in result["toc"]:
|
||||
... print(f"{entry['title']} (p.{entry['page']})")
|
||||
|
||||
Functions:
|
||||
- extract_toc_from_annotations(): Main entry point with strategy routing
|
||||
- build_hierarchical_toc(): Converts flat TOC entries to nested structure
|
||||
- map_toc_to_content(): Associates TOC entries with document content
|
||||
|
||||
See Also:
|
||||
- utils.toc_extractor_visual: Visual/bbox-based extraction (default)
|
||||
- utils.toc_extractor_markdown: Markdown indentation-based extraction
|
||||
- utils.llm_toc: LLM-based TOC extraction (alternative approach)
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional, Union, cast
|
||||
from pathlib import Path
|
||||
|
||||
from .ocr_schemas import DocumentMetadata, TocEntry
|
||||
from .ocr_processor import run_ocr_with_annotations
|
||||
from .mistral_client import create_client
|
||||
|
||||
logger: logging.Logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Nominal dict type for hierarchical TOC nodes (a plain dict subclass, not a
# typing.TypedDict, so keys and value types are not enforced).
class TOCNode(Dict[str, Any]):
    """Dictionary describing one TOC node.

    Intended keys: title, page, level, type, children. Nothing in this class
    validates them; it only gives the structure a name for annotations.
    """
|
||||
|
||||
|
||||
def extract_toc_from_annotations(
    pdf_path: Path,
    api_key: Optional[str] = None,
    max_toc_pages: int = 8,
    use_visual_bbox: bool = True,  # visual (bbox) analysis is the default strategy
) -> Dict[str, Any]:
    """Extract a structured TOC through annotated OCR.

    Routes between two strategies. With ``use_visual_bbox=True`` the call is
    delegated to ``extract_toc_with_visual_analysis`` (module
    ``toc_extractor_visual``) and its result is returned unchanged. Otherwise
    the first ``max_toc_pages`` pages are sent to ``run_ocr_with_annotations``
    with ``DocumentMetadata`` as the annotation format, and the returned
    document annotation is validated and turned into a nested TOC.

    Cost: 3€/1000 pages for annotated pages (vs 1€/1000 for basic OCR).

    Args:
        pdf_path: Path of the PDF file.
        api_key: Mistral API key (optional; otherwise loaded from .env).
        max_toc_pages: Maximum number of pages to annotate (default 8, the
            API limit for document_annotation).
        use_visual_bbox: If True, use the visual bounding-box analysis
            (more reliable).

    Returns:
        Dict with:
            - success: bool
            - metadata: dict of enriched metadata
            - toc: hierarchical list [{title, page, level, children}]
            - toc_flat: flat list [{title, page, level, type, parent_title}]
            - cost_ocr_annotated: float (cost in €)
            - error: str (on failure)

        On the semantic path, failures are reported through
        ``{"success": False, "error": ...}`` rather than raised.
    """

    def _failure(reason: str) -> Dict[str, Any]:
        # Uniform shape for every error result of the semantic path.
        return {"success": False, "error": reason}

    # Visual (bbox) strategy: hand the whole job over to the visual extractor.
    if use_visual_bbox:
        logger.info("Utilisation de l'analyse visuelle (bbox) pour extraction TOC")
        # Imported at call time, after the log line, exactly where it is used.
        from .toc_extractor_visual import extract_toc_with_visual_analysis

        visual_result = extract_toc_with_visual_analysis(pdf_path, api_key, max_toc_pages)
        return cast(Dict[str, Any], visual_result)

    # Semantic strategy (document_annotation_format) from here on.
    try:
        client = create_client(api_key)
        pdf_bytes = pdf_path.read_bytes()
    except Exception as exc:
        logger.error(f"Erreur initialisation client/lecture PDF : {exc}")
        return _failure(f"Initialisation échouée : {exc!s}")

    # Phase 1: annotate the first pages to obtain TOC + metadata.
    logger.info(f"Extraction TOC avec annotations sur {max_toc_pages} premières pages")

    try:
        annotated_response = run_ocr_with_annotations(
            client=client,
            file_bytes=pdf_bytes,
            filename=pdf_path.name,
            include_images=False,  # images are not needed for the TOC
            document_annotation_format=DocumentMetadata,
            pages=list(range(max_toc_pages)),  # pages 0 .. max_toc_pages - 1
        )
    except Exception as exc:
        logger.error(f"Erreur appel OCR avec annotations : {exc}")
        return _failure(f"Appel OCR échoué : {exc!s}")

    # Pull the document-level annotation out of the response, if present.
    doc_annotation = getattr(annotated_response, "document_annotation", None)
    if not doc_annotation:
        return _failure("Aucune annotation retournée par l'API")

    # The annotation is JSON-decoded when it is a string, otherwise used as-is.
    try:
        metadata_dict = (
            json.loads(doc_annotation)
            if isinstance(doc_annotation, str)
            else doc_annotation
        )
    except Exception as exc:
        logger.error(f"Erreur parsing annotations : {exc}")
        return _failure(f"Parsing annotations échoué : {exc!s}")

    # Validate with Pydantic, then build both TOC representations.
    try:
        metadata = DocumentMetadata(**metadata_dict)
        toc_entries = metadata.toc.entries

        logger.info(f"TOC extraite : {len(toc_entries)} entrées")

        nested_toc = build_hierarchical_toc(toc_entries)

        result: Dict[str, Any] = {"success": True}
        result["metadata"] = metadata.model_dump()
        result["toc"] = nested_toc
        result["toc_flat"] = [entry.model_dump() for entry in toc_entries]
        result["cost_ocr_annotated"] = max_toc_pages * 0.003  # 3€/1000 pages
        return result
    except Exception as exc:
        logger.error(f"Erreur validation annotations : {exc}")
        return _failure(f"Validation Pydantic échouée : {exc!s}")
|
||||
|
||||
|
||||
def build_hierarchical_toc(entries: List[TocEntry]) -> List[Dict[str, Any]]:
    """Build a nested TOC from flat entries that carry a level.

    A stack of "open" ancestors tracks the current branch: each entry becomes
    a child of the most recent preceding entry whose level is strictly lower,
    or a root node when there is none.

    Args:
        entries: TocEntry items in document order; each is read for ``title``,
            ``page_number``, ``level`` and ``entry_type.value``
            (level 1 = root, 2 = child of a level-1 entry, and so on).

    Returns:
        Root nodes of the tree, each shaped as
        ``{title, page, level, type, children: [...]}``.
    """
    if not entries:
        return []

    roots: List[Dict[str, Any]] = []
    ancestors: List[Dict[str, Any]] = []  # chain of currently open parent nodes

    for item in entries:
        depth = item.level
        current: Dict[str, Any] = dict(
            title=item.title,
            page=item.page_number,
            level=depth,
            type=item.entry_type.value,
            children=[],
        )

        # Close every open node that is not strictly shallower than this one:
        # siblings and deeper nodes cannot be its parent.
        while ancestors and not (ancestors[-1]["level"] < depth):
            ancestors.pop()

        # Attach under the nearest remaining ancestor, or at the root.
        siblings: List[Dict[str, Any]] = ancestors[-1]["children"] if ancestors else roots
        siblings.append(current)

        # This node may be the parent of the entries that follow.
        ancestors.append(current)

    return roots
|
||||
|
||||
|
||||
def map_toc_to_content(
    toc_entries: List[TocEntry],
    all_pages_markdown: str,
) -> Dict[str, str]:
    """Associate TOC entries with the actual document content.

    The markdown is split on the ``<!-- Page `` marker and each entry receives
    the text of the pages from its own ``page_number`` up to (but excluding)
    the ``page_number`` of the next entry; the last entry runs to the end of
    the document.

    A section always covers at least its own start page. Without that rule an
    entry starting on the same page as the next one (for example a part
    heading followed by its first chapter), or followed by an entry with a
    lower page number, would get an empty string. Negative page numbers are
    clamped so they never index pages from the end of the document.

    Args:
        toc_entries: TOC entries, read for ``title`` and ``page_number``.
        all_pages_markdown: Full document markdown with ``<!-- Page N -->``
            markers.

    Returns:
        Mapping ``{section_title: content_text}``. Page texts are stripped and
        joined with a blank line. Entries sharing a title overwrite each
        other; the last one wins.

    Note:
        Pages are looked up by position in the split result, not by the number
        written in the marker. NOTE(review): this assumes marker N is the N-th
        marker in the text (numbering starts at 1, no gaps) — confirm against
        the code that produces the markdown.
    """
    # pages[0] is whatever precedes the first marker; pages[k] is the chunk
    # that follows the k-th marker and still starts with "N -->".
    pages: List[str] = all_pages_markdown.split("<!-- Page ")
    last_index = len(toc_entries) - 1

    content_map: Dict[str, str] = {}

    for i, entry in enumerate(toc_entries):
        start_page: int = entry.page_number

        # End page: start of the next entry, or the end of the document.
        end_page: int = toc_entries[i + 1].page_number if i < last_index else len(pages)

        # Clamp to valid, non-wrapping slice bounds and make sure the section
        # keeps at least its own start page.
        first = max(start_page, 0)
        last = max(end_page, start_page + 1, first)

        # Drop the remainder of the page marker ("N -->") and surrounding
        # whitespace from every selected page chunk.
        section_content: List[str] = [
            chunk.split("-->", 1)[-1].strip() for chunk in pages[first:last]
        ]

        content_map[entry.title] = "\n\n".join(section_content)

    return content_map
|
||||
Reference in New Issue
Block a user