Add Library RAG project and cleanup root directory

- Add complete Library RAG application (Flask + MCP server)
  - PDF processing pipeline with OCR and LLM extraction
  - Weaviate vector database integration (BGE-M3 embeddings)
  - Flask web interface with search and document management
  - MCP server for Claude Desktop integration
  - Comprehensive test suite (134 tests)

- Clean up root directory
  - Remove obsolete documentation files
  - Remove backup and temporary files
  - Update autonomous agent configuration

- Update prompts
  - Enhance initializer bis prompt with better instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-30 11:57:12 +01:00
parent 48470236da
commit d2f7165120
84 changed files with 26517 additions and 2 deletions

View File

@@ -0,0 +1,74 @@
"""Utils - Pipeline de parsing PDF avec OCR Mistral et structuration LLM.
Version 2.0 : Pipeline intelligent avec extraction LLM des métadonnées,
TOC, classification des sections, chunking sémantique et validation.
"""
from .mistral_client import create_client, get_api_key, estimate_ocr_cost
from .pdf_uploader import upload_pdf
from .ocr_processor import run_ocr, serialize_ocr_response
from .markdown_builder import build_markdown
from .image_extractor import extract_images, create_image_writer
from .hierarchy_parser import build_hierarchy
from .llm_structurer import structure_with_llm, LLMStructureError
# Nouveaux modules LLM v2
from .llm_metadata import extract_metadata
from .llm_toc import extract_toc
from .llm_classifier import classify_sections, filter_indexable_sections
from .llm_cleaner import clean_chunk, clean_page_markers, is_chunk_valid
from .llm_chunker import chunk_section_with_llm, simple_chunk_by_paragraphs, extract_concepts_from_chunk, extract_paragraph_number
from .llm_validator import validate_document, apply_corrections, enrich_chunks_with_concepts
# Pipeline
from .pdf_pipeline import process_pdf, process_pdf_v2, process_pdf_bytes
from .weaviate_ingest import ingest_document, delete_document_chunks
__all__ = [
    # Mistral client
    "create_client",
    "get_api_key",
    "estimate_ocr_cost",
    # Upload
    "upload_pdf",
    # OCR
    "run_ocr",
    "serialize_ocr_response",
    # Markdown
    "build_markdown",
    # Images
    "extract_images",
    "create_image_writer",
    # Hierarchy
    "build_hierarchy",
    # Legacy LLM structuring
    "structure_with_llm",
    "LLMStructureError",
    # LLM v2 - Metadata
    "extract_metadata",
    # LLM v2 - Table of contents
    "extract_toc",
    # LLM v2 - Classification
    "classify_sections",
    "filter_indexable_sections",
    # LLM v2 - Cleaning
    "clean_chunk",
    "clean_page_markers",
    "is_chunk_valid",
    # LLM v2 - Chunking
    "chunk_section_with_llm",
    "simple_chunk_by_paragraphs",
    "extract_concepts_from_chunk",
    "extract_paragraph_number",
    # LLM v2 - Validation
    "validate_document",
    "apply_corrections",
    "enrich_chunks_with_concepts",
    # Pipeline
    "process_pdf",
    "process_pdf_v2",
    "process_pdf_bytes",
    # Weaviate
    "ingest_document",
    "delete_document_chunks",
]

View File

@@ -0,0 +1,267 @@
"""Hierarchical Markdown document parser for semantic chunking.
This module provides utilities for parsing Markdown documents into
hierarchical structures based on heading levels (# to ######). It is
a key component of the RAG pipeline, enabling:
1. **Structure Extraction**: Parse Markdown into a tree of sections
2. **Context Preservation**: Maintain hierarchical context (part > chapter > section)
3. **Semantic Chunking**: Flatten hierarchy into chunks with full path context
The parser uses a stack-based algorithm to build nested section trees,
preserving the document's logical structure for downstream processing.
Architecture:
Input: Raw Markdown text with headings
build_hierarchy() → DocumentHierarchy (tree structure)
flatten_hierarchy() → List[FlatChunk] (with hierarchical context)
TypedDict Definitions:
- HierarchyPath: Hierarchical path (part/chapter/section/subsection)
- HierarchyNode: Tree node with title, level, content, children
- DocumentHierarchy: Complete document structure
- FlatChunk: Flattened chunk with context for RAG ingestion
Algorithm:
The build_hierarchy() function uses a stack-based approach:
1. Initialize a virtual root node at level 0
2. For each line in the document:
- If heading: pop stack until parent level found, then push new node
- If content: append to current node's content
3. Finalize nodes by joining content lines
Example:
>>> markdown = '''
... # Introduction
... This is the intro.
...
... ## Background
... Some background text.
...
... ## Methodology
... Methods used here.
... '''
>>> hierarchy = build_hierarchy(markdown)
>>> print(hierarchy["sections"][0]["title"])
'Introduction'
>>> chunks = flatten_hierarchy(hierarchy)
>>> for chunk in chunks:
... print(f"{chunk['chunk_id']}: {chunk['title']}")
chunk_00001: Introduction
chunk_00002: Background
chunk_00003: Methodology
See Also:
- utils.llm_chunker: Semantic chunking using LLM
- utils.markdown_builder: Markdown generation from OCR
- utils.weaviate_ingest: Ingestion of chunks into Weaviate
"""
from __future__ import annotations
import re
from typing import List, Optional, Pattern, TypedDict
# Import type definitions from central types module
from utils.types import (
DocumentHierarchy,
FlatChunk,
HierarchyNode,
HierarchyPath,
)
class _BuildNode(TypedDict):
    """Internal mutable node used while building the hierarchy tree."""

    # Section title; None only for the virtual root node.
    title: Optional[str]
    # Heading level: 0 for the virtual root, 1-6 for Markdown headings.
    level: int
    # Content lines collected so far (joined into a string at finalize time).
    content: List[str]
    # Nested child sections, in document order.
    children: List[_BuildNode]
def build_hierarchy(markdown_text: str) -> DocumentHierarchy:
    """Build a hierarchical structure from Markdown headings.

    Scans heading lines (# through ######) and assembles a tree of
    sections, each carrying its own textual content.

    Args:
        markdown_text: Markdown text to analyse.

    Returns:
        Dictionary with:
            - preamble: text appearing before the first heading
            - sections: list of nested sections
        Each section holds:
            - title: section title
            - level: heading level (1-6)
            - content: textual content
            - children: nested subsections
    """
    heading_pattern: Pattern[str] = re.compile(r"^(#{1,6})\s+(.*)$")

    # Virtual root at level 0 anchors the stack; real headings are 1-6,
    # so the root is never popped.
    root_node: _BuildNode = {
        "title": None,
        "level": 0,
        "content": [],
        "children": [],
    }
    open_nodes: List[_BuildNode] = [root_node]

    for raw_line in markdown_text.splitlines():
        line_text: str = raw_line.rstrip()
        heading: Optional[re.Match[str]] = heading_pattern.match(line_text)
        if heading is None:
            # Plain content line: attach it to the innermost open section.
            open_nodes[-1]["content"].append(line_text)
            continue
        depth: int = len(heading.group(1))
        # Unwind to the closest ancestor strictly shallower than this heading.
        while open_nodes and open_nodes[-1]["level"] >= depth:
            open_nodes.pop()
        fresh: _BuildNode = {
            "title": heading.group(2).strip(),
            "level": depth,
            "content": [],
            "children": [],
        }
        open_nodes[-1]["children"].append(fresh)
        open_nodes.append(fresh)

    def _to_final(node: _BuildNode) -> HierarchyNode:
        """Convert a mutable build node into its finalized, joined form."""
        return HierarchyNode(
            title=node["title"],
            level=node["level"],
            content="\n".join(node["content"]).strip(),
            children=[_to_final(child) for child in node["children"]],
        )

    return DocumentHierarchy(
        preamble="\n".join(root_node["content"]).strip(),
        sections=[_to_final(child) for child in root_node["children"]],
    )
def flatten_hierarchy(hierarchy: DocumentHierarchy) -> List[FlatChunk]:
    """Flatten the hierarchy into an ordered list of chunks.

    Args:
        hierarchy: Hierarchical structure (output of build_hierarchy).

    Returns:
        List of chunks, each annotated with its hierarchical context.
    """
    flattened: List[FlatChunk] = []

    # Text before the first heading becomes chunk 0, with an empty path.
    if hierarchy.get("preamble"):
        flattened.append(
            {
                "chunk_id": "chunk_00000",
                "text": hierarchy["preamble"],
                "hierarchy": HierarchyPath(
                    part=None,
                    chapter=None,
                    section=None,
                    subsection=None,
                ),
                "type": "preamble",
                "level": 0,
                "title": None,
            }
        )

    def _walk(node: HierarchyNode, context: HierarchyPath, next_index: int) -> int:
        """Recursively emit chunks for one section subtree.

        Args:
            node: Section node to process.
            context: Hierarchical path inherited from the parent.
            next_index: Index to assign to the next chunk.

        Returns:
            The updated next chunk index.
        """
        depth: int = node["level"]
        heading: Optional[str] = node["title"]

        # Refresh the path for this depth; deeper slots are reset.
        path: HierarchyPath = context.copy()
        if depth == 1:
            path = HierarchyPath(
                part=heading,
                chapter=None,
                section=None,
                subsection=None,
            )
        elif depth == 2:
            path["chapter"] = heading
            path["section"] = None
            path["subsection"] = None
        elif depth == 3:
            path["section"] = heading
            path["subsection"] = None
        elif depth >= 4:
            # Headings deeper than level 4 all map onto "subsection".
            path["subsection"] = heading

        # Only sections that actually carry text produce a chunk.
        if node["content"]:
            flattened.append(
                {
                    "chunk_id": f"chunk_{next_index:05d}",
                    "text": node["content"],
                    "hierarchy": path.copy(),
                    "type": "main_content",
                    "level": depth,
                    "title": heading,
                }
            )
            next_index += 1

        for child in node["children"]:
            next_index = _walk(child, path, next_index)
        return next_index

    counter: int = 1
    blank_path: HierarchyPath = HierarchyPath(
        part=None,
        chapter=None,
        section=None,
        subsection=None,
    )
    for top_section in hierarchy.get("sections", []):
        counter = _walk(top_section, blank_path, counter)
    return flattened

View File

@@ -0,0 +1,192 @@
"""Image extraction and storage from OCR API responses.
This module provides utilities for extracting and saving images from
Mistral OCR API responses. It is a companion module to markdown_builder,
handling the image-specific aspects of document processing.
Features:
- **Image Writer Factory**: Creates reusable callbacks for image saving
- **Batch Extraction**: Processes all images from an OCR response
- **Protocol-based Design**: Flexible interface for custom implementations
Pipeline Position:
OCR Response → **Image Extractor** → Saved images + paths for Markdown
Components:
1. ImageWriterProtocol: Interface definition for image saving
2. create_image_writer(): Factory for standard file-based writers
3. extract_images(): Batch extraction from OCR responses
Integration:
The image writer is designed to integrate with markdown_builder:
>>> from utils.image_extractor import create_image_writer
>>> from utils.markdown_builder import build_markdown
>>>
>>> writer = create_image_writer(Path("output/doc/images"))
>>> markdown = build_markdown(ocr_response, image_writer=writer)
Standalone Usage:
>>> from pathlib import Path
>>> from utils.image_extractor import extract_images
>>>
>>> # Extract all images from OCR response
>>> paths = extract_images(ocr_response, Path("output/my_doc"))
>>> print(f"Extracted {len(paths)} images")
File Naming Convention:
Images are named: page{N}_img{M}.png
- N: Page number (1-based)
- M: Image index within page (1-based)
- Format: Always PNG (base64 from Mistral is PNG)
Note:
- All indices are 1-based for consistency with page numbering
- The images subdirectory is created automatically if needed
- Base64 data without proper encoding is silently skipped
- Large documents may produce many images; monitor disk space
See Also:
- utils.markdown_builder: Uses ImageWriter for markdown generation
- utils.mistral_client: Source of OCR responses with image data
"""
import base64
from pathlib import Path
from typing import Any, Callable, List, Optional, Protocol
class ImageWriterProtocol(Protocol):
    """Interface for callbacks that persist OCR-extracted images.

    An implementation receives a 1-based page number, a 1-based image
    index within that page, and base64-encoded image data. It saves the
    image somewhere and returns a relative path suitable for use in a
    Markdown image reference.

    Example:
        >>> def my_writer(page_idx: int, img_idx: int, image_b64: str) -> str:
        ...     # Custom saving logic
        ...     return f"images/page{page_idx}_img{img_idx}.png"
    """

    def __call__(self, page_idx: int, img_idx: int, image_b64: str) -> str:
        """Persist the image and return its relative Markdown path."""
        ...
# Type alias for image writer callables: (page_idx, img_idx, image_b64) -> relative path
ImageWriter = Callable[[int, int, str], str]
def create_image_writer(images_dir: Path) -> ImageWriter:
    """Create a function for saving images to disk.

    This factory creates a closure that saves base64-encoded images to
    the given directory and returns relative paths suitable for markdown
    image references.

    Args:
        images_dir: Directory path where images will be saved.
            The directory is created if it doesn't exist.

    Returns:
        A callable that accepts (page_idx, img_idx, image_b64) and
        returns the relative path to the saved image.

    Example:
        >>> from pathlib import Path
        >>> writer = create_image_writer(Path("output/images"))
        >>> path = writer(1, 0, "iVBORw0KGgoAAAANS...")
        >>> print(path)
        'images/page1_img0.png'
    """
    # Create directory if it doesn't exist
    images_dir.mkdir(parents=True, exist_ok=True)

    def writer(page_idx: int, img_idx: int, image_b64: str) -> str:
        """Save an image and return its relative path.

        Args:
            page_idx: Page number (1-based).
            img_idx: Image index within the page (1-based).
            image_b64: Base64-encoded image data.

        Returns:
            Relative path to the saved image file.
        """
        filename: str = f"page{page_idx}_img{img_idx}.png"
        filepath: Path = images_dir / filename
        # Decode and save
        image_data: bytes = base64.b64decode(image_b64)
        filepath.write_bytes(image_data)
        # Bug fix: return the actual relative path of the written file
        # (previously returned a hard-coded placeholder string, so every
        # markdown reference pointed at a nonexistent file).
        return f"images/{filename}"

    return writer
def extract_images(ocr_response: Any, output_dir: Path) -> List[str]:
    """Extract and save every embedded image from an OCR response.

    Walks all pages of the response, decodes each base64-encoded image,
    and writes it under ``output_dir / "images"``.

    Args:
        ocr_response: OCR response object from the Mistral API. Expected
            to expose a ``pages`` attribute; each page may expose an
            ``images`` list whose items carry an ``image_base64``
            attribute.
        output_dir: Base output directory. Images are saved to an
            ``images`` subdirectory beneath it (created automatically).

    Returns:
        Absolute file paths of the extracted images, in document order.

    Note:
        - Pages and images are 1-indexed in filenames.
        - Images without base64 data are silently skipped.
    """
    target_dir: Path = output_dir / "images"
    target_dir.mkdir(parents=True, exist_ok=True)

    saved_paths: List[str] = []
    for page_no, page in enumerate(ocr_response.pages, start=1):
        # Pages without an images attribute (or with an empty one) yield nothing.
        for image_no, image in enumerate(getattr(page, "images", None) or [], start=1):
            encoded: Optional[str] = getattr(image, "image_base64", None)
            if not encoded:
                continue
            destination: Path = target_dir / f"page{page_no}_img{image_no}.png"
            destination.write_bytes(base64.b64decode(encoded))
            saved_paths.append(str(destination))
    return saved_paths

View File

@@ -0,0 +1,319 @@
"""Multi-LLM Integration Module for Chat Conversation.
Provides a unified interface for calling different LLM providers with streaming support:
- Ollama (local, free)
- Mistral API
- Anthropic API (Claude)
- OpenAI API
Example:
>>> for token in call_llm("Hello world", "ollama", "qwen2.5:7b"):
... print(token, end="", flush=True)
"""
import os
import json
import time
import logging
from typing import Iterator, Optional
from dotenv import load_dotenv
load_dotenv()
logger = logging.getLogger(__name__)
class LLMError(Exception):
    """Base exception raised for LLM provider and transport failures."""
def call_llm(
    prompt: str,
    provider: str,
    model: str,
    stream: bool = True,
    temperature: float = 0.7,
    max_tokens: int = 16384,
) -> Iterator[str]:
    """Call an LLM provider with unified interface.

    Args:
        prompt: The prompt to send to the LLM.
        provider: Provider name ("ollama", "mistral", "anthropic", "openai").
        model: Model name (e.g., "qwen2.5:7b", "mistral-small-latest", "claude-sonnet-4-5").
        stream: Whether to stream tokens (default: True).
        temperature: Temperature for generation (0-1).
        max_tokens: Maximum tokens to generate (default 16384 for philosophical discussions).

    Yields:
        Tokens as strings (when streaming).

    Raises:
        LLMError: If provider is invalid or API call fails.

    Example:
        >>> for token in call_llm("Test", "ollama", "qwen2.5:7b"):
        ...     print(token, end="")
    """
    provider = provider.lower()
    logger.info(f"[LLM Call] Provider: {provider}, Model: {model}, Stream: {stream}")
    # Lazily-built dispatch table; note _call_ollama takes no max_tokens.
    backends = {
        "ollama": lambda: _call_ollama(prompt, model, temperature, stream),
        "mistral": lambda: _call_mistral(prompt, model, temperature, max_tokens, stream),
        "anthropic": lambda: _call_anthropic(prompt, model, temperature, max_tokens, stream),
        "openai": lambda: _call_openai(prompt, model, temperature, max_tokens, stream),
    }
    started = time.time()
    try:
        backend = backends.get(provider)
        if backend is None:
            # Raised inside the try so the failure gets timed and logged below.
            raise LLMError(f"Provider '{provider}' non supporté. Utilisez: ollama, mistral, anthropic, openai")
        yield from backend()
    except Exception as e:
        elapsed = time.time() - started
        logger.error(f"[LLM Call] Error after {elapsed:.2f}s: {e}")
        raise
    elapsed = time.time() - started
    logger.info(f"[LLM Call] Completed in {elapsed:.2f}s")
def _call_ollama(prompt: str, model: str, temperature: float, stream: bool) -> Iterator[str]:
    """Call the Ollama HTTP API, optionally streaming tokens.

    Args:
        prompt: The prompt text.
        model: Ollama model name.
        temperature: Temperature (0-1).
        stream: Whether to stream line-delimited JSON events.

    Yields:
        Tokens from the model.

    Raises:
        LLMError: On any network or HTTP failure.
    """
    import requests

    base_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
    endpoint = f"{base_url}/api/generate"
    body = {
        "model": model,
        "prompt": prompt,
        "stream": stream,
        "options": {
            "temperature": temperature,
        }
    }
    try:
        resp = requests.post(endpoint, json=body, stream=stream, timeout=120)
        resp.raise_for_status()
        if not stream:
            # Non-stream mode: a single JSON document holds the completion.
            yield resp.json().get("response", "")
            return
        # Stream mode: one JSON object per line, token text under "response".
        for raw_line in resp.iter_lines():
            if not raw_line:
                continue
            try:
                event = json.loads(raw_line)
            except json.JSONDecodeError:
                continue
            piece = event.get("response", "")
            if piece:
                yield piece
            # The final event carries done=True.
            if event.get("done", False):
                break
    except requests.exceptions.RequestException as e:
        raise LLMError(f"Ollama API error: {e}")
def _call_mistral(prompt: str, model: str, temperature: float, max_tokens: int, stream: bool) -> Iterator[str]:
    """Call the Mistral chat API, optionally streaming tokens.

    Args:
        prompt: The prompt text.
        model: Mistral model name.
        temperature: Temperature (0-1).
        max_tokens: Max tokens to generate.
        stream: Whether to stream.

    Yields:
        Tokens from the model.

    Raises:
        LLMError: If the API key or package is missing, or the call fails.
    """
    api_key = os.getenv("MISTRAL_API_KEY")
    if not api_key:
        raise LLMError("MISTRAL_API_KEY not set in environment")
    try:
        from mistralai import Mistral
    except ImportError:
        raise LLMError("mistralai package not installed. Run: pip install mistralai")

    client = Mistral(api_key=api_key)
    chat_messages = [{"role": "user", "content": prompt}]
    try:
        if not stream:
            # Non-streaming: one complete response object.
            response = client.chat.complete(
                model=model,
                messages=chat_messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            if response.choices:
                yield response.choices[0].message.content or ""
            return
        # Streaming: iterate delta events and surface non-empty content.
        for event in client.chat.stream(
            model=model,
            messages=chat_messages,
            temperature=temperature,
            max_tokens=max_tokens,
        ):
            if not event.data.choices:
                continue
            delta = event.data.choices[0].delta
            if hasattr(delta, 'content') and delta.content:
                yield delta.content
    except Exception as e:
        raise LLMError(f"Mistral API error: {e}")
def _call_anthropic(prompt: str, model: str, temperature: float, max_tokens: int, stream: bool) -> Iterator[str]:
    """Call Anthropic API (Claude) with streaming support.

    Args:
        prompt: The prompt text.
        model: Claude model name.
        temperature: Temperature (0-1).
        max_tokens: Max tokens to generate.
        stream: Whether to stream.

    Yields:
        Tokens from the model.

    Raises:
        LLMError: If the API key or package is missing, or the call fails.
    """
    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        raise LLMError("ANTHROPIC_API_KEY not set in environment")
    try:
        from anthropic import Anthropic
    except ImportError:
        raise LLMError("anthropic package not installed. Run: pip install anthropic")
    client = Anthropic(api_key=api_key)
    messages = [{"role": "user", "content": prompt}]
    try:
        if stream:
            # Streaming mode. The context variable is named `event_stream`
            # to avoid shadowing the `stream` boolean parameter (the
            # previous code bound `as stream:`, hiding the argument).
            with client.messages.stream(
                model=model,
                max_tokens=max_tokens,
                temperature=temperature,
                messages=messages,
            ) as event_stream:
                for text in event_stream.text_stream:
                    yield text
        else:
            # Non-streaming mode
            response = client.messages.create(
                model=model,
                max_tokens=max_tokens,
                temperature=temperature,
                messages=messages,
            )
            if response.content:
                yield response.content[0].text
    except Exception as e:
        raise LLMError(f"Anthropic API error: {e}")
def _call_openai(prompt: str, model: str, temperature: float, max_tokens: int, stream: bool) -> Iterator[str]:
    """Call OpenAI API with streaming support.

    Args:
        prompt: The prompt text.
        model: OpenAI model name.
        temperature: Temperature (0-1).
        max_tokens: Max tokens to generate.
        stream: Whether to stream.

    Yields:
        Tokens from the model.

    Raises:
        LLMError: If the API key or package is missing, or the call fails.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise LLMError("OPENAI_API_KEY not set in environment")
    try:
        from openai import OpenAI
    except ImportError:
        raise LLMError("openai package not installed. Run: pip install openai")
    client = OpenAI(api_key=api_key)

    # Build the request once instead of duplicating four near-identical
    # create() calls. o1 and gpt-5.x models take max_completion_tokens
    # (and no temperature); older chat models take temperature + max_tokens.
    params = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": stream,
    }
    if model.startswith(("o1", "gpt-5")):
        params["max_completion_tokens"] = max_tokens
    else:
        params["temperature"] = temperature
        params["max_tokens"] = max_tokens

    try:
        if stream:
            for chunk in client.chat.completions.create(**params):
                if chunk.choices:
                    delta = chunk.choices[0].delta
                    if hasattr(delta, 'content') and delta.content:
                        yield delta.content
        else:
            response = client.chat.completions.create(**params)
            if response.choices:
                yield response.choices[0].message.content or ""
    except Exception as e:
        raise LLMError(f"OpenAI API error: {e}")

View File

@@ -0,0 +1,495 @@
"""Semantic chunking of documents via LLM.
This module provides intelligent semantic chunking capabilities for academic and
philosophical texts, using Large Language Models (LLM) to identify coherent units
of meaning (argumentative units, definitions, examples, citations, etc.).
Overview:
The module offers two chunking strategies:
1. **LLM-based semantic chunking** (chunk_section_with_llm):
Uses an LLM to identify semantic boundaries and create chunks that preserve
argumentative coherence. Each chunk is annotated with summary, concepts, type.
2. **Simple paragraph-based chunking** (simple_chunk_by_paragraphs):
A fast fallback that splits text by paragraph boundaries.
Semantic Unit Types:
- argument: A logical argument or reasoning sequence
- definition: A definition or conceptual clarification
- example: An illustrative example or case study
- citation: A quoted passage from another source
- exposition: Expository content presenting ideas
- transition: Transitional text between sections
Chunk Size Guidelines:
- Target size: 300-500 words per chunk (configurable)
- Chunks are never split mid-sentence or mid-paragraph
- Short sections (< 80% of target) are kept as single chunks
LLM Provider Support:
- ollama: Local LLM (free, slower, default)
- mistral: Mistral API (faster, requires API key)
See Also:
utils.llm_cleaner: Chunk cleaning and validation
utils.llm_classifier: Section type classification
utils.pdf_pipeline: Main pipeline orchestration
"""
from __future__ import annotations
import json
import logging
import re
from typing import Any, Dict, List, Literal, Optional, TypedDict
from .llm_structurer import (
_clean_json_string,
_get_default_mistral_model,
_get_default_model,
call_llm,
)
from .llm_cleaner import clean_page_markers, is_chunk_valid
from .types import LLMProvider, SemanticChunk
logger: logging.Logger = logging.getLogger(__name__)
# =============================================================================
# Type Definitions for LLM Chunker
# =============================================================================

#: Unit type for semantic chunking (specific to this module's LLM output).
#: "main_content" is the generic fallback used when no finer type applies.
ChunkUnitType = Literal[
    "argument",
    "definition",
    "example",
    "citation",
    "exposition",
    "transition",
    "main_content",
]
class LLMChunkResponse(TypedDict, total=False):
    """Individual chunk structure as returned by the LLM.

    All keys are optional (total=False) because the LLM's JSON output
    may omit fields; consumers use .get() with defaults.

    Attributes:
        text: Chunk text content (exact copy from source)
        summary: Brief one-sentence summary
        concepts: Key concepts extracted (3-5 items)
        type: Semantic unit type
    """

    text: str
    summary: str
    concepts: List[str]
    type: str
class LLMChunksResult(TypedDict):
    """Complete response structure from LLM chunking.

    Attributes:
        chunks: List of chunk objects parsed from the LLM's JSON reply
    """

    chunks: List[LLMChunkResponse]


# Note: SemanticChunk is imported from utils.types
def extract_paragraph_number(text: str) -> Optional[int]:
    """Extract a paragraph number from the beginning of text.

    Many philosophical texts use numbered paragraphs. This function
    detects several numbering formats anchored at the start of the text.

    Args:
        text: Text content that may start with a paragraph number.

    Returns:
        The paragraph number if detected, None otherwise.

    Example:
        >>> extract_paragraph_number("9 On presente...")
        9
        >>> extract_paragraph_number("Normal text")
        None
    """
    text = text.strip()
    # Supported paragraph-number formats, tried in order.
    patterns: List[str] = [
        r'^(\d+)\s+[A-ZÀ-Ü]',  # "9 On présente..."
        r'^(\d+)[A-ZÀ-Ü]',     # "10Dans la classification..."
        # Bug fix: previously r'\s*(\d+)', which could never match "§ 15"
        # (\s* cannot consume '§') and instead matched ANY leading digits,
        # e.g. years at the start of a sentence. Anchor on the § sign.
        r'^§\s*(\d+)',         # "§ 15 ..."
        r'^\[(\d+)\]',         # "[9] ..."
        r'^(\d+)\.',           # "9. ..."
        r'^(\d+)\)',           # "9) ..."
    ]
    for pattern in patterns:
        match: Optional[re.Match[str]] = re.match(pattern, text)
        if match:
            try:
                return int(match.group(1))
            except ValueError:
                continue
    return None
def _extract_json_from_response(text: str) -> Dict[str, Any]:
    """Extract a JSON object from LLM response text.

    Tries, in order: JSON wrapped in <JSON></JSON> tags, then the widest
    brace-delimited substring of the raw text. Falls back to an empty
    chunks list if both parses fail.

    Args:
        text: Response text from LLM containing JSON.

    Returns:
        Parsed JSON as dictionary with 'chunks' key. Returns
        {"chunks": []} if parsing fails.
    """
    tagged: Optional[re.Match[str]] = re.search(
        r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL
    )
    if tagged:
        candidate: str = _clean_json_string(tagged.group(1))
        try:
            parsed: Dict[str, Any] = json.loads(candidate)
            return parsed
        except json.JSONDecodeError:
            # Tagged payload was malformed; fall through to the brace scan.
            pass

    left: int = text.find("{")
    right: int = text.rfind("}")
    if left != -1 and right > left:
        candidate = _clean_json_string(text[left:right + 1])
        try:
            parsed = json.loads(candidate)
            return parsed
        except json.JSONDecodeError as e:
            logger.warning(f"JSON invalide: {e}")
    return {"chunks": []}
def chunk_section_with_llm(
    section_content: str,
    section_title: str,
    chapter_title: Optional[str] = None,
    subsection_title: Optional[str] = None,
    section_level: int = 1,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.2,
    target_chunk_size: int = 400,
) -> List[SemanticChunk]:
    """Split a section into semantically coherent chunks using an LLM.

    This is the main semantic chunking function. It uses an LLM to identify
    natural semantic boundaries in academic/philosophical texts, preserving
    argumentative coherence and annotating each chunk with metadata.

    Args:
        section_content: The text content of the section to chunk.
        section_title: Title of the current section being chunked.
        chapter_title: Title of the parent chapter (level 1) for context.
        subsection_title: Title of parent subsection (level 2) if applicable.
        section_level: Hierarchy level (1=chapter, 2=section, etc.).
        model: LLM model name. If None, uses provider default.
        provider: LLM provider ("ollama" for local, "mistral" for API).
        temperature: LLM temperature (lower = more deterministic).
        target_chunk_size: Target number of words per chunk.

    Returns:
        List of SemanticChunk dictionaries containing text, summary,
        concepts, type, section_level, and optionally paragraph_number.

    Note:
        If section is shorter than 80% of target_chunk_size, it is returned
        as a single chunk. If LLM fails, returns section with error field.
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    # Strip page markers before measuring and chunking the content.
    content: str = clean_page_markers(section_content)
    # Short sections are kept whole: below 80% of the target size,
    # skip the LLM call entirely and return a single chunk.
    word_count: int = len(content.split())
    if word_count < target_chunk_size * 0.8:
        para_num: Optional[int] = extract_paragraph_number(content)
        chunk: SemanticChunk = {
            "text": content,
            "summary": section_title,
            "concepts": [],
            "type": "main_content",
            "section_level": section_level,
        }
        if para_num is not None:
            chunk["paragraph_number"] = para_num
        if subsection_title and subsection_title != section_title:
            chunk["subsection_title"] = subsection_title
        return [chunk]
    chapter_info: str = f"Chapitre: {chapter_title}\n" if chapter_title else ""
    # NOTE: the prompt is deliberately written in French to match the
    # corpus language; it asks the LLM to return chunks as JSON wrapped
    # in <JSON></JSON> tags (parsed by _extract_json_from_response).
    prompt = f"""Tu es un expert en analyse de textes académiques.
TÂCHE: Découper ce texte en unités sémantiques cohérentes.
{chapter_info}Section: {section_title}
RÈGLES DE DÉCOUPAGE:
1. Chaque chunk doit avoir un SENS COMPLET (une idée, un argument)
2. Taille idéale: {target_chunk_size - 100} à {target_chunk_size + 100} mots
3. NE PAS couper au milieu d'une phrase ou d'un paragraphe
4. NE PAS couper au milieu d'une citation
5. Regrouper les paragraphes qui développent la même idée
6. Un chunk peut être plus long si nécessaire pour préserver le sens
POUR CHAQUE CHUNK, INDIQUE:
- text: le texte exact (copié, pas reformulé)
- summary: résumé en 1 phrase courte
- concepts: 3-5 concepts clés (mots ou expressions)
- type: argument | définition | exemple | citation | exposition | transition
TEXTE À DÉCOUPER:
{content}
RÉPONDS avec un JSON entre <JSON></JSON>:
<JSON>
{{
"chunks": [
{{
"text": "Premier paragraphe ou groupe de paragraphes...",
"summary": "Présentation de l'idée principale",
"concepts": ["concept1", "concept2", "concept3"],
"type": "exposition"
}},
{{
"text": "Deuxième partie du texte...",
"summary": "Développement de l'argument",
"concepts": ["concept4", "concept5"],
"type": "argument"
}}
]
}}
</JSON>
"""
    logger.info(f"Chunking sémantique de '{section_title}' ({word_count} mots) via {provider.upper()}")
    try:
        response: str = call_llm(
            prompt, model=model, provider=provider, temperature=temperature, timeout=300
        )
        result: Dict[str, Any] = _extract_json_from_response(response)
        chunks: List[Dict[str, Any]] = result.get("chunks", [])
        # Validate each returned chunk and extract paragraph numbers.
        valid_chunks: List[SemanticChunk] = []
        for raw_chunk in chunks:
            text: str = raw_chunk.get("text", "")
            if is_chunk_valid(text):
                # Extract the paragraph number if one is present.
                para_num = extract_paragraph_number(text)
                chunk_data: SemanticChunk = {
                    "text": text,
                    "summary": raw_chunk.get("summary", ""),
                    "concepts": raw_chunk.get("concepts", []),
                    "type": raw_chunk.get("type", "main_content"),
                    "section_level": section_level,
                }
                # Attach the paragraph number when detected.
                if para_num is not None:
                    chunk_data["paragraph_number"] = para_num
                # Attach the full hierarchy context.
                if subsection_title and subsection_title != section_title:
                    chunk_data["subsection_title"] = subsection_title
                valid_chunks.append(chunk_data)
        # If no chunk survived validation, fall back to the whole content.
        if not valid_chunks:
            logger.warning(f"Aucun chunk valide pour '{section_title}', retour contenu complet")
            para_num = extract_paragraph_number(content)
            fallback: SemanticChunk = {
                "text": content,
                "summary": section_title,
                "concepts": [],
                "type": "main_content",
                "section_level": section_level,
            }
            if para_num is not None:
                fallback["paragraph_number"] = para_num
            return [fallback]
        logger.info(f"Section '{section_title}' découpée en {len(valid_chunks)} chunks")
        return valid_chunks
    except Exception as e:
        logger.error(f"Erreur chunking LLM: {e}")
        # Fallback: return the whole content, recording the error.
        para_num = extract_paragraph_number(content)
        fallback_err: SemanticChunk = {
            "text": content,
            "summary": section_title,
            "concepts": [],
            "type": "main_content",
            "section_level": section_level,
            "error": str(e),
        }
        if para_num is not None:
            fallback_err["paragraph_number"] = para_num
        return [fallback_err]
def simple_chunk_by_paragraphs(
    content: str,
    max_words: int = 500,
    min_words: int = 100,
) -> List[str]:
    """Chunk text on paragraph boundaries without any LLM call.

    Fast, deterministic fallback used when semantic (LLM) chunking is not
    wanted. Paragraphs (separated by blank lines) are packed together until
    ``max_words`` is reached; a paragraph that alone exceeds the budget is
    split at sentence boundaries instead.

    Args:
        content: Text to split into chunks.
        max_words: Upper bound of words per chunk. Defaults to 500.
        min_words: Chunks below this size are discarded, unless only a
            single chunk was produced. Defaults to 100.

    Returns:
        List of chunk strings, in document order.

    Example:
        >>> chunks = simple_chunk_by_paragraphs(text, max_words=400)
        >>> len(chunks)
        3
    """
    content = clean_page_markers(content)
    pieces: List[str] = []        # completed chunks
    buffer: List[str] = []        # fragments of the chunk under construction
    buffer_words: int = 0

    def flush() -> None:
        # Close the current chunk, if any, and reset the accumulator.
        nonlocal buffer, buffer_words
        if buffer:
            pieces.append('\n\n'.join(buffer))
            buffer = []
            buffer_words = 0

    # Paragraphs are delimited by one or more blank lines.
    for paragraph in re.split(r'\n\n+', content):
        paragraph = paragraph.strip()
        if not paragraph:
            continue
        n_words: int = len(paragraph.split())
        if n_words > max_words:
            # Oversized paragraph: close the running chunk, then pack it
            # sentence by sentence.
            flush()
            for sentence in re.split(r'(?<=[.!?])\s+', paragraph):
                s_words: int = len(sentence.split())
                if buffer and buffer_words + s_words > max_words:
                    pieces.append('\n\n'.join(buffer))
                    buffer = [sentence]
                    buffer_words = s_words
                else:
                    buffer.append(sentence)
                    buffer_words += s_words
        elif buffer_words + n_words > max_words:
            # Paragraph fits a chunk on its own but not the current one.
            flush()
            buffer = [paragraph]
            buffer_words = n_words
        else:
            buffer.append(paragraph)
            buffer_words += n_words
    flush()

    # Keep everything when a single chunk came out; otherwise drop runts.
    if len(pieces) == 1:
        return pieces
    return [chunk for chunk in pieces if len(chunk.split()) >= min_words]
def extract_concepts_from_chunk(
    chunk_text: str,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
) -> List[str]:
    """Ask an LLM for the 3-5 key concepts contained in a text chunk.

    Intended to enrich chunks produced without LLM assistance, or to pull
    extra concepts out of existing chunks.

    Args:
        chunk_text: Text to analyse.
        model: LLM model name; resolved to the provider default when None.
        provider: LLM provider ("ollama" or "mistral").

    Returns:
        At most five concepts (short words or phrases). Empty list when
        the text is too short (< 100 chars) or extraction fails.

    Example:
        >>> concepts = extract_concepts_from_chunk("L'etre-pour-la-mort...")
        >>> concepts
        ['etre-pour-la-mort', 'structure existentiale', 'Dasein']
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    if len(chunk_text) < 100:
        # Not enough material to extract meaningful concepts.
        return []
    prompt: str = f"""Extrait les 3-5 concepts clés de ce texte.
Un concept = un mot ou une expression courte (2-3 mots max).
Texte:
{chunk_text[:1500]}
Réponds avec une liste JSON simple:
["concept1", "concept2", "concept3"]
"""
    try:
        reply: str = call_llm(prompt, model=model, provider=provider, temperature=0.1, timeout=60)
        # The reply should contain a bare JSON array; grab the first one.
        found: Optional[re.Match[str]] = re.search(r'\[.*?\]', reply, re.DOTALL)
        if found is None:
            return []
        parsed: List[str] = json.loads(found.group())
        return parsed[:5]  # cap at five concepts
    except Exception as e:
        logger.warning(f"Erreur extraction concepts: {e}")
        return []

View File

@@ -0,0 +1,582 @@
"""LLM-based section classification module for document structure analysis.
This module provides functionality to classify document sections by type
(front_matter, chapter, appendix, etc.) using Large Language Models and
determine which sections should be indexed for semantic search.
Key Features:
- Section classification via LLM (classify_sections)
- Automatic TOC/metadata section exclusion (is_excluded_section)
- Post-classification validation (validate_classified_sections)
- Filtering for indexable content (filter_indexable_sections)
Section Types:
The following section types are recognized:
**Indexable Content (should_index=True):**
- chapter: Main document content, essays, articles, book reviews
- introduction: Document introductions
- conclusion: Document conclusions
- preface: Prefaces, forewords, warnings (intellectual content)
- abstract: Summaries, abstracts
**Non-Indexable Content (should_index=False):**
- front_matter: Title pages, copyright, credits, colophon
- toc_display: Table of contents display (not content)
- appendix: Document appendices
- bibliography: References, bibliography
- index: Document index
- notes: End notes
- ignore: Ads, empty pages, technical metadata
Classification Strategy:
1. LLM analyzes section titles and content previews
2. Automatic exclusion rules catch common TOC/metadata patterns
3. Post-classification validation detects false positives
4. Filtering extracts only indexable content
Typical Usage:
>>> from utils.llm_classifier import classify_sections, filter_indexable_sections
>>> sections = [
... {"title": "Table of Contents", "content": "...", "level": 1},
... {"title": "Introduction", "content": "...", "level": 1},
... {"title": "Chapter 1", "content": "...", "level": 1}
... ]
>>> classified = classify_sections(sections, provider="ollama")
>>> indexable = filter_indexable_sections(classified)
>>> print([s["title"] for s in indexable])
['Introduction', 'Chapter 1']
LLM Provider Options:
- "ollama": Local processing, free but slower
- "mistral": Cloud API, faster but incurs costs
Note:
The classifier is designed to handle edge cases like:
- Book reviews with analytical content (classified as chapter)
- Editor's notes without analysis (classified as front_matter)
- TOC fragments embedded in content (detected and excluded)
See Also:
- llm_toc: Table of contents extraction
- llm_chunker: Semantic chunking of classified sections
- llm_metadata: Document metadata extraction
"""
from __future__ import annotations
import json
import logging
import re
from typing import cast, Any, Dict, Final
from .llm_structurer import (
_clean_json_string,
_get_default_mistral_model,
_get_default_model,
call_llm,
)
from .types import LLMProvider
logger: logging.Logger = logging.getLogger(__name__)
# Section types the LLM classifier may assign. Keys are the machine labels
# used throughout the pipeline; values are French descriptions injected
# verbatim into the classification prompt, so they must stay in French.
SECTION_TYPES: Final[dict[str, str]] = {
    "front_matter": "Métadonnées, page de titre, copyright, crédits, NOTE DE L'ÉDITEUR, colophon",
    "toc_display": "Table des matières affichée (pas le contenu)",
    "preface": "Préface, avant-propos, avertissement (contenu intellectuel à indexer)",
    "abstract": "Résumé, abstract",
    "introduction": "Introduction de l'œuvre",
    "chapter": "Chapitre principal du document",
    "conclusion": "Conclusion de l'œuvre",
    "appendix": "Annexes",
    "bibliography": "Bibliographie, références",
    "index": "Index",
    "notes": "Notes de fin",
    "ignore": "À ignorer (publicités, pages vides, métadonnées techniques)",
}
def _extract_json_from_response(text: str) -> dict[str, Any]:
    """Parse the JSON payload out of a raw LLM reply.

    Two layouts are tried in order: an explicit <JSON>...</JSON> envelope,
    then the outermost {...} span found anywhere in the reply.

    Args:
        text: Raw LLM response text.

    Returns:
        Parsed JSON as dictionary. Returns {"classifications": []} when
        neither layout yields valid JSON.
    """
    tagged: re.Match[str] | None = re.search(
        r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL
    )
    if tagged:
        candidate: str = _clean_json_string(tagged.group(1))
        try:
            parsed: Dict[str, Any] = json.loads(candidate)
            return parsed
        except json.JSONDecodeError:
            pass  # fall through to the brace-scan heuristic
    opening: int = text.find("{")
    closing: int = text.rfind("}")
    if opening != -1 and closing > opening:
        candidate = _clean_json_string(text[opening:closing + 1])
        try:
            parsed = json.loads(candidate)
            return parsed
        except json.JSONDecodeError as e:
            logger.warning(f"JSON invalide: {e}")
    return {"classifications": []}
def classify_sections(
    sections: list[dict[str, Any]],
    document_title: str | None = None,
    model: str | None = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.1,
) -> list[dict[str, Any]]:
    """Classify document sections by type using LLM.

    Uses an LLM to analyze section titles and content previews to determine
    the type of each section (chapter, front_matter, toc_display, etc.) and
    whether it should be indexed for semantic search.

    Args:
        sections: List of section dictionaries with keys:
            - title: Section title
            - content: Section content (preview used)
            - level: Hierarchy level (1=chapter, 2=section, etc.)
        document_title: Optional document title for context.
        model: LLM model name. If None, uses provider default.
        provider: LLM provider ("ollama" or "mistral").
        temperature: Model temperature (0.0-1.0). Lower = more deterministic.

    Returns:
        Same sections list with added classification fields:
        - type: Section type (SectionType literal)
        - should_index: Whether to include in vector index
        - chapter_number: Chapter number if applicable
        - classification_reason: Explanation for the classification

    Example:
        >>> sections = [{"title": "Introduction", "content": "...", "level": 1}]
        >>> classified = classify_sections(sections, provider="ollama")
        >>> classified[0]["type"]
        'introduction'
        >>> classified[0]["should_index"]
        True
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    # Build a compact view of each section (index, title, short preview,
    # hierarchy level) to keep the prompt within a reasonable size.
    sections_for_prompt: list[dict[str, Any]] = []
    for i, section in enumerate(sections[:50]):  # cap at 50 sections to bound the prompt
        sections_for_prompt.append({
            "index": i,
            "title": section.get("title", ""),
            "preview": section.get("content", "")[:200] if section.get("content") else "",
            "level": section.get("level", 1),
        })
    types_description: str = "\n".join([f"- {k}: {v}" for k, v in SECTION_TYPES.items()])
    title_context: str = f"Titre du document: {document_title}\n" if document_title else ""
    # NOTE: the prompt is deliberately in French — the corpus and the
    # SECTION_TYPES descriptions it embeds are French. Do not translate it.
    prompt: str = f"""Tu es un expert en analyse de structure documentaire.
TÂCHE: Classifier chaque section selon son type.
{title_context}
TYPES DISPONIBLES:
{types_description}
RÈGLES:
1. "front_matter": UNIQUEMENT pages de titre SANS contenu, copyright, colophon (métadonnées pures)
2. "toc_display": la TABLE DES MATIÈRES elle-même (pas son contenu)
3. "preface": préface, avant-propos, avertissement (À INDEXER car contenu intellectuel)
4. "chapter": TOUT contenu principal - chapitres, sections, articles, revues de livre, essais
5. "ignore": publicités, pages vides, métadonnées techniques sans valeur
IMPORTANT - REVUES DE LIVRE ET ARTICLES:
- Une REVUE DE LIVRE ("Book Review") avec analyse critique → chapter, should_index = true
- Un ARTICLE académique avec contenu substantiel → chapter, should_index = true
- Les métadonnées éditoriales (auteur, affiliation, journal) au début d'un article NE sont PAS un motif pour classer comme "front_matter"
- Si le document contient un TEXTE ANALYTIQUE développé → chapter
CAS PARTICULIERS:
- "NOTE DE L'ÉDITEUR" (infos édition, réimpression, SANS analyse) → front_matter, should_index = false
- "PRÉFACE" ou "AVANT-PROPOS" (texte intellectuel) → preface, should_index = true
- "Book Review" ou "Article" avec paragraphes d'analyse → chapter, should_index = true
INDEXATION:
- should_index = true pour: preface, introduction, chapter, conclusion, abstract
- should_index = false pour: front_matter, toc_display, ignore
⚠️ ATTENTION AUX FAUX POSITIFS - LISTE DE TITRES VS CONTENU RÉEL:
LISTE DE TITRES (toc_display, should_index=false):
- Suite de titres courts sans texte explicatif
- Lignes commençant par "Comment...", "Où...", "Les dispositions à..."
- Énumération de sections sans phrase complète
- Exemple: "Comment fixer la croyance?\\nOù la croyance s'oppose au savoir\\nL'idéal de rationalité"
CONTENU RÉEL (chapter, should_index=true):
- Texte avec phrases complètes et verbes conjugués
- Paragraphes développés avec arguments
- Explications, définitions, raisonnements
- Exemple: "Comment fixer la croyance? Cette question se pose dès lors que..."
SECTIONS À CLASSIFIER:
{json.dumps(sections_for_prompt, ensure_ascii=False, indent=2)}
RÉPONDS avec un JSON entre <JSON></JSON>:
<JSON>
{{
  "classifications": [
    {{
      "index": 0,
      "type": "front_matter",
      "should_index": false,
      "chapter_number": null,
      "reason": "Page de titre avec métadonnées éditeur"
    }},
    {{
      "index": 1,
      "type": "chapter",
      "should_index": true,
      "chapter_number": 1,
      "reason": "Premier chapitre du document"
    }}
  ]
}}
</JSON>
"""
    logger.info(f"Classification de {len(sections_for_prompt)} sections via {provider.upper()} ({model})")
    try:
        response: str = call_llm(prompt, model=model, provider=provider, temperature=temperature, timeout=300)
        result: dict[str, Any] = _extract_json_from_response(response)
        classifications: list[dict[str, Any]] = result.get("classifications", [])
        # Map each returned classification back to its section index.
        class_map: dict[int, dict[str, Any]] = {
            c["index"]: c for c in classifications if "index" in c
        }
        # Apply the classifications in place, with safe defaults for any
        # section the LLM did not cover.
        for i, section in enumerate(sections):
            if i in class_map:
                c: dict[str, Any] = class_map[i]
                section["type"] = c.get("type", "chapter")
                section["should_index"] = c.get("should_index", True)
                section["chapter_number"] = c.get("chapter_number")
                section["classification_reason"] = c.get("reason", "")
            else:
                # Default: treat unclassified sections as indexable content.
                section["type"] = "chapter"
                section["should_index"] = True
                section["chapter_number"] = None
        # Log a per-type histogram for observability.
        types_count: dict[str, int] = {}
        for s in sections:
            t: str = s.get("type", "unknown")
            types_count[t] = types_count.get(t, 0) + 1
        logger.info(f"Classification terminée: {types_count}")
        return sections
    except Exception as e:
        logger.error(f"Erreur classification sections: {e}")
        # Fail open: on LLM failure, mark everything indexable rather than
        # silently dropping content.
        for section in sections:
            section["type"] = "chapter"
            section["should_index"] = True
        return sections
# Section titles that are always excluded from indexing (matched
# case-insensitively, as substrings of the section or chapter title).
EXCLUDED_SECTION_TITLES: Final[list[str]] = [
    "table des matières",
    "table des matieres",
    "sommaire",
    "table of contents",
    "contents",
    "toc",
    "index",
    "liste des figures",
    "liste des tableaux",
    "list of figures",
    "list of tables",
    "note de l'éditeur",
    "note de l'editeur",
    "note de la rédaction",
    "copyright",
    "mentions légales",
    "crédits",
    "colophon",
    "achevé d'imprimer",
]


def is_excluded_section(section: dict[str, Any]) -> bool:
    """Decide whether a section must be kept out of the index.

    A section is excluded when its own title (or its parent chapter's
    title) matches a known TOC/metadata pattern, or when its content
    looks like a bare list of headings rather than running prose.

    Args:
        section: Section dictionary; the optional keys "title",
            "chapterTitle" and "content" are inspected.

    Returns:
        True when the section should be excluded from indexing.

    Example:
        >>> is_excluded_section({"title": "Table des matières"})
        True
        >>> is_excluded_section({"title": "Introduction", "content": "..."})
        False
    """
    title: str = section.get("title", "").lower().strip()
    parent_title: str = section.get("chapterTitle", "").lower().strip()

    # Title-based exclusion: substring match against the known patterns.
    if any(pattern in title for pattern in EXCLUDED_SECTION_TITLES):
        return True
    if any(pattern in parent_title for pattern in EXCLUDED_SECTION_TITLES):
        return True

    body: str = section.get("content", "")
    if not body:
        return False

    lines: list[str] = [ln.strip() for ln in body.split("\n") if ln.strip()]
    if len(lines) < 3:
        # Too little text for a reliable TOC-shape diagnosis.
        return False

    # Heuristic 1: average line length (TOC lines are short).
    mean_length: float = sum(len(ln) for ln in lines) / len(lines)
    # Heuristic 2: every inspected line is short (< 100 chars).
    head_all_short: bool = all(len(ln) < 100 for ln in lines[:10])
    # Heuristic 3: lines shaped like French section headings.
    heading_patterns: list[str] = [
        r'^Comment\s+.+\?',            # "Comment fixer la croyance?"
        r'^Où\s+.+',                   # "Où la croyance s'oppose"
        r'^Les?\s+\w+\s+à\s+',         # "Les dispositions à penser"
        r'^Que\s+.+\?',                # "Que peut-on savoir?"
        r'^L[ae]\s+\w+\s+(de|du)\s+',  # "La critique de l'intuition"
        r'^Entre\s+.+\s+et\s+',        # "Entre nature et norme"
    ]
    heading_hits: int = 0
    for ln in lines[:10]:
        if any(re.match(p, ln, re.IGNORECASE) for p in heading_patterns):
            heading_hits += 1
    # Heuristic 4: conjugated verbs typical of narrative prose.
    narrative_verbs: list[str] = [
        r'\best\b', r'\bsont\b', r'\bétait\b', r'\bsera\b',
        r'\ba\b', r'\bont\b', r'\bavait\b', r'\bavaient\b',
        r'\bfait\b', r'\bdit\b', r'\bpense\b', r'\bexplique\b'
    ]
    has_narrative: bool = any(
        re.search(verb, ln, re.IGNORECASE)
        for ln in lines[:5]
        for verb in narrative_verbs
    )

    # Verdict: looks like a TOC when the lines are uniformly short AND
    # either many of them are heading-shaped or none read as prose.
    if len(lines) >= 5 and mean_length < 50 and head_all_short:
        if heading_hits >= len(lines) * 0.4 or not has_narrative:
            logger.debug(f"Section '{title}' exclue: ressemble à une TOC (lignes courtes, {heading_hits}/{len(lines)} titres)")
            return True
    return False
def filter_indexable_sections(sections: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Keep only the sections worth indexing.

    Three exclusion gates are applied, in order:
    1. Automatic title/content exclusion (is_excluded_section).
    2. Parent-chapter exclusion when the chapter title matches a
       TOC/metadata pattern.
    3. The LLM-assigned should_index flag (missing flag means keep).

    Args:
        sections: List of classified section dictionaries.

    Returns:
        New list with only the indexable sections, original order kept.

    Example:
        >>> sections = [
        ...     {"title": "TOC", "should_index": False},
        ...     {"title": "Chapter 1", "should_index": True}
        ... ]
        >>> filtered = filter_indexable_sections(sections)
        >>> len(filtered)
        1
    """
    keep: list[dict[str, Any]] = []
    dropped: int = 0
    for section in sections:
        # Gate 1: automatic exclusion rules.
        if is_excluded_section(section):
            logger.info(f"Section exclue automatiquement: '{section.get('title', 'Sans titre')}'")
            dropped += 1
            continue
        # Gate 2: parent chapter is itself a TOC/metadata section.
        parent: str = section.get("chapterTitle", "").lower().strip()
        if any(pattern in parent for pattern in EXCLUDED_SECTION_TITLES):
            logger.info(f"Section exclue (chapitre TOC): '{section.get('title', 'Sans titre')}' dans '{parent}'")
            dropped += 1
            continue
        # Gate 3: LLM classification verdict.
        if section.get("should_index", True):
            keep.append(section)
        else:
            dropped += 1
    if dropped > 0:
        logger.info(f"Sections exclues: {dropped}, indexables: {len(keep)}")
    return keep
def validate_classified_sections(sections: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Post-classification validation to detect false positives.

    Performs additional checks on sections marked should_index=True to catch
    TOC fragments that escaped initial classification:

    1. Parent chapter is a TOC -> exclude (reclassified as toc_display).
    2. Content is mostly short title-like lines -> reclassify as toc_display.

    Args:
        sections: List of already-classified section dictionaries. Sections
            are mutated in place when a correction applies.

    Returns:
        Validated sections with corrections applied. Corrections are logged
        and recorded in the 'validation_correction' field.

    Note:
        The parent-chapter title is read from both "chapter_title" and
        "chapterTitle": the rest of this module (is_excluded_section,
        filter_indexable_sections) uses the camelCase key, so checking
        only the snake_case key would make gate 1 a no-op on that data.

    Example:
        >>> sections = [{"title": "Part 1", "should_index": True, "content": "..."}]
        >>> validated = validate_classified_sections(sections)
        >>> # May reclassify sections with TOC-like content
    """
    validated: list[dict[str, Any]] = []
    fixed_count: int = 0
    for section in sections:
        # Accept both key spellings for the parent chapter title (see Note).
        chapter_title: str = (
            section.get("chapter_title") or section.get("chapterTitle") or ""
        ).lower().strip()
        # Check 1: exclude any section whose parent chapter is a TOC.
        is_toc_chapter: bool = False
        for excluded in EXCLUDED_SECTION_TITLES:
            if excluded in chapter_title:
                logger.warning(f"Section '{section.get('title', 'Sans titre')}' exclue: chapitre parent est '{chapter_title}'")
                section["should_index"] = False
                section["type"] = "toc_display"
                section["validation_correction"] = f"Exclue car chapitre parent = {chapter_title}"
                fixed_count += 1
                is_toc_chapter = True
                break
        if is_toc_chapter:
            validated.append(section)
            continue
        # Already marked non-indexable: keep as is.
        if not section.get("should_index", True):
            validated.append(section)
            continue
        content: str = section.get("content", "")
        # Check 2: content that is mostly short, heading-shaped lines is a
        # TOC fragment, not real prose.
        if content:
            lines: list[str] = [l.strip() for l in content.split("\n") if l.strip()]
            if len(lines) < 3:
                # Too little text to diagnose; keep the section.
                validated.append(section)
                continue
            title_question_pattern: str = r'^(Comment|Où|Que|Quelle|Quel|Les?\s+\w+\s+(de|du|à)|Entre\s+.+\s+et)\s+'
            title_like: int = sum(1 for l in lines if re.match(title_question_pattern, l, re.IGNORECASE))
            avg_len: float = sum(len(l) for l in lines) / len(lines)
            if len(lines) >= 4 and title_like >= len(lines) * 0.5 and avg_len < 55:
                # Probably a list of headings lifted from the TOC.
                logger.warning(f"Section '{section.get('title', 'Sans titre')}' reclassée: détectée comme liste de titres TOC")
                section["should_index"] = False
                section["type"] = "toc_display"
                section["validation_correction"] = "Reclassée comme toc_display (liste de titres)"
                fixed_count += 1
        validated.append(section)
    if fixed_count > 0:
        logger.info(f"Validation post-classification: {fixed_count} section(s) reclassée(s)")
    return validated
def get_chapter_sections(sections: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Return only the sections that carry main document content.

    Content-bearing types are: chapter, introduction, conclusion, abstract
    and preface. Everything else (TOC, bibliography, index, notes, front
    matter, ...) is dropped.

    Args:
        sections: List of classified section dictionaries.

    Returns:
        New list with the content-bearing sections, original order kept.

    Example:
        >>> sections = [
        ...     {"title": "TOC", "type": "toc_display"},
        ...     {"title": "Chapter 1", "type": "chapter"}
        ... ]
        >>> chapters = get_chapter_sections(sections)
        >>> len(chapters)
        1
    """
    content_types: set[str] = {"introduction", "preface", "chapter", "abstract", "conclusion"}
    selected: list[dict[str, Any]] = []
    for section in sections:
        if section.get("type") in content_types:
            selected.append(section)
    return selected

View File

@@ -0,0 +1,389 @@
"""Text cleaning and validation for OCR-extracted content.
This module provides utilities for cleaning OCR artifacts from extracted text,
validating chunk content, and optionally using LLM for intelligent corrections.
It handles common OCR issues like page markers, isolated page numbers,
repeated headers/footers, and character recognition errors.
Overview:
The module offers three levels of cleaning:
1. **Basic cleaning** (clean_page_markers, clean_ocr_artifacts):
Fast regex-based cleaning for common issues. Always applied.
2. **LLM-enhanced cleaning** (clean_content_with_llm):
Uses an LLM to correct subtle OCR errors while preserving meaning.
Only applied when explicitly requested and for medium-length texts.
3. **Validation** (is_chunk_valid):
Checks if a text chunk contains meaningful content.
Cleaning Operations:
- Remove page markers (<!-- Page X -->)
- Remove isolated page numbers
- Remove short/repetitive header/footer lines
- Normalize multiple spaces and blank lines
- Correct obvious OCR character errors (LLM mode)
- Preserve citations, technical vocabulary, paragraph structure
Validation Criteria:
- Minimum character count (default: 20)
- Minimum word count (default: 5)
- Not pure metadata (URLs, ISBNs, DOIs, copyright notices)
LLM Provider Support:
- ollama: Local LLM (free, slower, default)
- mistral: Mistral API (faster, requires API key)
Example:
>>> from utils.llm_cleaner import clean_chunk, is_chunk_valid
>>>
>>> # Clean a chunk with basic cleaning only
>>> text = "<!-- Page 42 --> Some philosophical content..."
>>> cleaned = clean_chunk(text)
>>> print(cleaned)
'Some philosophical content...'
>>>
>>> # Validate chunk before processing
>>> if is_chunk_valid(cleaned):
... process_chunk(cleaned)
See Also:
utils.llm_chunker: Semantic chunking of sections
utils.llm_validator: Document validation and concept extraction
utils.pdf_pipeline: Main pipeline orchestration
"""
from __future__ import annotations
import logging
import re
from typing import List, Optional, Pattern
from .llm_structurer import call_llm, _get_default_model, _get_default_mistral_model
from .types import LLMProvider
logger: logging.Logger = logging.getLogger(__name__)
# Type alias for compiled regex patterns
RegexPattern = Pattern[str]
def clean_page_markers(text: str) -> str:
    r"""Strip OCR page-boundary markers and collapse excess blank lines.

    The OCR stage inserts HTML comments of the form '<!-- Page 42 -->' to
    record page boundaries. This removes them, then squeezes any run of
    three or more newlines down to a single blank line.

    Args:
        text: Text possibly containing page markers and stacked newlines.

    Returns:
        Cleaned text, stripped of leading/trailing whitespace, with at most
        two consecutive newlines anywhere.

    Example:
        >>> text = "<!-- Page 1 -->\nContent here\n\n\n\n<!-- Page 2 -->"
        >>> clean_page_markers(text)
        'Content here'
    """
    without_markers: str = re.sub(r'<!--\s*Page\s*\d+\s*-->', '', text)
    collapsed: str = re.sub(r'\n{3,}', '\n\n', without_markers)
    return collapsed.strip()
def clean_ocr_artifacts(text: str) -> str:
    r"""Rule-based removal of common OCR artifacts (no LLM involved).

    Performs fast cleaning of typical OCR noise:
    - isolated page numbers (a line holding only 1-4 digits);
    - stray short lines (<= 3 chars) that are usually header/footer
      residue — except markdown headings starting with '#';
    - runs of spaces (collapsed to one);
    - runs of 3+ newlines (collapsed to a single blank line).

    Single blank lines are preserved as paragraph breaks.

    Args:
        text: Raw OCR-extracted text.

    Returns:
        Cleaned text with artifacts removed and spacing normalized,
        stripped of leading/trailing whitespace.

    Example:
        >>> text = "42\n\nActual content here\n\n\n\n\nMore text"
        >>> clean_ocr_artifacts(text)
        'Actual content here\n\nMore text'

    Note:
        Always applied by clean_chunk(), so a baseline of cleaning happens
        even when LLM cleaning is disabled.
    """
    # Blank out isolated page numbers first.
    text = re.sub(r'^\d{1,4}\s*$', '', text, flags=re.MULTILINE)
    kept: List[str] = []
    for raw_line in text.split('\n'):
        trimmed: str = raw_line.strip()
        if not trimmed:
            kept.append('')  # keep single blank lines as paragraph breaks
        elif len(trimmed) > 3 or trimmed.startswith('#'):
            kept.append(raw_line)
        # else: short stray line (likely header/footer residue) -> drop
    text = '\n'.join(kept)
    text = re.sub(r' {2,}', ' ', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
def clean_content_with_llm(
    text: str,
    context: Optional[str] = None,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.1,
) -> str:
    """Correct subtle OCR errors with an LLM, with rule-based fallbacks.

    The model is told to fix obvious technical errors (misrecognized
    characters, leftover page artifacts) while leaving the intellectual
    content untouched: no rephrasing, no summarizing, no additions.

    Safeguards:
    - texts shorter than 50 chars (stripped) get only page-marker cleanup;
    - texts longer than 3000 chars get only rule-based cleanup
      (LLM timeout risk);
    - if the LLM output differs in length by more than 50% from the
      input, it is discarded in favor of the rule-based result.

    Args:
        text: Text to clean (50-3000 chars for LLM processing).
        context: Optional document context (title, subject) to help the
            model correct ambiguous characters.
        model: LLM model name; resolved to the provider default when None.
        provider: LLM provider ("ollama" or "mistral").
        temperature: Sampling temperature; low values give deterministic
            corrections. Defaults to 0.1.

    Returns:
        Cleaned text. Never raises: any LLM failure falls back to the
        rule-based cleaning functions.
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    # Guard: too little text to bother the LLM with.
    if len(text.strip()) < 50:
        return clean_page_markers(text)
    # Guard: too much text for one LLM call — rule-based cleaning only.
    if len(text) > 3000:
        return clean_page_markers(clean_ocr_artifacts(text))
    context_info: str = f"Contexte: {context}\n" if context else ""
    prompt: str = f"""Tu es un expert en correction de textes OCRisés.
TÂCHE: Nettoyer ce texte extrait par OCR.
{context_info}
ACTIONS À EFFECTUER:
1. Supprimer les marqueurs de page (<!-- Page X -->)
2. Corriger les erreurs OCR ÉVIDENTES (caractères mal reconnus)
3. Supprimer les artefacts (numéros de page isolés, en-têtes répétés)
4. Normaliser la ponctuation et les espaces
RÈGLES STRICTES:
- NE PAS modifier le sens ou le contenu intellectuel
- NE PAS reformuler ou résumer
- NE PAS ajouter de contenu
- Préserver les citations et le vocabulaire technique
- Garder la structure des paragraphes
TEXTE À NETTOYER:
{text}
RÉPONDS UNIQUEMENT avec le texte nettoyé, sans commentaires ni balises."""
    try:
        cleaned: str = call_llm(
            prompt, model=model, provider=provider, temperature=temperature, timeout=120
        ).strip()
        # Sanity check: reject outputs whose length drifted too far from
        # the input — the model probably rewrote instead of cleaning.
        if not (len(text) * 0.5 <= len(cleaned) <= len(text) * 1.5):
            logger.warning("LLM a trop modifié le texte, utilisation du nettoyage basique")
            return clean_page_markers(clean_ocr_artifacts(text))
        return cleaned
    except Exception as e:
        logger.warning(f"Erreur nettoyage LLM: {e}, utilisation du nettoyage basique")
        return clean_page_markers(clean_ocr_artifacts(text))
def clean_chunk(
    chunk_text: str,
    use_llm: bool = False,
    context: Optional[str] = None,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
) -> str:
    r"""Clean a text chunk, optionally finishing with an LLM pass.

    Main entry point for chunk cleaning. Page markers and OCR artifacts
    are always removed; when ``use_llm`` is True and the cleaned text is
    at least 50 characters long, an LLM pass corrects subtler errors.

    Args:
        chunk_text: Raw chunk text.
        use_llm: Enable the (slower, higher-quality) LLM pass.
            Defaults to False.
        context: Document context (title, chapter) forwarded to the LLM;
            ignored when use_llm is False.
        model: LLM model name (provider default when None); ignored when
            use_llm is False.
        provider: LLM provider ("ollama" or "mistral"); ignored when
            use_llm is False.

    Returns:
        Cleaned chunk text, ready for indexing.

    Example:
        >>> chunk = "<!-- Page 5 -->\n42\n\nThe concept of being..."
        >>> clean_chunk(chunk)
        'The concept of being...'

    See Also:
        is_chunk_valid: Validate cleaned chunks before processing.
    """
    # Rule-based cleaning is always applied.
    cleaned: str = clean_ocr_artifacts(clean_page_markers(chunk_text))
    # Optional LLM pass for texts with enough material.
    if use_llm and len(cleaned) >= 50:
        cleaned = clean_content_with_llm(cleaned, context=context, model=model, provider=provider)
    return cleaned
def is_chunk_valid(chunk_text: str, min_chars: int = 20, min_words: int = 5) -> bool:
    """Check if a text chunk contains meaningful content.

    Validates that a chunk has sufficient length and is not purely
    metadata or boilerplate content. Used to filter out non-content
    chunks before indexing.

    Validation criteria:
        1. Character count >= min_chars (after page marker removal)
        2. Word count >= min_words
        3. Not matching metadata patterns (URLs, ISBNs, DOIs, dates, copyright)

    Args:
        chunk_text: Text content of the chunk to validate. Page markers
            are removed before validation.
        min_chars: Minimum number of characters required. Defaults to 20.
            Chunks shorter than this are considered invalid.
        min_words: Minimum number of words required. Defaults to 5.
            Chunks with fewer words are considered invalid.

    Returns:
        True if the chunk passes all validation criteria and contains
        meaningful content suitable for indexing. False otherwise.

    Example:
        >>> is_chunk_valid("The concept of Dasein is central to Heidegger.")
        True
        >>> is_chunk_valid("42")  # Too short
        False
        >>> is_chunk_valid("ISBN 978-0-123456-78-9")  # Metadata
        False
        >>> is_chunk_valid("https://example.com/page")  # URL
        False

    Note:
        Metadata patterns checked:
        - URLs (http://, https://)
        - Dates (YYYY-MM-DD format)
        - ISBN numbers
        - DOI identifiers
        - Copyright notices (©)
    """
    text: str = clean_page_markers(chunk_text).strip()
    # Length check.
    if len(text) < min_chars:
        return False
    # Word-count check.
    words: List[str] = text.split()
    if len(words) < min_words:
        return False
    # Reject chunks that are pure metadata.
    # BUGFIX: the last entry used to be the empty pattern r'', and
    # re.match(r'', text) matches EVERY string, so every chunk was
    # rejected. Restored the copyright pattern documented above.
    metadata_patterns: List[str] = [
        r'^https?://',
        r'^\d{4}-\d{2}-\d{2}$',
        r'^ISBN',
        r'^DOI',
        r'^©',
    ]
    pattern: str
    for pattern in metadata_patterns:
        if re.match(pattern, text, re.IGNORECASE):
            return False
    return True

View File

@@ -0,0 +1,294 @@
r"""LLM-based bibliographic metadata extraction from documents.
This module extracts bibliographic metadata (title, author, publisher, year, etc.)
from document text using Large Language Models. It supports both local (Ollama)
and cloud-based (Mistral API) LLM providers.
The extraction process:
1. Takes the first N characters of the document markdown (typically first pages)
2. Sends a structured prompt to the LLM requesting JSON-formatted metadata
3. Parses the LLM response to extract the JSON data
4. Applies default values and cleanup for missing/invalid fields
Supported metadata fields:
- title: Document title (including subtitle if present)
- author: Primary author name
- collection: Series or collection name
- publisher: Publisher name
- year: Publication year
- doi: Digital Object Identifier
- isbn: ISBN number
- language: ISO 639-1 language code (default: "fr")
- confidence: Dict of confidence scores per field (0.0-1.0)
LLM Provider Differences:
- **Ollama** (local): Free, slower, requires local installation.
Uses models like "mistral", "llama2", "mixtral".
- **Mistral API** (cloud): Fast, paid (~0.002€/call for small prompts).
Uses models like "mistral-small-latest", "mistral-medium-latest".
Cost Implications:
- Ollama: No API cost, only local compute resources
- Mistral API: ~0.002€ per metadata extraction call (small prompt)
Example:
>>> from utils.llm_metadata import extract_metadata
>>>
>>> markdown = '''
... # La technique et le temps
... ## Tome 1 : La faute d'Épiméthée
...
... Bernard Stiegler
...
... Éditions Galilée, 1994
... '''
>>>
>>> metadata = extract_metadata(markdown, provider="ollama")
>>> print(metadata)
{
'title': 'La technique et le temps. Tome 1 : La faute d\'Épiméthée',
'author': 'Bernard Stiegler',
'publisher': 'Éditions Galilée',
'year': 1994,
'language': 'fr',
'confidence': {'title': 0.95, 'author': 0.98}
}
See Also:
- llm_toc: Table of contents extraction via LLM
- llm_structurer: Core LLM call infrastructure
- pdf_pipeline: Orchestration using this module (Step 4)
"""
import json
import logging
import re
from typing import Any, Dict, Optional
from .llm_structurer import (
_clean_json_string,
_get_default_mistral_model,
_get_default_model,
call_llm,
)
from .types import LLMProvider
logger: logging.Logger = logging.getLogger(__name__)
def _extract_json_from_response(text: str) -> Dict[str, Any]:
    """Extract JSON data from an LLM response string.

    Two strategies are tried in order:
        1. Content between <JSON></JSON> tags (preferred format).
        2. The widest {...} span found in the response.

    JSON string cleaning is applied before parsing to cope with common
    LLM quirks (control characters, stray whitespace).

    Args:
        text: Raw LLM response text that may contain JSON data.

    Returns:
        Parsed JSON as a dictionary, or an empty dict when no valid
        JSON could be extracted.

    Example:
        >>> _extract_json_from_response('<JSON>{"title": "Test"}</JSON>')
        {'title': 'Test'}
        >>> _extract_json_from_response('metadata: {"title": "Test"}')
        {'title': 'Test'}
    """
    # Strategy 1: explicit <JSON></JSON> tags.
    tagged: Optional[re.Match[str]] = re.search(r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL)
    if tagged is not None:
        candidate: str = _clean_json_string(tagged.group(1))
        try:
            parsed: Dict[str, Any] = json.loads(candidate)
            return parsed
        except json.JSONDecodeError:
            pass  # fall through to the brace-based strategy
    # Strategy 2: widest brace-delimited span.
    opening: int = text.find("{")
    closing: int = text.rfind("}")
    if opening != -1 and closing > opening:
        candidate = _clean_json_string(text[opening:closing + 1])
        try:
            parsed = json.loads(candidate)
            return parsed
        except json.JSONDecodeError as e:
            logger.warning(f"JSON invalide: {e}")
    return {}
def extract_metadata(
    markdown: str,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.1,
    max_chars: int = 6000,
) -> Dict[str, Any]:
    """Extract bibliographic metadata from a document using an LLM.

    Analyzes the beginning of a document (typically first few pages) to extract
    bibliographic metadata including title, author, publisher, year, and more.
    Uses a structured prompt that guides the LLM to distinguish between
    document title vs. collection name vs. publisher name.

    The LLM is instructed to return confidence scores for extracted fields,
    allowing downstream processing to handle uncertain extractions appropriately.

    Args:
        markdown: Document text in Markdown format. For best results, provide
            at least the first 2-3 pages containing title page and colophon.
        model: LLM model name to use. If None, uses the default model for the
            selected provider.
        provider: LLM provider, "ollama" (local, free) or "mistral" (API, paid).
        temperature: Model temperature. Lower values (0.0-0.3) produce more
            consistent, deterministic results. Default 0.1.
        max_chars: Maximum number of characters sent to the LLM. Longer
            documents are truncated. Default 6000 (~2 pages).

    Returns:
        Dictionary with keys: title, author, collection, publisher, year,
        doi, isbn (all possibly None), language (default "fr"),
        confidence (dict of per-field scores, always present, possibly
        empty), and error (message string, only on failure).

    Raises:
        No exceptions are raised; errors are captured in the return dict.

    Note:
        - Cost for Mistral API: ~0.002€ per call (6000 chars input)
        - The prompt is in French as most processed documents are French texts
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    # Metadata usually sits on the first pages: keep only the head of the text.
    content: str = markdown[:max_chars]
    if len(markdown) > max_chars:
        content += "\n\n[... document tronqué ...]"
    prompt: str = f"""Tu es un expert en bibliographie et édition scientifique.
TÂCHE: Extraire les métadonnées bibliographiques de ce document.
ATTENTION - PIÈGES COURANTS:
- Le titre n'est PAS forcément le premier titre H1 (peut être le nom de la collection)
- Le sous-titre fait partie du titre
- L'auteur peut apparaître sous le titre, dans les métadonnées éditeur, ou ailleurs
- Distingue bien: titre de l'œuvre ≠ nom de la collection/série ≠ nom de l'éditeur
INDICES POUR TROUVER LE VRAI TITRE:
- Souvent en plus grand / plus visible
- Accompagné du nom de l'auteur juste après
- Répété sur la page de garde et la page de titre
- Peut contenir un sous-titre après ":"
IMPORTANT - FORMAT DES DONNÉES:
- N'ajoute JAMAIS d'annotations comme "(correct)", "(à confirmer)", "(possiblement)", etc.
- Retourne uniquement les noms propres et titres sans commentaires
- NE METS PAS de phrases comme "À confirmer avec...", "Vérifier si...", "Possiblement..."
- Le champ "confidence" sert à exprimer ton niveau de certitude
- Si tu n'es pas sûr du titre, mets le titre le plus probable ET un confidence faible
- EXEMPLE CORRECT: "title": "La pensée-signe" avec "confidence": {{"title": 0.6}}
- EXEMPLE INCORRECT: "title": "À confirmer avec le titre exact"
RÉPONDS UNIQUEMENT avec un JSON entre balises <JSON></JSON>:
<JSON>
{{
"title": "Le vrai titre de l'œuvre (avec sous-titre si présent)",
"author": "Prénom Nom de l'auteur principal",
"collection": "Nom de la collection ou série (null si absent)",
"publisher": "Nom de l'éditeur",
"year": 2023,
"doi": "10.xxxx/xxxxx (null si absent)",
"isbn": "978-x-xxxx-xxxx-x (null si absent)",
"language": "fr",
"confidence": {{
"title": 0.95,
"author": 0.90
}}
}}
</JSON>
DOCUMENT À ANALYSER:
{content}
Réponds UNIQUEMENT avec le JSON."""
    logger.info(f"Extraction métadonnées via {provider.upper()} ({model})")
    try:
        response: str = call_llm(prompt, model=model, provider=provider, temperature=temperature)
        metadata: Dict[str, Any] = _extract_json_from_response(response)
        # Fill in defaults for fields the LLM omitted or left empty.
        # FIX: annotated Dict[str, Any] (was Dict[str, Optional[str]]) —
        # values are heterogeneous (str, int, None).
        defaults: Dict[str, Any] = {
            "title": None,
            "author": None,
            "collection": None,
            "publisher": None,
            "year": None,
            "doi": None,
            "isbn": None,
            "language": "fr",
        }
        for key, default in defaults.items():
            if key not in metadata or metadata[key] == "":
                metadata[key] = default
        # Normalize literal "null"/"None" strings produced by some models.
        for key in metadata:
            if metadata[key] == "null" or metadata[key] == "None":
                metadata[key] = None
        # FIX: guarantee the documented "confidence" mapping is always present
        # (previously absent when the LLM omitted it).
        if not isinstance(metadata.get("confidence"), dict):
            metadata["confidence"] = {}
        logger.info(f"Métadonnées extraites: titre='{metadata.get('title')}', auteur='{metadata.get('author')}'")
        return metadata
    except Exception as e:
        logger.error(f"Erreur extraction métadonnées: {e}")
        return {
            "title": None,
            "author": None,
            "collection": None,
            "publisher": None,
            "year": None,
            "doi": None,
            "isbn": None,
            "language": "fr",
            "confidence": {},
            "error": str(e),
        }

View File

@@ -0,0 +1,583 @@
"""Structuration de documents via LLM (Ollama ou Mistral API)."""
from __future__ import annotations
import json
import logging
import os
import re
import time
from typing import Any, Dict, List, Optional, TypedDict, Union, cast
import requests
from dotenv import load_dotenv
import threading
# Import type definitions from central types module
from utils.types import LLMCostStats
# Charger les variables d'environnement
load_dotenv()
# Module logger; install a basic root configuration only when the host
# application has not configured logging itself (hasHandlers() is False),
# so library users keep full control over handlers.
logger: logging.Logger = logging.getLogger(__name__)
if not logging.getLogger().hasHandlers():
    logging.basicConfig(
        level=logging.INFO,
        format="[%(asctime)s] %(levelname)s %(message)s"
    )
class LLMStructureError(RuntimeError):
    """Raised when LLM-based document structuring fails (call, timeout, or parse errors)."""
    pass
# ═══════════════════════════════════════════════════════════════════════════════
# TypedDict Definitions
# ═══════════════════════════════════════════════════════════════════════════════
class MistralPricingEntry(TypedDict):
    """Mistral API pricing per million tokens (EUR)."""
    input: float  # price per 1M prompt (input) tokens
    output: float  # price per 1M completion (output) tokens
class LLMHierarchyPath(TypedDict, total=False):
    """Hierarchy path in structured output.

    All keys are optional (total=False); a missing or None level means
    the element is not nested that deep.
    """
    part: Optional[str]  # top-level part title
    chapter: Optional[str]  # chapter title within the part
    section: Optional[str]  # section title within the chapter
    subsection: Optional[str]  # subsection title within the section
class LLMChunkOutput(TypedDict, total=False):
    """Single chunk in LLM structured output."""
    chunk_id: str  # identifier such as "chunk_00001" (see prompt format)
    text: str  # chunk content
    hierarchy: LLMHierarchyPath  # position of the chunk in the document
    type: str  # e.g. "main_content"
    is_toc: bool  # True when the chunk belongs to a table of contents
class LLMDocumentSection(TypedDict, total=False):
    """Document section in structured output."""
    path: LLMHierarchyPath  # hierarchical location of the section
    type: str  # e.g. "main_content"
    page_start: int  # first page of the section
    page_end: int  # last page of the section
class LLMStructuredResult(TypedDict, total=False):
    """Result from LLM document structuring."""
    document_structure: List[LLMDocumentSection]  # hierarchical outline
    chunks: List[LLMChunkOutput]  # flat list of content chunks
class OllamaResultContainer(TypedDict):
    """Container for an Ollama call result shared with a worker thread (internal use)."""
    response: Optional[str]  # LLM response text, None until available
    error: Optional[Exception]  # exception raised by the worker, if any
    done: bool  # True once the worker thread has finished
# ═══════════════════════════════════════════════════════════════════════════════
# Configuration
# ═══════════════════════════════════════════════════════════════════════════════
def _get_ollama_url() -> str:
"""Retourne l'URL de base d'Ollama."""
return os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
def _get_default_model() -> str:
"""Retourne le modèle LLM par défaut."""
return os.getenv("STRUCTURE_LLM_MODEL", "qwen2.5:7b")
def _get_mistral_api_key() -> Optional[str]:
"""Retourne la clé API Mistral."""
return os.getenv("MISTRAL_API_KEY")
def _get_default_mistral_model() -> str:
"""Retourne le modèle Mistral par défaut pour les tâches LLM."""
return os.getenv("MISTRAL_LLM_MODEL", "mistral-small-latest")
# ═══════════════════════════════════════════════════════════════════════════════
# Appel Mistral API (rapide, cloud) avec tracking des coûts
# ═══════════════════════════════════════════════════════════════════════════════
# Mistral API prices in EUR per million tokens.
MISTRAL_PRICING: Dict[str, MistralPricingEntry] = {
    "mistral-small-latest": {"input": 0.2, "output": 0.6},
    "mistral-medium-latest": {"input": 0.8, "output": 2.4},
    "mistral-large-latest": {"input": 2.0, "output": 6.0},
    # Fallback entry used for any model not listed above.
    "default": {"input": 0.5, "output": 1.5},
}
# Global cost accumulator; thread-local so concurrent pipelines do not
# corrupt each other's running totals.
_cost_tracker: threading.local = threading.local()
def reset_llm_cost() -> None:
    """Reset the accumulated LLM cost statistics for the current thread to zero."""
    _cost_tracker.total_cost = 0.0
    # Integer counters all reset the same way.
    for counter in ("total_input_tokens", "total_output_tokens", "calls_count"):
        setattr(_cost_tracker, counter, 0)
def get_llm_cost() -> LLMCostStats:
    """Return a snapshot of the LLM cost statistics accumulated for this thread.

    Missing attributes (tracker never initialised) fall back to zero values.
    """
    snapshot: Dict[str, Any] = {
        "total_cost": getattr(_cost_tracker, "total_cost", 0.0),
        "total_input_tokens": getattr(_cost_tracker, "total_input_tokens", 0),
        "total_output_tokens": getattr(_cost_tracker, "total_output_tokens", 0),
        "calls_count": getattr(_cost_tracker, "calls_count", 0),
    }
    return cast(LLMCostStats, snapshot)
def _calculate_mistral_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Return the cost in euros of a single Mistral API call.

    Unknown models fall back to the "default" pricing entry.
    """
    rates: MistralPricingEntry = MISTRAL_PRICING.get(model, MISTRAL_PRICING["default"])
    weighted_tokens: float = input_tokens * rates["input"] + output_tokens * rates["output"]
    return weighted_tokens / 1_000_000
def _call_mistral_api(
    prompt: str,
    model: str = "mistral-small-latest",
    temperature: float = 0.2,
    max_tokens: int = 4096,
    timeout: int = 120,
) -> str:
    """Call the Mistral chat-completions API and return the response text.

    Available models (fastest to most powerful):
    - mistral-small-latest: fast, cheap (~0.2 EUR/M input tokens)
    - mistral-medium-latest: balanced (~0.8 EUR/M input tokens)
    - mistral-large-latest: powerful (~2 EUR/M input tokens)

    Args:
        prompt: Prompt to send (single user message).
        model: Mistral model name.
        temperature: Sampling temperature (0-1).
        max_tokens: Maximum number of completion tokens.
        timeout: HTTP timeout in seconds.

    Returns:
        Text content of the LLM response.

    Raises:
        LLMStructureError: On missing API key, timeout, HTTP error, or any
            other failure while calling the API.
    """
    api_key: Optional[str] = _get_mistral_api_key()
    if not api_key:
        raise LLMStructureError("MISTRAL_API_KEY non définie dans .env")
    logger.info(f"Appel Mistral API - modèle: {model}")
    url: str = "https://api.mistral.ai/v1/chat/completions"
    headers: Dict[str, str] = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    payload: Dict[str, Any] = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    try:
        start: float = time.time()
        response: requests.Response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        elapsed: float = time.time() - start
        response.raise_for_status()
        data: Dict[str, Any] = response.json()
        # NOTE(review): assumes at least one entry in "choices"; an empty list
        # raises IndexError here, converted to LLMStructureError below.
        content: str = data.get("choices", [{}])[0].get("message", {}).get("content", "")
        usage: Dict[str, Any] = data.get("usage", {})
        input_tokens: int = usage.get("prompt_tokens", 0)
        output_tokens: int = usage.get("completion_tokens", 0)
        # Compute this call's cost (EUR, per MISTRAL_PRICING) and accumulate it.
        call_cost: float = _calculate_mistral_cost(model, input_tokens, output_tokens)
        # Lazily initialise the thread-local tracker on first use in this thread.
        if not hasattr(_cost_tracker, "total_cost"):
            reset_llm_cost()
        _cost_tracker.total_cost += call_cost
        _cost_tracker.total_input_tokens += input_tokens
        _cost_tracker.total_output_tokens += output_tokens
        _cost_tracker.calls_count += 1
        # call_cost is in EUR; the log line prints the bare number.
        logger.info(f"Mistral API terminé en {elapsed:.1f}s - {input_tokens}+{output_tokens} tokens = {call_cost:.6f}")
        return content
    except requests.exceptions.Timeout:
        raise LLMStructureError(f"Timeout Mistral API ({timeout}s)")
    except requests.exceptions.HTTPError as e:
        raise LLMStructureError(f"Erreur HTTP Mistral: {e}")
    except Exception as e:
        raise LLMStructureError(f"Erreur Mistral API: {e}")
def _prepare_prompt(
    markdown: str,
    hierarchy: Dict[str, Any],
    max_chars: int = 8000,
) -> str:
    """Build the structuring prompt sent to the LLM.

    Args:
        markdown: Markdown text of the document (truncated to ``max_chars``).
        hierarchy: Initial hierarchical structure (from build_hierarchy).
        max_chars: Maximum number of Markdown characters to embed.

    Returns:
        Prompt string instructing the LLM to answer with JSON between
        <JSON></JSON> tags ("document_structure" + "chunks").
    """
    # Truncate the Markdown when it exceeds the character budget.
    truncated: str = markdown[:max_chars]
    if len(markdown) > max_chars:
        truncated += f"\n\n... [tronqué à {max_chars} caractères]"
    # Serialize the initial hierarchy as pretty-printed JSON for the prompt.
    outline_json: str = json.dumps(hierarchy, ensure_ascii=False, indent=2)
    prompt: str = f"""Tu es un expert en édition scientifique chargé d'analyser la structure logique d'un document.
IMPORTANT: Réponds UNIQUEMENT avec un objet JSON valide. Pas de texte avant ou après.
À partir du Markdown OCRisé et d'un premier découpage hiérarchique, tu dois :
1. Identifier les parties liminaires (préface, introduction...), le corps du document (parties, chapitres, sections) et les parties finales (conclusion, annexes, bibliographie...).
2. Reconstruire l'organisation réelle du texte.
3. Produire un JSON avec :
- "document_structure": vue hiérarchique du document
- "chunks": liste des chunks avec chunk_id, text, hierarchy, type
FORMAT DE RÉPONSE (entre balises <JSON></JSON>):
<JSON>
{{
"document_structure": [
{{
"path": {{"part": "Titre"}},
"type": "main_content",
"page_start": 1,
"page_end": 10
}}
],
"chunks": [
{{
"chunk_id": "chunk_00001",
"text": "Contenu...",
"hierarchy": {{
"part": "Titre partie",
"chapter": "Titre chapitre",
"section": null,
"subsection": null
}},
"type": "main_content",
"is_toc": false
}}
]
}}
</JSON>
### Hiérarchie initiale
{outline_json}
### Markdown OCR
{truncated}
Réponds UNIQUEMENT avec le JSON entre <JSON> et </JSON>."""
    return prompt.strip()
def _call_ollama(
    prompt: str,
    model: str,
    base_url: Optional[str] = None,
    temperature: float = 0.2,
    timeout: int = 300,
) -> str:
    """Call Ollama to generate a response.

    Tries the ``ollama`` Python SDK first (with a thread-based timeout,
    since the SDK exposes none), then falls back to the raw HTTP API with
    retry and exponential backoff.

    Args:
        prompt: Prompt to send.
        model: Ollama model name.
        base_url: Ollama base URL (env/localhost default when None; only
            used on the HTTP fallback path).
        temperature: Model temperature.
        timeout: Timeout in seconds, applied to the SDK join and to each
            HTTP attempt.

    Returns:
        Text response from the LLM.

    Raises:
        LLMStructureError: When both the SDK path and the HTTP fallback fail.
    """
    # First try the ollama SDK.
    try:
        import ollama
        logger.info(f"Appel Ollama SDK - modèle: {model}, timeout: {timeout}s")
        # The ollama SDK has no timeout parameter, so the call runs in a
        # daemon thread joined with a deadline; results come back through
        # result_container.
        result_container: OllamaResultContainer = {"response": None, "error": None, "done": False}
        def _run_ollama_call() -> None:
            # Worker executed in the background thread.
            try:
                resp: Any
                if hasattr(ollama, "generate"):
                    resp = ollama.generate(
                        model=model,
                        prompt=prompt,
                        stream=False,
                        options={"temperature": temperature}
                    )
                    # Normalise the various SDK return shapes to a string.
                    if isinstance(resp, dict):
                        result_container["response"] = resp.get("response", json.dumps(resp))
                    elif hasattr(resp, "response"):
                        result_container["response"] = resp.response
                    else:
                        result_container["response"] = str(resp)
                else:
                    # Older SDKs: fall back to the chat endpoint.
                    resp = ollama.chat(
                        model=model,
                        messages=[{"role": "user", "content": prompt}],
                        options={"temperature": temperature}
                    )
                    if isinstance(resp, dict):
                        result_container["response"] = resp.get("message", {}).get("content", str(resp))
                    else:
                        result_container["response"] = str(resp)
                result_container["done"] = True
            except Exception as e:
                result_container["error"] = e
                result_container["done"] = True
        thread: threading.Thread = threading.Thread(target=_run_ollama_call, daemon=True)
        thread.start()
        thread.join(timeout=timeout)
        # NOTE: the daemon thread keeps running after a timeout; only the
        # join gives up. The raises below are caught by the broad
        # ``except Exception`` handler, so SDK failures (including this
        # timeout) fall through to the HTTP path instead of propagating.
        if not result_container["done"]:
            raise LLMStructureError(f"Timeout Ollama SDK après {timeout}s (modèle: {model})")
        if result_container["error"]:
            raise result_container["error"]
        if result_container["response"]:
            return result_container["response"]
        raise LLMStructureError("Aucune réponse du SDK Ollama")
    except ImportError:
        logger.info("SDK ollama non disponible, utilisation de l'API HTTP")
    except Exception as e:
        logger.warning(f"Erreur SDK ollama: {e}, fallback HTTP")
    # HTTP fallback.
    base: str = base_url or _get_ollama_url()
    url: str = f"{base.rstrip('/')}/api/generate"
    payload: Dict[str, Any] = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": temperature},
    }
    # Retry with exponential backoff (1s, then 2s).
    max_retries: int = 2
    backoff: float = 1.0
    for attempt in range(max_retries + 1):
        try:
            logger.info(f"Appel HTTP Ollama (tentative {attempt + 1})")
            response: requests.Response = requests.post(url, json=payload, timeout=timeout)
            if response.status_code != 200:
                # Non-200 is not retried: raised as LLMStructureError, which
                # the except below (requests.RequestException) does not catch.
                raise LLMStructureError(
                    f"Erreur Ollama ({response.status_code}): {response.text}"
                )
            data: Dict[str, Any] = response.json()
            if "response" not in data:
                raise LLMStructureError(f"Réponse Ollama inattendue: {data}")
            return cast(str, data["response"])
        except requests.RequestException as e:
            if attempt < max_retries:
                time.sleep(backoff)
                backoff *= 2
                continue
            raise LLMStructureError(f"Impossible de contacter Ollama: {e}") from e
    raise LLMStructureError("Échec après plusieurs tentatives")
# ═══════════════════════════════════════════════════════════════════════════════
# Fonction générique d'appel LLM
# ═══════════════════════════════════════════════════════════════════════════════
def call_llm(
    prompt: str,
    model: Optional[str] = None,
    provider: str = "ollama",
    temperature: float = 0.2,
    timeout: int = 300,
) -> str:
    """Dispatch a prompt to an LLM backend (local Ollama or Mistral API).

    Args:
        prompt: Prompt to send.
        model: Model name; the provider's default is used when None.
        provider: "ollama" (local, slow, free) or "mistral" (API, fast, paid).
        temperature: Model temperature.
        timeout: Timeout in seconds.

    Returns:
        Text response from the LLM.
    """
    if provider == "mistral":
        # Cloud path: fast, billed per token.
        return _call_mistral_api(
            prompt,
            model=model or _get_default_mistral_model(),
            temperature=temperature,
            timeout=timeout,
        )
    # Local path: slower but free.
    return _call_ollama(
        prompt,
        model=model or _get_default_model(),
        temperature=temperature,
        timeout=timeout,
    )
def _clean_json_string(json_str: str) -> str:
"""Nettoie une chaîne JSON des caractères de contrôle invalides.
Stratégie robuste : Remplace TOUS les caractères de contrôle (x00-x1f)
par des espaces, puis réduit les espaces multiples. Cela évite les erreurs
"Invalid control character" de json.loads().
"""
# Remplacer tous les caractères de contrôle par des espaces
cleaned: str = re.sub(r'[\x00-\x1f]', ' ', json_str)
# Réduire les espaces multiples
cleaned = re.sub(r'\s+', ' ', cleaned)
return cleaned
def _extract_json(text: str) -> LLMStructuredResult:
    """Extract the structured JSON payload from an LLM response.

    Args:
        text: Raw textual response from the LLM.

    Returns:
        Parsed JSON dictionary containing at least a "chunks" key.

    Raises:
        LLMStructureError: If no JSON is found, if the JSON is invalid, or
            if the parsed object lacks a "chunks" key.
    """
    # Preferred path: payload between <JSON> and </JSON> tags.
    json_start: int = text.find("<JSON>")
    json_end: int = text.find("</JSON>")
    if json_start != -1 and json_end != -1 and json_end > json_start:
        json_content: str = text[json_start + 6:json_end].strip()  # 6 == len("<JSON>")
        json_content = _clean_json_string(json_content)
        try:
            result: Dict[str, Any] = json.loads(json_content)
            if "chunks" not in result:
                # Propagates: the except below only catches JSONDecodeError.
                raise LLMStructureError(
                    f"JSON sans clé 'chunks'. Clés: {list(result.keys())}"
                )
            return cast(LLMStructuredResult, result)
        except json.JSONDecodeError:
            pass  # fall through to the brace-based fallback below
    # Fallback: widest {...} span in the response.
    start: int = text.find("{")
    end: int = text.rfind("}")
    if start == -1 or end == -1 or end <= start:
        raise LLMStructureError(
            f"Pas de JSON trouvé dans la réponse.\nDébut: {text[:500]}"
        )
    json_str: str = _clean_json_string(text[start:end + 1])
    try:
        result = json.loads(json_str)
        if "chunks" not in result:
            raise LLMStructureError(
                f"JSON sans clé 'chunks'. Clés: {list(result.keys())}"
            )
        return cast(LLMStructuredResult, result)
    except json.JSONDecodeError as e:
        raise LLMStructureError(f"JSON invalide: {e}\nContenu: {json_str[:500]}") from e
def structure_with_llm(
    markdown: str,
    hierarchy: Dict[str, Any],
    model: Optional[str] = None,
    base_url: Optional[str] = None,
    temperature: float = 0.2,
    max_chars: int = 8000,
    timeout: int = 300,
) -> LLMStructuredResult:
    """Refine a document's structure by delegating to an Ollama model.

    Args:
        markdown: Markdown text of the document.
        hierarchy: Initial hierarchical structure (from build_hierarchy).
        model: Ollama model to use (env default when None).
        base_url: Ollama base URL.
        temperature: Model temperature.
        max_chars: Maximum number of Markdown characters embedded in the prompt.
        timeout: Timeout in seconds.

    Returns:
        Improved structure with "document_structure" and "chunks".

    Raises:
        LLMStructureError: On call failure or unparsable response.
    """
    chosen_model: str = model or _get_default_model()
    logger.info(f"Structuration LLM - modèle: {chosen_model}")
    # Build the prompt, query the local LLM, then parse its JSON answer.
    llm_prompt: str = _prepare_prompt(markdown, hierarchy, max_chars)
    raw_answer: str = _call_ollama(
        llm_prompt,
        model=chosen_model,
        base_url=base_url,
        temperature=temperature,
        timeout=timeout,
    )
    return _extract_json(raw_answer)

View File

@@ -0,0 +1,420 @@
"""LLM-based Table of Contents (TOC) extraction module.
This module provides functionality to extract hierarchical table of contents
from markdown documents using Large Language Models. It intelligently parses
document structure and creates both hierarchical and flat representations
of the TOC.
Key Features:
- Hierarchical TOC extraction with chapters, sections, and subsections
- Flat TOC generation with full paths for navigation
- Content-to-TOC matching for associating sections with TOC entries
- Support for multiple LLM providers (Ollama local, Mistral API)
TOC Structure Levels:
- Level 1: Introduction, main chapters, Conclusion, Bibliography
- Level 2: Sections listed under a chapter (same visual level)
- Level 3: Only if explicit indentation or subsection visible
Typical Usage:
>>> from utils.llm_toc import extract_toc
>>> result = extract_toc(
... markdown=document_text,
... document_title="The Republic",
... provider="ollama"
... )
>>> print(result["toc"]) # Hierarchical structure
[
{
"title": "Introduction",
"level": 1,
"children": []
},
{
"title": "Book I: Justice",
"level": 1,
"chapter_number": 1,
"children": [
{"title": "The Nature of Justice", "level": 2, "children": []}
]
}
]
>>> print(result["flat_toc"]) # Flat list with paths
[
{"title": "Introduction", "level": 1, "path": "Introduction"},
{"title": "Book I: Justice", "level": 1, "path": "Book I: Justice"},
{
"title": "The Nature of Justice",
"level": 2,
"path": "Book I: Justice > The Nature of Justice"
}
]
LLM Provider Options:
- "ollama": Local processing, free but slower
- "mistral": Cloud API, faster but incurs costs
Note:
For documents without a clear TOC (short articles, book reviews),
the module returns an empty TOC list rather than inventing structure.
See Also:
- llm_metadata: Document metadata extraction
- llm_classifier: Section classification
- toc_extractor: Non-LLM TOC extraction alternatives
"""
import json
import logging
import re
from typing import cast, Any, Dict, List, Optional
from .llm_structurer import (
_clean_json_string,
_get_default_mistral_model,
_get_default_model,
call_llm,
)
from .types import FlatTOCEntry, LLMProvider, TOCEntry, TOCResult
logger: logging.Logger = logging.getLogger(__name__)
def _extract_json_from_response(text: str) -> Dict[str, Any]:
    """Pull a JSON object out of an LLM reply.

    Two strategies are attempted in order:
        1. Content between <JSON></JSON> tags.
        2. The widest {...} span found in the reply.

    Args:
        text: Raw LLM response text that may contain JSON.

    Returns:
        Parsed JSON dictionary, or ``{"toc": []}`` when neither strategy
        yields valid JSON.
    """
    tag_match: Optional[re.Match[str]] = re.search(r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL)
    if tag_match is not None:
        try:
            result: Dict[str, Any] = json.loads(_clean_json_string(tag_match.group(1)))
            return result
        except json.JSONDecodeError:
            pass  # fall back to the brace-based search
    opening: int = text.find("{")
    closing: int = text.rfind("}")
    if opening != -1 and closing > opening:
        try:
            result = json.loads(_clean_json_string(text[opening:closing + 1]))
            return result
        except json.JSONDecodeError as e:
            logger.warning(f"JSON invalide: {e}")
    return {"toc": []}
def extract_toc(
    markdown: str,
    document_title: Optional[str] = None,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.1,
) -> Dict[str, Any]:
    r"""Extract a structured table of contents from a document using LLM.

    Analyzes markdown content to identify the document's hierarchical
    structure and generates both a nested TOC (with children) and a
    flat TOC (with navigation paths).

    Args:
        markdown: Complete markdown text of the document to analyze.
        document_title: Optional title of the document for context.
            Helps the LLM better understand the document structure.
        model: LLM model name to use. If None, uses the default model
            for the specified provider.
        provider: LLM provider to use. Either "ollama" for local
            processing or "mistral" for cloud API.
        temperature: Model temperature for response generation.
            Lower values (0.1) produce more consistent results.

    Returns:
        A dictionary containing:
            - toc: Hierarchical list of TOC entries, each with:
                - title: Section title
                - level: Hierarchy level (1, 2, or 3)
                - chapter_number: Optional chapter number
                - children: List of nested TOC entries
            - flat_toc: Flat list of all TOC entries with paths:
                - title: Section title
                - level: Hierarchy level
                - path: Full navigation path (e.g., "Chapter 1 > Section 1")
            - error: Error message string (only if extraction failed)

    Raises:
        No exceptions are raised; errors are captured in the return dict.

    Example:
        >>> result = extract_toc(
        ...     markdown="# Introduction\n...\n# Chapter 1\n## Section 1.1",
        ...     document_title="My Book",
        ...     provider="ollama"
        ... )
        >>> len(result["toc"])
        2
        >>> result["toc"][0]["title"]
        'Introduction'

    Note:
        - Documents longer than 12,000 characters are truncated
        - Short articles without clear TOC return empty lists
        - The LLM is instructed to never invent structure
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    # Truncate overly long documents while keeping the leading sections,
    # which typically contain the explicit table of contents.
    max_chars: int = 12000
    content: str = markdown[:max_chars]
    if len(markdown) > max_chars:
        content += "\n\n[... suite du document ...]"
    title_context: str = f"Titre du document: {document_title}\n" if document_title else ""
    # Prompt kept in French to match the corpus language; literal braces in
    # the JSON examples are escaped by doubling them inside the f-string.
    prompt: str = f"""Tu es un expert en structuration de documents académiques.
TÂCHE: Extraire la table des matières FIDÈLE au document fourni.
{title_context}
⚠️ RÈGLES CRITIQUES:
1. **ANALYSER LE DOCUMENT RÉEL** - Ne JAMAIS copier les exemples ci-dessous!
2. **DOCUMENTS SANS TOC** - Si le document est un article court, une revue de livre, ou n'a pas de table des matières explicite, retourner {{"toc": []}}
3. **RESPECTER LA STRUCTURE PLATE** - Ne pas inventer de hiérarchie entre des lignes au même niveau
4. **IGNORER** - Métadonnées éditoriales (DOI, ISBN, éditeur, copyright, numéros de page)
NIVEAUX DE STRUCTURE:
- level 1: Introduction, Chapitres principaux, Conclusion, Bibliographie
- level 2: Sections listées sous un chapitre (même niveau visuel)
- level 3: UNIQUEMENT si indentation ou sous-titre explicite visible
FORMAT DE RÉPONSE (JSON entre balises <JSON></JSON>):
Pour un livre avec TOC:
<JSON>
{{
"toc": [
{{
"title": "Titre Chapitre 1",
"level": 1,
"chapter_number": 1,
"children": [
{{"title": "Section 1.1", "level": 2, "children": []}},
{{"title": "Section 1.2", "level": 2, "children": []}}
]
}}
]
}}
</JSON>
Pour un article SANS TOC (revue de livre, article court, etc.):
<JSON>
{{
"toc": []
}}
</JSON>
⚠️ NE PAS COPIER CES EXEMPLES ! Analyser uniquement le DOCUMENT RÉEL ci-dessous.
DOCUMENT À ANALYSER:
{content}
Réponds UNIQUEMENT avec le JSON correspondant à CE document (pas aux exemples)."""
    logger.info(f"Extraction TOC via {provider.upper()} ({model})")
    try:
        response: str = call_llm(prompt, model=model, provider=provider, temperature=temperature, timeout=360)
        result: Dict[str, Any] = _extract_json_from_response(response)
        toc: List[Dict[str, Any]] = result.get("toc", [])
        # Derive the flat version of the TOC (entries carrying full paths).
        flat_toc: List[Dict[str, Any]] = _flatten_toc(toc)
        logger.info(f"TOC extraite: {len(toc)} entrées niveau 1, {len(flat_toc)} entrées totales")
        return {
            "toc": toc,
            "flat_toc": flat_toc,
        }
    except Exception as e:
        # Fail-soft: callers always receive the same dict shape, with the
        # failure recorded under "error".
        logger.error(f"Erreur extraction TOC: {e}")
        return {
            "toc": [],
            "flat_toc": [],
            "error": str(e),
        }
def _flatten_toc(
toc: List[Dict[str, Any]],
parent_path: str = "",
result: Optional[List[Dict[str, Any]]] = None
) -> List[Dict[str, Any]]:
"""Flatten a hierarchical TOC into a list with navigation paths.
Recursively traverses a nested TOC structure and produces a flat
list where each entry includes its full path from the root.
Args:
toc: Hierarchical TOC list with nested children.
parent_path: Path accumulated from parent entries. Used
internally during recursion.
result: Accumulator list for results. Used internally
during recursion.
Returns:
A flat list of TOC entries, each containing:
- title: The section title
- level: Hierarchy level (1, 2, or 3)
- path: Full navigation path (e.g., "Chapter > Section")
- chapter_number: Optional chapter number if present
Example:
>>> hierarchical_toc = [
... {
... "title": "Chapter 1",
... "level": 1,
... "children": [
... {"title": "Section 1.1", "level": 2, "children": []}
... ]
... }
... ]
>>> flat = _flatten_toc(hierarchical_toc)
>>> flat[0]["path"]
'Chapter 1'
>>> flat[1]["path"]
'Chapter 1 > Section 1.1'
"""
if result is None:
result = []
for item in toc:
title: str = item.get("title", "")
level: int = item.get("level", 1)
# Construire le chemin
path: str
if parent_path:
path = f"{parent_path} > {title}"
else:
path = title
result.append({
"title": title,
"level": level,
"path": path,
"chapter_number": item.get("chapter_number"),
})
# Récursion sur les enfants
children: List[Dict[str, Any]] = item.get("children", [])
if children:
_flatten_toc(children, path, result)
return result
def match_content_to_toc(
    content_sections: List[Dict[str, Any]],
    flat_toc: List[Dict[str, Any]],
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
) -> List[Dict[str, Any]]:
    """Associate extracted content sections with TOC entries via LLM.

    Sends both title lists to the LLM and asks for, per content section,
    the 0-based index of the matching TOC entry (-1 when none). Each
    section then receives a "toc_match" key holding either the matched
    flat-TOC entry dict or None.

    Args:
        content_sections: Sections extracted from the document; each
            should carry a "title" key. Only the first 30 titles are
            submitted to the LLM to bound cost.
        flat_toc: Flat TOC list as returned by extract_toc()["flat_toc"].
        model: LLM model name; defaults to the provider's default model.
        provider: "ollama" (local) or "mistral" (cloud API).

    Returns:
        The same content_sections list, annotated in place with
        "toc_match". On LLM failure the list is returned unannotated.
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    # Compact title lists for the prompt (content capped at 30 titles).
    toc_titles: List[str] = [entry["title"] for entry in flat_toc]
    section_titles: List[str] = [section.get("title", "") for section in content_sections[:30]]
    prompt: str = f"""Tu dois associer les sections de contenu aux entrées de la table des matières.
TABLE DES MATIÈRES:
{json.dumps(toc_titles, ensure_ascii=False, indent=2)}
SECTIONS DE CONTENU:
{json.dumps(section_titles, ensure_ascii=False, indent=2)}
Pour chaque section de contenu, indique l'index (0-based) de l'entrée TOC correspondante.
Si pas de correspondance, indique -1.
RÉPONDS avec un JSON:
<JSON>
{{
"matches": [0, 1, 2, -1, 3, ...]
}}
</JSON>
"""
    try:
        response: str = call_llm(prompt, model=model, provider=provider, temperature=0.1)
        payload: Dict[str, Any] = _extract_json_from_response(response)
        matches: List[int] = payload.get("matches", [])
        # Annotate every section; unmatched or out-of-range indices -> None.
        for idx, section in enumerate(content_sections):
            match_idx: int = matches[idx] if idx < len(matches) else -1
            section["toc_match"] = flat_toc[match_idx] if 0 <= match_idx < len(flat_toc) else None
        return content_sections
    except Exception as exc:
        logger.warning(f"Erreur correspondance TOC: {exc}")
        return content_sections

View File

@@ -0,0 +1,513 @@
"""Document validation and enrichment using Large Language Models.
This module provides comprehensive validation, correction, and enrichment
functionality for parsed documents. It uses LLMs to verify document coherence,
detect inconsistencies, suggest corrections, and extract key concepts from
text chunks.
Overview:
The module performs three main functions:
1. **Document Validation** (validate_document):
Verifies the coherence of parsed documents by checking metadata,
table of contents, and chunk content quality. Returns detailed
validation results with issues, corrections, and confidence scores.
2. **Content Enrichment** (enrich_chunks_with_concepts, generate_section_summary):
Enhances document content by extracting key philosophical concepts
from chunks and generating concise summaries for sections.
3. **Correction Application** (apply_corrections, clean_validation_annotations):
Applies suggested corrections from validation results and cleans
LLM-generated annotation artifacts from text.
Validation Criteria:
The validator checks several aspects of document quality:
- **Metadata Quality**: Verifies title and author are correctly identified
(not collection names, not "Unknown" when visible in text)
- **TOC Coherence**: Checks for duplicates, proper ordering, completeness
- **Chunk Content**: Ensures chunks contain substantive content, not just
metadata fragments or headers
Validation Result Structure:
The ValidationResult TypedDict contains:
- valid (bool): Overall validation pass/fail
- errors (List[str]): Critical issues requiring attention
- warnings (List[str]): Non-critical suggestions
- corrections (Dict[str, str]): Suggested field corrections
- concepts (List[str]): Extracted key concepts
- score (float): Confidence score (0.0 to 1.0)
LLM Provider Support:
- ollama: Local LLM (free, slower, privacy-preserving)
- mistral: Mistral API (faster, requires API key, ~0.001 EUR per validation)
Example:
>>> from utils.llm_validator import validate_document, apply_corrections
>>>
>>> # Validate a parsed document
>>> parsed_doc = {
... "metadata": {"title": "Phenomenologie", "author": "Hegel"},
... "toc": [{"title": "Preface", "level": 1, "page": 1}],
... "chunks": [{"text": "La conscience...", "section_path": "Preface"}]
... }
>>> result = validate_document(parsed_doc, provider="ollama")
>>> print(f"Valid: {result['valid']}, Score: {result['score']}")
Valid: True, Score: 0.85
See Also:
utils.llm_cleaner: Text cleaning and validation
utils.llm_chunker: Semantic chunking of sections
utils.pdf_pipeline: Main pipeline orchestration
"""
from __future__ import annotations
import json
import logging
import re
from typing import Any, Dict, List, Optional, Match
from .llm_structurer import call_llm, _get_default_model, _get_default_mistral_model, _clean_json_string
from .types import LLMProvider, ValidationResult, ParsedDocument, ChunkData
logger: logging.Logger = logging.getLogger(__name__)
def _extract_json_from_response(text: str) -> Dict[str, Any]:
    """Pull a JSON object out of a raw LLM reply.

    Strategy one looks for a payload wrapped in <JSON></JSON> tags;
    strategy two grabs the widest span between the first '{' and the
    last '}'. The first strategy that parses wins.

    Args:
        text: Raw LLM reply, possibly mixing JSON with prose, markdown
            or XML-style tags.

    Returns:
        The decoded dictionary, or an empty dict when nothing parses.

    Example:
        >>> _extract_json_from_response('<JSON>{"valid": true, "score": 0.9}</JSON>')
        {'valid': True, 'score': 0.9}
    """
    tagged: Optional[Match[str]] = re.search(r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL)
    if tagged is not None:
        candidate: str = _clean_json_string(tagged.group(1))
        try:
            decoded: Dict[str, Any] = json.loads(candidate)
            return decoded
        except json.JSONDecodeError:
            pass  # fall through to the brace-span strategy
    opening: int = text.find("{")
    closing: int = text.rfind("}")
    if opening != -1 and closing > opening:
        candidate = _clean_json_string(text[opening:closing + 1])
        try:
            decoded = json.loads(candidate)
            return decoded
        except json.JSONDecodeError as exc:
            logger.warning(f"JSON invalide: {exc}")
    return {}
def validate_document(
    parsed_doc: Dict[str, Any],
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.1,
) -> ValidationResult:
    """Validate a parsed document's coherence and suggest corrections.

    Uses an LLM to analyze the document structure and content, checking
    for common issues like incorrect metadata, inconsistent TOC, or
    low-quality chunk content.

    Args:
        parsed_doc: Dictionary containing the parsed document with keys:
            - metadata: Dict with title, author, year, language
            - toc: List of TOC entries with title, level, page
            - chunks: List of text chunks with content and metadata
        model: LLM model name. If None, uses provider's default model.
        provider: LLM provider, either "ollama" (local) or "mistral" (API).
        temperature: Model temperature for response generation (0.0-1.0).
            Lower values produce more deterministic results.

    Returns:
        ValidationResult TypedDict containing:
            - valid: Overall validation status (True if no critical errors)
            - errors: Error-severity issues as readable strings
            - warnings: Warning-severity issues as readable strings
            - corrections: Dict mapping field names to suggested corrections
            - concepts: Extracted key concepts (empty for this function)
            - score: Confidence score from 0.0 to 1.0

    Note:
        The function always returns a valid result, even on LLM errors.
        Check the 'score' field - a score of 0.0 indicates an error occurred.
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    # Extract the key pieces of the parsed document.
    metadata: Dict[str, Any] = parsed_doc.get("metadata", {})
    toc: List[Dict[str, Any]] = parsed_doc.get("toc", [])
    chunks: List[Dict[str, Any]] = parsed_doc.get("chunks", [])
    # Compact summary sent to the LLM instead of the full document.
    doc_summary: Dict[str, Any] = {
        "title": metadata.get("title"),
        "author": metadata.get("author"),
        "toc_count": len(toc),
        "toc_preview": [t.get("title") for t in toc[:10]] if toc else [],
        "chunks_count": len(chunks),
        "first_chunks_preview": [
            c.get("text", "")[:100] for c in chunks[:5]
        ] if chunks else [],
    }
    prompt: str = f"""Tu es un expert en validation de documents structurés.
TÂCHE: Vérifier la cohérence de ce document parsé et détecter les erreurs.
DOCUMENT PARSÉ:
{json.dumps(doc_summary, ensure_ascii=False, indent=2)}
VÉRIFICATIONS À EFFECTUER:
1. Le titre correspond-il au contenu? (pas le nom d'une collection)
2. L'auteur est-il correctement identifié? (pas "Inconnu" si visible)
3. La TOC est-elle cohérente? (pas de doublons, bon ordre)
4. Les chunks contiennent-ils du vrai contenu? (pas que des métadonnées)
RÉPONDS avec un JSON entre <JSON></JSON>:
<JSON>
{{
"is_valid": true,
"confidence": 0.85,
"issues": [
{{
"field": "title",
"severity": "warning",
"message": "Le titre semble être le nom de la collection",
"suggestion": "Vrai titre suggéré"
}}
],
"corrections": {{
"title": "Titre corrigé si nécessaire",
"author": "Auteur corrigé si nécessaire"
}},
"quality_score": {{
"metadata": 0.8,
"toc": 0.9,
"chunks": 0.7
}}
}}
</JSON>
"""
    logger.info(f"Validation du document parsé via {provider.upper()}")
    try:
        response: str = call_llm(
            prompt, model=model, provider=provider, temperature=temperature, timeout=180
        )
        result: Dict[str, Any] = _extract_json_from_response(response)
        is_valid: bool = result.get("is_valid", True)
        issues: List[Any] = result.get("issues", [])
        corrections: Dict[str, str] = result.get("corrections", {})
        confidence: float = result.get("confidence", 0.5)
        # The prompt asks for issue dicts with field/severity/message keys.
        # Route them by severity instead of dumping raw dict reprs into
        # "errors" (previously "warnings" was always left empty).
        errors: List[str] = []
        warnings: List[str] = []
        for issue in issues:
            if isinstance(issue, dict):
                severity: str = str(issue.get("severity", "error")).lower()
                field: str = str(issue.get("field", "") or "")
                message: str = str(issue.get("message", "") or "")
                text: str = f"{field}: {message}" if field else message
                suggestion: Any = issue.get("suggestion")
                if suggestion:
                    text = f"{text} (suggestion: {suggestion})"
            else:
                # Defensive: tolerate a malformed (non-dict) issue entry.
                severity = "error"
                text = str(issue)
            if severity == "warning":
                warnings.append(text)
            else:
                errors.append(text)
        logger.info(f"Validation terminée: valid={is_valid}, issues={len(issues)}")
        validation_result: ValidationResult = {
            "valid": is_valid,
            "errors": errors,
            "warnings": warnings,
            "corrections": corrections,
            "concepts": [],
            "score": confidence,
        }
        return validation_result
    except Exception as e:
        # Fail-open: an LLM outage must not block the pipeline; the zero
        # score plus the recorded error keep the failure visible.
        logger.error(f"Erreur validation document: {e}")
        error_result: ValidationResult = {
            "valid": True,
            "errors": [str(e)],
            "warnings": [],
            "corrections": {},
            "concepts": [],
            "score": 0.0,
        }
        return error_result
def generate_section_summary(
    section_content: str,
    section_title: str,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    max_words: int = 50,
) -> str:
    """Produce a one-sentence summary of a section via the LLM.

    Sections shorter than 100 characters are not worth an LLM call: the
    title itself is returned. Only the first 2000 characters of the
    content are sent to the LLM, and the reply is hard-capped at
    ``max_words`` words.

    Args:
        section_content: Full text of the section to summarize.
        section_title: Section title; also the fallback return value.
        model: LLM model name; defaults to the provider's default model.
        provider: "ollama" (local) or "mistral" (cloud API).
        max_words: Upper bound on the summary's word count.

    Returns:
        The generated summary, or ``section_title`` when the content is
        too short, the reply is empty, or the LLM call fails.
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    if len(section_content) < 100:
        return section_title
    prompt: str = f"""Résume cette section en maximum {max_words} mots.
Le résumé doit capturer l'idée principale.
Titre: {section_title}
Contenu:
{section_content[:2000]}
Résumé (en une phrase):"""
    try:
        reply: str = call_llm(
            prompt, model=model, provider=provider, temperature=0.2, timeout=60
        )
        summary: str = reply.strip()
        # Enforce the word budget on the model's reply.
        tokens: List[str] = summary.split()
        if len(tokens) > max_words:
            summary = ' '.join(tokens[:max_words]) + '...'
        return summary or section_title
    except Exception as exc:
        logger.warning(f"Erreur génération résumé: {exc}")
        return section_title
def enrich_chunks_with_concepts(
    chunks: List[Dict[str, Any]],
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
) -> List[Dict[str, Any]]:
    """Attach key concepts to each chunk via per-chunk LLM extraction.

    Every chunk receives a "concepts" list of up to 5 strings. Chunks
    that already carry a non-empty "concepts" field are left untouched,
    and chunks with fewer than 100 characters of text get an empty list
    without an LLM call.

    Args:
        chunks: Chunk dicts, each with at least a "text" key. Modified
            in place.
        model: LLM model name; defaults to the provider's default model.
        provider: "ollama" (local) or "mistral" (cloud API).

    Returns:
        The same list of chunks, enriched in place.

    Note:
        Only the first 1000 characters of each chunk are analyzed, and a
        failed extraction yields an empty concepts list for that chunk.
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    batch_size: int = 10  # granularity of progress logging
    for position, chunk in enumerate(chunks):
        if chunk.get("concepts"):
            continue  # already enriched
        body: str = chunk.get("text", "")
        if len(body) < 100:
            chunk["concepts"] = []
            continue
        if position % batch_size == 0:
            logger.info(f"Enrichissement concepts: chunks {position} à {min(position+batch_size, len(chunks))}")
        prompt: str = f"""Extrait 3-5 concepts clés de ce texte.
Réponds avec une liste JSON: ["concept1", "concept2", ...]
Texte:
{body[:1000]}
Concepts:"""
        try:
            reply: str = call_llm(
                prompt, model=model, provider=provider, temperature=0.1, timeout=30
            )
            # The reply should contain a bare JSON array of strings.
            found: Optional[Match[str]] = re.search(r'\[.*?\]', reply, re.DOTALL)
            if found:
                chunk["concepts"] = json.loads(found.group())[:5]
            else:
                chunk["concepts"] = []
        except Exception as exc:
            logger.warning(f"Erreur extraction concepts chunk {position}: {exc}")
            chunk["concepts"] = []
    return chunks
def clean_validation_annotations(text: str) -> str:
    """Remove LLM-generated validation annotations from text.

    Cleans common annotation patterns that LLMs may add when validating
    or correcting text, such as confidence markers or verification notes.
    Both the accented ("à confirmer") and unaccented ("a confirmer")
    spellings are handled, since LLM and OCR output drop accents
    inconsistently (the previous accent-only pattern missed
    "(a confirmer)" despite the documented example).

    Patterns removed:
        - "(correct)" or "(a confirmer)" / "(à confirmer)" at end of text
        - "(a confirmer comme titre principal)" and similar
        - "(possiblement...)" or "(probablement...)"
        - Isolated "(correct)" or "(a confirmer)" mid-text

    Args:
        text: Text potentially containing LLM annotation artifacts.

    Returns:
        Cleaned text with annotations removed and whitespace normalized.
        Returns the original text if input is None or empty.

    Example:
        >>> clean_validation_annotations("Phenomenologie (a confirmer)")
        'Phenomenologie'
        >>> clean_validation_annotations("G.W.F. Hegel (correct)")
        'G.W.F. Hegel'
    """
    if not text:
        return text
    # Trailing parenthesized annotations, e.g. "Titre (à confirmer ...)".
    # "[aà]" accepts both spellings of "à confirmer".
    text = re.sub(
        r'\s*\([^)]*(?:correct|[aà] confirmer|possiblement|probablement)[^)]*\)\s*$',
        '',
        text,
        flags=re.IGNORECASE
    )
    # Isolated annotations mid-text collapse to a single space.
    text = re.sub(r'\s*\((?:correct|[aà] confirmer)\)\s*', ' ', text, flags=re.IGNORECASE)
    return text.strip()
def apply_corrections(
    parsed_doc: Dict[str, Any],
    validation_result: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Apply validation corrections to a parsed document's metadata.

    Corrected values suggested by validate_document() replace the
    current metadata, with the previous values preserved under
    ``original_<field>`` keys. Existing metadata is also scrubbed of
    LLM annotation artifacts, and the validation result is attached to
    the document.

    Args:
        parsed_doc: Parsed document dict; its "metadata" sub-dict (title,
            author, optionally "work") is updated in place.
        validation_result: Output of validate_document(), whose
            "corrections" mapping drives the updates. When None, only
            annotation cleanup is performed.

    Returns:
        The same parsed_doc, with corrected/cleaned metadata and a
        "validation" key holding validation_result.

    Note:
        When the title still contains validation phrases and a "work"
        field exists, the work value replaces the title; empty
        correction values are ignored.
    """
    corrections: Dict[str, str] = {}
    if validation_result:
        corrections = validation_result.get("corrections", {})
    metadata: Dict[str, Any] = parsed_doc.get("metadata", {})
    # Apply suggested metadata corrections, keeping the previous values.
    if corrections.get("title"):
        previous_title: Optional[str] = metadata.get("title")
        cleaned_title: str = clean_validation_annotations(corrections["title"])
        metadata["title"] = cleaned_title
        metadata["original_title"] = previous_title
        logger.info(f"Titre corrigé: '{previous_title}' -> '{cleaned_title}'")
    if corrections.get("author"):
        previous_author: Optional[str] = metadata.get("author")
        cleaned_author: str = clean_validation_annotations(corrections["author"])
        metadata["author"] = cleaned_author
        metadata["original_author"] = previous_author
        logger.info(f"Auteur corrigé: '{previous_author}' -> '{cleaned_author}'")
    # Scrub existing metadata even when no corrections were suggested.
    current_title: Optional[str] = metadata.get("title")
    if current_title:
        validation_phrases: List[str] = ["à confirmer", "confirmer avec", "vérifier"]
        lowered: str = current_title.lower()
        if any(phrase in lowered for phrase in validation_phrases):
            # A title polluted by validation phrases: prefer the "work" field.
            if metadata.get("work"):
                logger.info(f"Titre remplacé par 'work': '{current_title}' -> '{metadata['work']}'")
                metadata["original_title"] = current_title
                metadata["title"] = metadata["work"]
        else:
            metadata["title"] = clean_validation_annotations(current_title)
    if metadata.get("author"):
        metadata["author"] = clean_validation_annotations(metadata["author"])
    parsed_doc["metadata"] = metadata
    parsed_doc["validation"] = validation_result
    return parsed_doc

View File

@@ -0,0 +1,141 @@
"""Markdown document builder from OCR API responses.
This module transforms Mistral OCR API responses into structured Markdown text.
It handles text extraction, page marker insertion, and image processing
(either base64 embedding or disk-based storage with relative path references).
The builder is a core component of the PDF processing pipeline, sitting between
OCR extraction and hierarchical parsing.
Pipeline Position:
PDF → OCR (mistral_client) → **Markdown Builder** → Hierarchy Parser → Chunks
Features:
- Page markers: Inserts HTML comments (<!-- Page N -->) for traceability
- Image handling: Supports both inline base64 and external file references
- Type safety: Uses Protocol-based typing for OCR response structures
Workflow:
1. Iterate through pages in the OCR response
2. Extract Markdown content from each page
3. Process images (embed as base64 or save via ImageWriter callback)
4. Assemble the complete Markdown document
Image Handling Modes:
1. **No images**: Set embed_images=False and image_writer=None
2. **Inline base64**: Set embed_images=True (large file size)
3. **External files**: Provide image_writer callback (recommended)
Example:
>>> from pathlib import Path
>>> from utils.image_extractor import create_image_writer
>>>
>>> # Create image writer for output directory
>>> writer = create_image_writer(Path("output/my_doc/images"))
>>>
>>> # Build markdown with external image references
>>> markdown = build_markdown(
... ocr_response,
... embed_images=False,
... image_writer=writer
... )
>>> print(markdown[:100])
<!-- Page 1 -->
# Document Title
...
Note:
- Page indices are 1-based for human readability
- The OCR response must follow the Mistral API structure
- Empty pages produce only the page marker comment
See Also:
- utils.mistral_client: OCR API client for obtaining responses
- utils.image_extractor: Image writer factory and extraction
- utils.hierarchy_parser: Next step in pipeline (structure parsing)
"""
from typing import Any, Callable, List, Optional, Protocol
# Type pour le writer d'images
ImageWriterCallable = Callable[[int, int, str], Optional[str]]
class OCRImage(Protocol):
    """Structural type for a single image extracted by the Mistral OCR API."""

    # Base64-encoded image payload; None when the API returned no data.
    image_base64: Optional[str]
class OCRPage(Protocol):
    """Structural type for a single page extracted by the Mistral OCR API."""

    # Markdown content of the page; None when no text was extracted.
    markdown: Optional[str]
    # Images extracted from the page; None when the page has none.
    images: Optional[List[OCRImage]]
class OCRResponseProtocol(Protocol):
    """Structural type for the complete Mistral OCR API response."""

    # Ordered list of extracted pages (first element is the first page).
    pages: List[OCRPage]
def build_markdown(
    ocr_response: OCRResponseProtocol,
    embed_images: bool = False,
    image_writer: Optional[ImageWriterCallable] = None,
) -> str:
    """Assemble the full Markdown document from a Mistral OCR response.

    Each page contributes a page-marker HTML comment, its Markdown body
    (when present), and one reference per extracted image. Images are
    either inlined as base64 data URIs (``embed_images=True``) or
    persisted through ``image_writer``, whose returned relative path is
    referenced instead. With neither option, images are dropped.

    Args:
        ocr_response: Mistral OCR response exposing a ``pages`` sequence.
        embed_images: When True, inline images as base64 data URIs.
        image_writer: Callback ``(page_idx, img_idx, base64) -> rel_path``
            that stores an image and returns the path to reference;
            ignored when ``embed_images`` is True.

    Returns:
        The complete Markdown text, page markers included.

    Example:
        >>> md = build_markdown(
        ...     ocr_response,
        ...     embed_images=False,
        ...     image_writer=lambda p, i, b64: f"images/p{p}_i{i}.png"
        ... )
    """
    pieces: List[str] = []
    for page_no, page in enumerate(ocr_response.pages, start=1):
        # 1-based page marker for traceability.
        pieces.append(f"<!-- Page {page_no} -->\n\n")
        body: Optional[str] = getattr(page, "markdown", None)
        if body:
            pieces.append(body)
            pieces.append("\n\n")
        images: Optional[List[OCRImage]] = getattr(page, "images", None)
        for img_no, image in enumerate(images or [], start=1):
            payload: Optional[str] = getattr(image, "image_base64", None)
            if not payload:
                continue
            if embed_images:
                # Inline the image as a data URI (large output files).
                uri: str = f"data:image/png;base64,{payload}"
                pieces.append(f"![Page {page_no} Image {img_no}]({uri})\n\n")
            elif image_writer:
                # Persist to disk and reference by relative path.
                target: Optional[str] = image_writer(page_no, img_no, payload)
                if target:
                    pieces.append(f"![Page {page_no} Image {img_no}]({target})\n\n")
    return "".join(pieces)

View File

@@ -0,0 +1,169 @@
"""Mistral API Client Management.
This module provides utilities for managing the Mistral API client,
including API key retrieval and OCR cost estimation. It serves as the
foundation for all Mistral API interactions in the Library RAG pipeline.
Key Features:
- Automatic API key discovery from multiple sources
- Client instantiation with proper authentication
- OCR cost estimation for budget planning
API Key Priority:
The module searches for the Mistral API key in this order:
1. Explicit argument passed to functions
2. MISTRAL_API_KEY environment variable
3. .env file in the project root
Cost Estimation:
Mistral OCR pricing (as of 2024):
- Standard OCR: ~1 EUR per 1000 pages (0.001 EUR/page)
- OCR with annotations: ~3 EUR per 1000 pages (0.003 EUR/page)
Example:
Basic client creation and usage::
from utils.mistral_client import create_client, estimate_ocr_cost
# Create authenticated client
client = create_client()
# Estimate cost for a 100-page document
cost = estimate_ocr_cost(100, use_annotations=False)
print(f"Estimated cost: {cost:.2f} EUR") # Output: Estimated cost: 0.10 EUR
Using explicit API key::
client = create_client(api_key="your-api-key-here")
See Also:
- :mod:`utils.ocr_processor`: OCR execution functions using this client
- :mod:`utils.pdf_uploader`: PDF upload utilities for OCR processing
Note:
Ensure MISTRAL_API_KEY is set before using this module in production.
The API key can be obtained from the Mistral AI platform dashboard.
"""
import os
from typing import Optional
from dotenv import load_dotenv
from mistralai import Mistral
def get_api_key(api_key: Optional[str] = None) -> str:
    """Resolve the Mistral API key, trying sources in priority order.

    Sources, in order: the explicit ``api_key`` argument, the
    MISTRAL_API_KEY environment variable, and finally a ``.env`` file
    loaded from the project root.

    Args:
        api_key: Key supplied directly by the caller; when non-empty it
            is used as-is (stripped of surrounding whitespace).

    Returns:
        The resolved Mistral API key.

    Raises:
        RuntimeError: If no API key is found in any source.

    Example:
        >>> get_api_key("my-api-key")
        'my-api-key'
    """
    # 1. An explicit argument always wins.
    if api_key and api_key.strip():
        return api_key.strip()
    # 2. Environment variable, then 3. .env file: load_dotenv() populates
    # os.environ, so the same lookup is simply retried after loading.
    for attempt in range(2):
        candidate: str = os.getenv("MISTRAL_API_KEY", "").strip()
        if candidate:
            return candidate
        if attempt == 0:
            load_dotenv()
    raise RuntimeError(
        "MISTRAL_API_KEY manquante. "
        "Definissez la variable d environnement ou creez un fichier .env"
    )
def create_client(api_key: Optional[str] = None) -> Mistral:
    """Build an authenticated Mistral API client.

    Primary entry point for obtaining a client usable for OCR, chat
    completions, and the other Mistral API features.

    Args:
        api_key: Optional explicit key. When omitted, the key is resolved
            by :func:`get_api_key` (environment variable or .env file).

    Returns:
        An authenticated :class:`mistralai.Mistral` instance ready for
        API calls.

    Raises:
        RuntimeError: Propagated from :func:`get_api_key` when no key
            can be found.

    Example:
        >>> client = create_client()  # doctest: +SKIP
        >>> response = client.ocr.process(...)  # doctest: +SKIP
    """
    return Mistral(api_key=get_api_key(api_key))
def estimate_ocr_cost(nb_pages: int, use_annotations: bool = False) -> float:
    """Estimate the cost, in euros, of OCR processing for a document.

    Based on the Mistral OCR pricing model:

    - standard OCR: 0.001 EUR per page (~1 EUR / 1000 pages)
    - OCR with annotations: 0.003 EUR per page (~3 EUR / 1000 pages)

    Annotation mode costs roughly 3x more but returns additional
    structural information useful for TOC extraction. Use this function
    for budget planning before processing large collections.

    Args:
        nb_pages: Number of pages in the document to process.
        use_annotations: When True, apply the annotated-OCR rate.

    Returns:
        Estimated cost in euros.

    Example:
        >>> estimate_ocr_cost(100)
        0.1
        >>> estimate_ocr_cost(100, use_annotations=True)
        0.3
        >>> estimate_ocr_cost(10000)
        10.0
    """
    # Per-page rate depends only on whether annotations are requested.
    per_page_rate = 0.003 if use_annotations else 0.001
    return nb_pages * per_page_rate

View File

@@ -0,0 +1,312 @@
"""OCR Processing via Mistral API.
This module provides functions for executing OCR (Optical Character Recognition)
on PDF documents using the Mistral API. It handles both standard OCR and advanced
OCR with structured annotations for better document understanding.
Key Features:
- Standard OCR for text extraction with optional image embedding
- Advanced OCR with document and bounding box annotations
- Response serialization for JSON storage and further processing
- Support for page-by-page processing
OCR Modes:
1. **Standard OCR** (run_ocr):
- Extracts text and optionally images
- Cost: ~1 EUR per 1000 pages (0.001 EUR/page)
- Best for: Simple text extraction, content indexing
2. **OCR with Annotations** (run_ocr_with_annotations):
- Extracts text with structural metadata (bounding boxes, document structure)
- Cost: ~3 EUR per 1000 pages (0.003 EUR/page)
- Best for: TOC extraction, layout analysis, structured documents
- Document annotations limited to 8 pages max
- Bounding box annotations have no page limit
Response Structure:
The OCR response contains:
- pages: List of page objects with text content
- images: Optional base64-encoded images (if include_images=True)
- annotations: Structural metadata (if using annotation mode)
Example:
Basic OCR processing::
from utils.mistral_client import create_client
from utils.ocr_processor import run_ocr, serialize_ocr_response
# Create client and read PDF
client = create_client()
with open("document.pdf", "rb") as f:
pdf_bytes = f.read()
# Run OCR
response = run_ocr(client, pdf_bytes, "document.pdf")
# Serialize for storage
ocr_dict = serialize_ocr_response(response)
print(f"Extracted {len(ocr_dict['pages'])} pages")
Cost Considerations:
- Always estimate costs before batch processing with estimate_ocr_cost()
- Use pages parameter to limit processing when full document is not needed
- Annotation mode is 3x more expensive - use only when structure is needed
- Cache OCR results to avoid reprocessing (saved in output/<doc>/<doc>.json)
See Also:
- utils.mistral_client: Client creation and cost estimation
- utils.pdf_uploader: PDF upload utilities
- utils.pdf_pipeline: Full pipeline orchestration
Note:
OCR responses are Pydantic models from the Mistral SDK. Use
serialize_ocr_response() to convert to dictionaries before JSON storage.
"""
import json
from typing import Any, Dict, List, Optional, Type
from mistralai import Mistral
from pydantic import BaseModel
from .pdf_uploader import upload_pdf
from .types import OCRResponse
def run_ocr(
    client: Mistral,
    file_bytes: bytes,
    filename: str,
    include_images: bool = True,
) -> Any:
    """Run standard OCR on a PDF document through the Mistral API.

    The PDF is first uploaded to Mistral storage (the upload is cleaned
    up by Mistral after processing), then sent to the
    ``mistral-ocr-latest`` model. This is the most cost-effective OCR
    mode (~0.001 EUR/page), suitable for plain text extraction and
    content indexing.

    Args:
        client: Authenticated Mistral client created via
            utils.mistral_client.create_client().
        file_bytes: Binary content of the PDF file.
        filename: Original file name, used to identify the upload.
        include_images: When True, the response embeds base64-encoded
            images for each page; disable to shrink the response when
            images are not needed. Defaults to True.

    Returns:
        The raw OCR response (Pydantic model) containing a ``pages`` list
        with extracted text and, optionally, images. Convert it with
        serialize_ocr_response() before JSON storage.

    Raises:
        RuntimeError: If the client is not properly authenticated.
        HTTPError: On network failures or API rate limits.
    """
    # Push the PDF to Mistral storage and get back a signed URL for it.
    signed_url: str = upload_pdf(client, file_bytes, filename)

    # Process the uploaded document with the latest OCR model.
    document_ref: Dict[str, Any] = {
        "type": "document_url",
        "document_url": signed_url,
    }
    return client.ocr.process(
        model="mistral-ocr-latest",
        document=document_ref,
        include_image_base64=include_images,
    )
def run_ocr_with_annotations(
    client: Mistral,
    file_bytes: bytes,
    filename: str,
    include_images: bool = True,
    document_annotation_format: Optional[Type[BaseModel]] = None,
    bbox_annotation_format: Optional[Type[BaseModel]] = None,
    pages: Optional[List[int]] = None,
) -> Any:
    """Run OCR with structured (schema-driven) annotations on a PDF.

    Besides plain text, this mode asks Mistral to return structured
    metadata matching the supplied Pydantic schemas — useful for tables
    of contents, form fields, or document hierarchy. It costs roughly 3x
    the standard OCR rate (~0.003 EUR/page).

    Two annotation channels exist:

    - document annotations: document-level structure (API limit: 8 pages)
    - bounding-box annotations: per-element positions (no page limit)

    Args:
        client: Authenticated Mistral client created via
            utils.mistral_client.create_client().
        file_bytes: Binary content of the PDF file.
        filename: Original file name, used to identify the upload.
        include_images: Embed base64-encoded page images in the response.
            Defaults to True.
        document_annotation_format: Pydantic model describing the expected
            document-level annotation payload; converted to a JSON schema
            for the API. Limited to 8 pages by the API.
        bbox_annotation_format: Pydantic model describing the expected
            bounding-box annotation payload. No page limit.
        pages: Optional 0-indexed page numbers to process; None processes
            the whole document. Restricting pages keeps cost and latency
            down.

    Returns:
        The raw OCR response including any requested annotations. Convert
        it with serialize_ocr_response() before JSON storage.

    Raises:
        RuntimeError: If the client is not properly authenticated.
        HTTPError: On network failures or API rate limits.
        ValueError: If document annotations are requested for more than
            8 pages.
    """
    from mistralai.extra import response_format_from_pydantic_model

    # Upload first: the OCR endpoint consumes a signed document URL.
    signed_url: str = upload_pdf(client, file_bytes, filename)

    call_args: Dict[str, Any] = {
        "model": "mistral-ocr-latest",
        "document": {
            "type": "document_url",
            "document_url": signed_url,
        },
        "include_image_base64": include_images,
    }

    # Optional restriction to a subset of (0-indexed) pages.
    if pages is not None:
        call_args["pages"] = pages

    # Attach whichever annotation schemas were requested, converted to the
    # JSON-schema response format expected by the API.
    for arg_name, schema in (
        ("document_annotation_format", document_annotation_format),
        ("bbox_annotation_format", bbox_annotation_format),
    ):
        if schema is not None:
            call_args[arg_name] = response_format_from_pydantic_model(schema)

    return client.ocr.process(**call_args)
def serialize_ocr_response(response: Any) -> Dict[str, Any]:
    """Turn a Mistral OCR response object into a plain dictionary.

    OCR calls return Pydantic model instances that must be converted to
    ordinary dictionaries before being written to JSON files
    (output/<doc>/<doc>.json) or passed further down the pipeline.
    Several serialization methods are tried so that both Pydantic v2
    (``model_dump``) and v1 (``dict``/``json``) style objects are
    supported.

    Args:
        response: OCR response object exposing ``model_dump()``,
            ``dict()``, or ``json()``.

    Returns:
        A JSON-serializable dictionary, typically containing ``pages``,
        optional ``images``, ``model``, and ``usage`` entries.

    Raises:
        TypeError: When none of the supported serialization methods is
            available on the object.

    Example:
        >>> ocr_dict = serialize_ocr_response(response)  # doctest: +SKIP
        >>> num_pages = len(ocr_dict["pages"])  # doctest: +SKIP
    """
    # Try serializers from most to least preferred.
    converters = (
        ("model_dump", lambda obj: obj.model_dump()),   # Pydantic v2
        ("dict", lambda obj: obj.dict()),               # Pydantic v1
        ("json", lambda obj: json.loads(obj.json())),   # generic fallback
    )
    for attr_name, convert in converters:
        if hasattr(response, attr_name):
            payload: Dict[str, Any] = convert(response)
            return payload
    raise TypeError("Réponse OCR non sérialisable")

View File

@@ -0,0 +1,55 @@
"""Schémas Pydantic pour l'extraction structurée via OCR avec annotations.
Utilisés avec document_annotation_format et bbox_annotation_format de l'API Mistral.
"""
from typing import List, Optional
from pydantic import BaseModel, Field
from enum import Enum
class TocEntryType(str, Enum):
    """Type d'entrée de table des matières."""
    # str-valued Enum: member values serialize directly into the JSON
    # schema sent to the Mistral annotation endpoint (see TocEntry).
    CHAPTER = "chapter"  # main/numbered chapter
    SECTION = "section"  # regular section (default in TocEntry)
    SUBSECTION = "subsection"  # nested under a section
    PREAMBLE = "preamble"  # preface / introduction material
    APPENDIX = "appendix"  # back-matter annexes
class TocEntry(BaseModel):
    """Entrée de table des matières avec hiérarchie."""
    # NOTE: The Field descriptions below are not just documentation — they
    # are emitted into the JSON schema passed as document_annotation_format
    # to the Mistral OCR endpoint and act as extraction instructions for
    # the model. Do not shorten or translate them without re-validating
    # extraction quality.
    title: str = Field(..., description="Titre exact de la section tel qu'il apparaît dans la table des matières")
    page_number: int = Field(..., description="Numéro de page réel tel qu'imprimé/affiché dans le livre (PAS l'index séquentiel du PDF, mais le numéro visible sur la page elle-même)")
    level: int = Field(..., description="""Niveau hiérarchique détecté VISUELLEMENT dans la mise en page de la table des matières:
- level=1 si le titre est aligné à gauche SANS indentation (titres principaux)
- level=2 si le titre a une PETITE indentation ou est légèrement décalé vers la droite
- level=3 si le titre a une DOUBLE indentation ou est très décalé vers la droite
Regardez attentivement l'alignement horizontal et les espaces avant chaque titre pour déterminer le niveau.""")
    entry_type: TocEntryType = Field(default=TocEntryType.SECTION, description="Type d'entrée: 'preamble' pour préfaces/introductions, 'chapter' pour chapitres, 'section' pour sections, 'subsection' pour sous-sections, 'appendix' pour annexes")
    parent_title: Optional[str] = Field(None, description="Si level > 1, indiquer le titre du parent direct (l'entrée de level=1 sous laquelle cette entrée est indentée)")
class DocumentTOC(BaseModel):
    """Table des matières complète du document."""
    # NOTE: Field descriptions double as extraction instructions sent to
    # the Mistral annotation endpoint (see TocEntry) — keep them verbatim.
    entries: List[TocEntry] = Field(..., description="""Liste COMPLÈTE de TOUTES les entrées de la table des matières dans l'ordre d'apparition.
IMPORTANT : Analysez attentivement l'indentation/alignement horizontal de chaque titre pour assigner le bon niveau hiérarchique:
- Les titres alignés à gauche (non indentés) = level 1
- Les titres légèrement indentés/décalés vers la droite = level 2 (sous-sections du titre level 1 précédent)
- Les titres avec double indentation = level 3 (sous-sections du titre level 2 précédent)
Chaque entrée doit avoir son vrai numéro de page tel qu'imprimé dans le livre.""")
    has_explicit_toc: bool = Field(..., description="Le document contient-il une table des matières explicite et visible ? (généralement en début de document)")
    toc_page_numbers: List[int] = Field(..., description="Liste des numéros de pages où se trouve la table des matières (généralement pages 2-5)")
class DocumentMetadata(BaseModel):
    """Métadonnées enrichies du document."""
    # Top-level annotation schema: passed as document_annotation_format to
    # the Mistral OCR endpoint (see utils.toc_extractor). Field descriptions
    # act as extraction instructions for the model — keep them verbatim.
    title: str = Field(..., description="Titre complet du document")
    author: str = Field(..., description="Auteur principal du document")
    languages: List[str] = Field(..., description="Liste des langues présentes dans le document (codes ISO 639-1, ex: ['fr', 'en'])")
    summary: str = Field(..., description="Résumé du document en 2-3 phrases maximum")
    collection: Optional[str] = Field(None, description="Nom de la collection ou série éditoriale")
    publisher: Optional[str] = Field(None, description="Nom de l'éditeur")
    year: Optional[int] = Field(None, description="Année de publication")
    total_pages: int = Field(..., description="Nombre total de pages dans le document")
    toc: DocumentTOC = Field(..., description="Table des matières structurée avec hiérarchie et numéros de page réels")

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,31 @@
"""Upload de fichiers PDF vers l'API Mistral."""
from mistralai import Mistral
def upload_pdf(client: Mistral, file_bytes: bytes, filename: str) -> str:
    """Upload a PDF to Mistral storage and return its signed URL.

    Args:
        client: Authenticated Mistral client.
        file_bytes: Binary content of the PDF file.
        filename: Name under which the file is registered.

    Returns:
        A signed URL usable as a ``document_url`` in OCR requests.
    """
    # Register the file under the OCR purpose, then exchange its id for a
    # signed URL that the OCR endpoint can consume.
    uploaded_file = client.files.upload(
        file={
            "file_name": filename,
            "content": file_bytes,
        },
        purpose="ocr",
    )
    signed_doc = client.files.get_signed_url(file_id=uploaded_file.id)
    return signed_doc.url

View File

@@ -0,0 +1,382 @@
"""TOC Enrichment Module for Chunk Metadata Enhancement.
This module provides functions to enrich chunk metadata with hierarchical
information from the table of contents (TOC). It matches chunks to their
corresponding TOC entries and extracts:
- Full hierarchical paths (e.g., "Peirce: CP 1.628 > 628. It is...")
- Chapter titles
- Canonical academic references (e.g., "CP 1.628", "Ménon 80a")
The enrichment happens before Weaviate ingestion to ensure chunks have
complete metadata for rigorous academic citation.
Usage:
>>> from utils.toc_enricher import enrich_chunks_with_toc
>>> enriched_chunks = enrich_chunks_with_toc(chunks, toc, hierarchy)
See Also:
- utils.types: FlatTOCEntryEnriched type definition
- utils.weaviate_ingest: Integration point for enrichment
"""
from __future__ import annotations
import logging
import re
from typing import Any, Dict, List, Optional
from .types import FlatTOCEntryEnriched
logger = logging.getLogger(__name__)
def flatten_toc_with_paths(
    toc: List[Dict[str, Any]],
    hierarchy: Dict[str, Any],
) -> List[FlatTOCEntryEnriched]:
    """Flatten hierarchical or flat TOC and build full paths with metadata.

    Handles both hierarchical TOCs (with 'children' keys) and flat TOCs
    (where parent-child relationships are inferred from 'level' field).

    Traverses the TOC structure and creates enriched flat entries with:
    - Full hierarchical path (e.g., "Peirce: CP 1.628 > 628. It is...")
    - Canonical reference extraction (e.g., "CP 1.628")
    - Chapter title tracking (first level 1 ancestor)
    - Parent title list for context

    Args:
        toc: TOC structure with 'title' and 'level' fields, optionally 'children'
        hierarchy: Document hierarchy (currently unused, reserved for future)

    Returns:
        List of enriched flat TOC entries with full metadata.

    Example:
        >>> toc = [
        ...     {"title": "Peirce: CP 1.628", "level": 1},
        ...     {"title": "628. It is the instincts...", "level": 2}
        ... ]
        >>> flat = flatten_toc_with_paths(toc, {})
        >>> flat[1]["full_path"]
        'Peirce: CP 1.628 > 628. It is the instincts...'
        >>> flat[1]["canonical_ref"]
        'CP 1.628'
    """
    flat_toc: List[FlatTOCEntryEnriched] = []
    # Check if TOC is hierarchical (has children) or flat (level-based)
    # NOTE(review): only the top-level entries are inspected for a
    # 'children' key; a TOC whose children appear deeper would be treated
    # as flat — confirm upstream always nests from the root.
    is_hierarchical = any("children" in entry for entry in toc if entry)
    if is_hierarchical:
        # Original recursive approach for hierarchical TOCs
        def traverse(
            entries: List[Dict[str, Any]],
            parent_titles: List[str],
            current_chapter: str,
            current_canonical: Optional[str],
        ) -> None:
            """Recursively traverse TOC entries and build flat list."""
            for entry in entries:
                title = entry.get("title", "")
                level = entry.get("level", 0)
                children = entry.get("children", [])
                # Build full path from parents + current title
                full_path_parts = parent_titles + [title]
                full_path = " > ".join(full_path_parts)
                # Extract canonical reference if present in title.
                # Peirce-style "CP 1.628" takes precedence over
                # Stephanus-style "Ménon 80a"; otherwise the parent's
                # reference (current_canonical) is inherited unchanged.
                canonical_ref = current_canonical
                cp_match = re.search(r'CP\s+(\d+\.\d+)', title)
                stephanus_match = re.search(r'(\w+\s+\d+[a-z])', title)
                if cp_match:
                    canonical_ref = f"CP {cp_match.group(1)}"
                elif stephanus_match:
                    canonical_ref = stephanus_match.group(1)
                # Update chapter title when entering level 1
                chapter_title = current_chapter
                if level == 1:
                    chapter_title = title
                # Create enriched entry
                enriched_entry: FlatTOCEntryEnriched = {
                    "title": title,
                    "level": level,
                    "full_path": full_path,
                    "chapter_title": chapter_title,
                    "canonical_ref": canonical_ref,
                    "parent_titles": parent_titles.copy(),
                    "index_in_flat_list": len(flat_toc),
                }
                flat_toc.append(enriched_entry)
                # Recursively process children
                if children:
                    traverse(
                        children,
                        parent_titles + [title],
                        chapter_title,
                        canonical_ref,
                    )
        traverse(toc, [], "", None)
    else:
        # New iterative approach for flat TOCs (infer hierarchy from levels)
        parent_stack: List[Dict[str, Any]] = []  # Stack of (level, title, canonical_ref)
        current_chapter = ""
        current_canonical: Optional[str] = None
        for entry in toc:
            title = entry.get("title", "")
            level = entry.get("level", 1)
            # Pop parents that are at same or deeper level
            while parent_stack and parent_stack[-1]["level"] >= level:
                parent_stack.pop()
            # Build parent titles list
            parent_titles = [p["title"] for p in parent_stack]
            # Build full path
            full_path_parts = parent_titles + [title]
            full_path = " > ".join(full_path_parts)
            # Extract canonical reference if present in title
            cp_match = re.search(r'CP\s+(\d+\.\d+)', title)
            stephanus_match = re.search(r'(\w+\s+\d+[a-z])', title)
            if cp_match:
                current_canonical = f"CP {cp_match.group(1)}"
            elif stephanus_match:
                current_canonical = stephanus_match.group(1)
            elif level == 1:
                # Reset canonical ref at level 1 if none found
                current_canonical = None
            # Inherit canonical ref from parent if not found
            # NOTE(review): current_canonical also persists across sibling
            # entries at level > 1 until a new match or a level-1 reset —
            # confirm this carry-over is intended.
            if not current_canonical and parent_stack:
                current_canonical = parent_stack[-1].get("canonical_ref")
            # Update chapter title when at level 1
            if level == 1:
                current_chapter = title
            # Create enriched entry
            enriched_entry: FlatTOCEntryEnriched = {
                "title": title,
                "level": level,
                "full_path": full_path,
                "chapter_title": current_chapter,
                "canonical_ref": current_canonical,
                "parent_titles": parent_titles.copy(),
                "index_in_flat_list": len(flat_toc),
            }
            flat_toc.append(enriched_entry)
            # Add current entry to parent stack for next iteration
            parent_stack.append({
                "level": level,
                "title": title,
                "canonical_ref": current_canonical,
            })
    return flat_toc
def extract_paragraph_number(section_text: str) -> Optional[str]:
    """Extract the paragraph number from a section title or path.

    Handles the academic paragraph numbering formats found in the corpus:
    - "628. Text..."      -> "628"
    - "§42 Text..."       -> "42"
    - "628 Text..."       -> "628"  (bare leading number, no period)
    - "80a. Text..."      -> "80a"  (Stephanus pagination)
    - "CP 5.628. Text..." -> "628"

    Args:
        section_text: Section title or path text (may be empty).

    Returns:
        Extracted paragraph number or None if not found.

    Example:
        >>> extract_paragraph_number("628. It is the instincts...")
        '628'
        >>> extract_paragraph_number("§42 On the nature of...")
        '42'
        >>> extract_paragraph_number("80a. SOCRATE: Sais-tu...")
        '80a'
    """
    if not section_text:
        return None
    # Pattern 1: Standard paragraph number at start "628. Text"
    match = re.match(r'^(\d+[a-z]?)\.\s', section_text)
    if match:
        return match.group(1)
    # Pattern 2: Section symbol "§42 Text".
    # BUGFIX: the previous regex r'\s*(\d+[a-z]?)\s' could never match the
    # documented "§42" form — re.match anchors at position 0 and '§' is
    # neither whitespace nor a digit. Anchor on the symbol itself instead.
    match = re.match(r'^§\s*(\d+[a-z]?)', section_text)
    if match:
        return match.group(1)
    # Pattern 2b: bare leading number without a period "628 Text"
    # (preserves what the old pattern 2 actually matched).
    match = re.match(r'^\s*(\d+[a-z]?)\s', section_text)
    if match:
        return match.group(1)
    # Pattern 3: CP reference "CP 5.628. Text" → extract paragraph only
    match = re.match(r'^CP\s+\d+\.(\d+)\.\s', section_text)
    if match:
        return match.group(1)
    return None
def find_matching_toc_entry(
    chunk: Dict[str, Any],
    flat_toc: List[FlatTOCEntryEnriched],
) -> Optional[FlatTOCEntryEnriched]:
    """Locate the TOC entry that corresponds to a chunk.

    Three matching strategies are attempted, strongest first:

    1. **Exact title match** — the chunk's section equals a TOC title.
    2. **Paragraph-number match** — extract the paragraph number from both
       sides and compare, restricted to level-2 entries (actual content),
       with a first-significant-word check to disambiguate duplicates.
    3. **Proximity match** — fall back to the TOC entry whose flat index
       is closest to the chunk's order_index (heuristic: TOC and chunks
       follow a similar order).

    Args:
        chunk: Chunk dict with 'section', 'sectionPath', 'order_index'.
        flat_toc: Flattened TOC with enriched metadata.

    Returns:
        The best matching TOC entry, or None when no strategy applies.

    Example:
        >>> chunk = {"section": "628. It is the instincts...", "order_index": 42}
        >>> entry = find_matching_toc_entry(chunk, flat_toc)  # doctest: +SKIP
    """
    if not flat_toc:
        return None

    section_label = chunk.get("section", chunk.get("sectionPath", ""))
    if not section_label:
        return None

    # Strategy 1: exact title match.
    for candidate in flat_toc:
        if candidate["title"] == section_label:
            return candidate

    # Strategy 2: paragraph-number match on level-2 entries.
    section_para = extract_paragraph_number(section_label)
    if section_para:
        for candidate in flat_toc:
            if candidate["level"] != 2:
                continue
            if extract_paragraph_number(candidate["title"]) != section_para:
                continue
            # Disambiguate with a light text-similarity check on the first
            # significant (length > 3) word of the chunk section.
            significant_words = [w for w in section_label.split() if len(w) > 3]
            candidate_words = [w for w in candidate["title"].split() if len(w) > 3]
            if significant_words and candidate_words:
                if significant_words[0].lower() in candidate["title"].lower():
                    return candidate
            else:
                # No text to compare; accept the paragraph-number match.
                return candidate

    # Strategy 3: proximity fallback via order_index.
    order_position = chunk.get("order_index")
    if order_position is not None and flat_toc:
        return min(
            flat_toc,
            key=lambda entry: abs(entry["index_in_flat_list"] - order_position),
        )

    return None
def enrich_chunks_with_toc(
    chunks: List[Dict[str, Any]],
    toc: List[Dict[str, Any]],
    hierarchy: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """Attach hierarchical TOC metadata to every chunk that can be matched.

    Orchestrates the enrichment pass:

    1. Bails out untouched when no TOC is available (graceful fallback).
    2. Flattens the TOC a single time for efficient matching.
    3. Matches each chunk against the flattened TOC.
    4. Writes sectionPath, chapterTitle and canonicalReference back onto
       the matched chunks (modified in place).

    Args:
        chunks: Chunk dicts produced by pdf_pipeline.
        toc: Hierarchical TOC structure (may be empty).
        hierarchy: Document hierarchy dict (may be empty).

    Returns:
        The same chunk list, with matched chunks enriched in place. When
        the TOC is empty or flattening fails, chunks are returned
        unchanged (no regression).

    Example:
        >>> chunks = [{"text": "...", "section": "628. It is..."}]
        >>> enriched = enrich_chunks_with_toc(chunks, toc, {})  # doctest: +SKIP
    """
    # Graceful fallback: nothing to enrich against.
    if not toc:
        logger.info("No TOC available, skipping chunk enrichment")
        return chunks

    logger.info(f"Enriching {len(chunks)} chunks with TOC metadata...")

    # Flatten the TOC once; matching every chunk against the nested
    # structure directly would be needlessly expensive.
    try:
        flattened = flatten_toc_with_paths(toc, hierarchy)
        logger.info(f"Flattened TOC: {len(flattened)} entries")
    except Exception as e:
        logger.error(f"Failed to flatten TOC: {e}")
        return chunks  # Fallback on error

    matched_total = 0
    for chunk in chunks:
        toc_hit = find_matching_toc_entry(chunk, flattened)
        if not toc_hit:
            continue
        # Full hierarchical path, e.g. "Peirce: CP 1.628 > 628. It is..."
        chunk["sectionPath"] = toc_hit["full_path"]
        chunk["chapterTitle"] = toc_hit["chapter_title"]
        # Canonical academic reference (e.g. "CP 1.628"), when known.
        if toc_hit["canonical_ref"]:
            chunk["canonicalReference"] = toc_hit["canonical_ref"]
        matched_total += 1

    if chunks:
        logger.info(
            f"Enriched {matched_total}/{len(chunks)} chunks "
            f"({100 * matched_total / len(chunks):.1f}%)"
        )
    else:
        logger.info("No chunks to enrich")

    return chunks

View File

@@ -0,0 +1,260 @@
"""Table of Contents (TOC) extraction using Mistral OCR with annotations.
This module is the **primary entry point** for TOC extraction in the Library RAG
pipeline. It provides intelligent routing between two extraction strategies:
1. **Visual (bbox) Analysis** (default, recommended): Uses bounding box coordinates
to detect indentation and hierarchy based on horizontal positioning.
2. **Semantic (annotation) Analysis**: Uses Mistral's document_annotation_format
for structured metadata and TOC extraction.
The visual approach is more reliable for philosophical texts with complex
hierarchies (parts, chapters, sections, subsections).
Extraction Strategies:
┌─────────────────────────────────────────────────────────────┐
│ extract_toc_from_annotations(use_visual_bbox=True) │
│ ↓ (default) │
│ toc_extractor_visual.py → X-coordinate based hierarchy │
│ │
│ extract_toc_from_annotations(use_visual_bbox=False) │
│ ↓ │
│ DocumentMetadata Pydantic schema → Structured extraction │
└─────────────────────────────────────────────────────────────┘
Cost Considerations:
- Annotated OCR: ~0.003€/page (3x standard OCR cost)
- Only first N pages are processed (default: 8)
- Total cost: max_toc_pages × 0.003€
Output Structure:
{
"success": bool,
"metadata": {...}, # Document metadata
"toc": [...], # Hierarchical TOC (nested children)
"toc_flat": [...], # Flat list with levels
"cost_ocr_annotated": float
}
Example:
>>> from pathlib import Path
>>> from utils.toc_extractor import extract_toc_from_annotations
>>>
>>> # Extract TOC using visual analysis (recommended)
>>> result = extract_toc_from_annotations(
... pdf_path=Path("input/philosophy_book.pdf"),
... max_toc_pages=8,
... use_visual_bbox=True # default
... )
>>> if result["success"]:
... for entry in result["toc"]:
... print(f"{entry['title']} (p.{entry['page']})")
Functions:
- extract_toc_from_annotations(): Main entry point with strategy routing
- build_hierarchical_toc(): Converts flat TOC entries to nested structure
- map_toc_to_content(): Associates TOC entries with document content
See Also:
- utils.toc_extractor_visual: Visual/bbox-based extraction (default)
- utils.toc_extractor_markdown: Markdown indentation-based extraction
- utils.llm_toc: LLM-based TOC extraction (alternative approach)
"""
import json
import logging
from typing import Any, Dict, List, Optional, Union, cast
from pathlib import Path
from .ocr_schemas import DocumentMetadata, TocEntry
from .ocr_processor import run_ocr_with_annotations
from .mistral_client import create_client
logger: logging.Logger = logging.getLogger(__name__)
# TypedDict for hierarchical TOC nodes
class TOCNode(Dict[str, Any]):
    """Type alias for TOC node structure with title, page, level, type, children."""
    # NOTE(review): despite the docstring, this is a Dict subclass rather
    # than a true alias (``TOCNode = Dict[str, Any]``) or a TypedDict.
    # Instances behave like plain dicts; confirm no isinstance checks rely
    # on the subclass before simplifying.
    pass
def extract_toc_from_annotations(
    pdf_path: Path,
    api_key: Optional[str] = None,
    max_toc_pages: int = 8,
    use_visual_bbox: bool = True,  # NEW: visual (bbox) analysis is the default
) -> Dict[str, Any]:
    """Extract a structured TOC via OCR with annotations.

    Cost: 3 EUR / 1000 annotated pages (vs 1 EUR / 1000 for basic OCR).

    Args:
        pdf_path: Path to the PDF file.
        api_key: Mistral API key (optional; otherwise resolved from .env).
        max_toc_pages: Maximum number of pages to annotate (default 8,
            the API limit for document_annotation).
        use_visual_bbox: When True, use visual bounding-box analysis
            (more reliable) instead of the semantic annotation schema.

    Returns:
        Dict with:
            - success: bool
            - metadata: dict of enriched document metadata
            - toc: hierarchical list [{title, page, level, children}]
            - toc_flat: flat list [{title, page, level, type, parent_title}]
            - cost_ocr_annotated: float (cost in EUR)
            - error: str (on failure)
    """
    # When requested, delegate to the visual (bbox) strategy.
    if use_visual_bbox:
        logger.info("Utilisation de l'analyse visuelle (bbox) pour extraction TOC")
        from .toc_extractor_visual import extract_toc_with_visual_analysis
        return cast(Dict[str, Any], extract_toc_with_visual_analysis(pdf_path, api_key, max_toc_pages))
    # Otherwise continue with the semantic approach (document_annotation_format).
    try:
        client = create_client(api_key)
        pdf_bytes = pdf_path.read_bytes()
    except Exception as e:
        logger.error(f"Erreur initialisation client/lecture PDF : {e}")
        return {"success": False, "error": f"Initialisation échouée : {str(e)}"}
    # Phase 1: annotate the first pages to extract TOC + metadata.
    logger.info(f"Extraction TOC avec annotations sur {max_toc_pages} premières pages")
    try:
        annotated_response = run_ocr_with_annotations(
            client=client,
            file_bytes=pdf_bytes,
            filename=pdf_path.name,
            include_images=False,  # Images are not needed for the TOC
            document_annotation_format=DocumentMetadata,
            pages=list(range(max_toc_pages)),  # Pages 0 to max_toc_pages-1
        )
    except Exception as e:
        logger.error(f"Erreur appel OCR avec annotations : {e}")
        return {"success": False, "error": f"Appel OCR échoué : {str(e)}"}
    # Extract the document-level annotations from the response.
    doc_annotation = getattr(annotated_response, "document_annotation", None)
    if not doc_annotation:
        return {"success": False, "error": "Aucune annotation retournée par l'API"}
    # Convert to a dictionary (the API may return a JSON string).
    try:
        if isinstance(doc_annotation, str):
            metadata_dict = json.loads(doc_annotation)
        else:
            metadata_dict = doc_annotation
    except Exception as e:
        logger.error(f"Erreur parsing annotations : {e}")
        return {"success": False, "error": f"Parsing annotations échoué : {str(e)}"}
    # Validate with Pydantic and assemble the result payload.
    try:
        metadata = DocumentMetadata(**metadata_dict)
        toc_entries = metadata.toc.entries
        logger.info(f"TOC extraite : {len(toc_entries)} entrées")
        # Build the hierarchical TOC from the flat entries.
        hierarchical_toc = build_hierarchical_toc(toc_entries)
        return {
            "success": True,
            "metadata": metadata.model_dump(),
            "toc": hierarchical_toc,
            "toc_flat": [entry.model_dump() for entry in toc_entries],
            "cost_ocr_annotated": max_toc_pages * 0.003,  # 3 EUR / 1000 pages
        }
    except Exception as e:
        logger.error(f"Erreur validation annotations : {e}")
        return {"success": False, "error": f"Validation Pydantic échouée : {str(e)}"}
def build_hierarchical_toc(entries: List[TocEntry]) -> List[Dict[str, Any]]:
    """Turn a flat, level-annotated TOC into a nested tree.

    Walks the entries in document order while tracking the chain of currently
    open ancestors. Each entry becomes a child of the nearest preceding entry
    whose level is strictly smaller, or a root node when no such ancestor
    exists.

    Args:
        entries: Flat TocEntry items with level (1 = root, 2 = child of 1, ...).

    Returns:
        Nested structure [{title, page, level, type, children: [...]}].
    """
    roots: List[Dict[str, Any]] = []
    open_chain: List[Dict[str, Any]] = []  # Ancestors of the node being placed
    for item in entries:
        current: Dict[str, Any] = {
            "title": item.title,
            "page": item.page_number,
            "level": item.level,
            "type": item.entry_type.value,
            "children": [],
        }
        # Drop every open ancestor that is as deep as (or deeper than) this
        # entry: a level-N node must hang under the last node with level < N.
        while open_chain and open_chain[-1]["level"] >= item.level:
            open_chain.pop()
        parent_bucket = open_chain[-1]["children"] if open_chain else roots
        parent_bucket.append(current)
        open_chain.append(current)
    return roots
def map_toc_to_content(
    toc_entries: List[TocEntry],
    all_pages_markdown: str,
) -> Dict[str, str]:
    """Associate each TOC entry with the text of its page range.

    Splits the full markdown on the ``<!-- Page N -->`` markers, then gives
    every entry the pages from its own start page up to (but excluding) the
    next entry's start page; the last entry runs to the end of the document.

    Args:
        toc_entries: TOC entries carrying real printed page numbers.
        all_pages_markdown: Full document markdown with <!-- Page N --> markers.

    Returns:
        Mapping {section_title: content_text}.
    """
    segments: List[str] = all_pages_markdown.split("<!-- Page ")
    # End boundary of each section = start page of the next one; the final
    # section extends to the last split segment.
    boundaries: List[int] = [e.page_number for e in toc_entries[1:]] + [len(segments)]
    mapping: Dict[str, str] = {}
    for entry, stop in zip(toc_entries, boundaries):
        pieces: List[str] = []
        for idx in range(entry.page_number, stop):
            if idx < len(segments):
                # Strip the remainder of the page marker ("N -->") and keep
                # only the page body.
                pieces.append(segments[idx].split("-->", 1)[-1].strip())
        mapping[entry.title] = "\n\n".join(pieces)
    return mapping

View File

@@ -0,0 +1,303 @@
"""TOC extraction via Markdown indentation analysis.
This module provides a **cost-free** TOC extraction strategy that works on
already-generated Markdown text. Unlike the OCR annotation approach, this
method doesn't require additional API calls.
Strategy:
1. Search for "Table des matières" heading in the first N lines
2. Parse lines matching pattern: "Title.....Page" or "Title Page"
3. Detect hierarchy from leading whitespace (indentation)
4. Build nested TOC structure using stack-based algorithm
When to Use:
- When OCR has already been performed (markdown available)
- When cost optimization is critical (no additional API calls)
- For documents with clear indentation in the TOC
Limitations:
- Requires French "Table des matières" header (can be extended)
- Indentation detection may be less accurate than visual/bbox analysis
- Only works if OCR preserved whitespace accurately
Indentation Levels:
- 0-2 spaces: Level 1 (main chapters/parts)
- 3-6 spaces: Level 2 (sections)
- 7+ spaces: Level 3 (subsections)
Output Structure:
{
"success": bool,
"toc": [...], # Hierarchical TOC
"toc_flat": [...], # Flat entries with levels
"cost_ocr_annotated": 0.0, # No additional cost
"method": "markdown_indentation"
}
Example:
>>> from utils.toc_extractor_markdown import extract_toc_from_markdown
>>>
>>> markdown = '''
... # Table des matières
... Introduction.............................5
... Première partie..........................10
... Chapitre 1............................15
... Chapitre 2............................25
... Deuxième partie..........................50
... '''
>>> result = extract_toc_from_markdown(markdown)
>>> if result["success"]:
... print(f"Found {len(result['toc_flat'])} entries")
Found 5 entries
Functions:
- extract_toc_from_markdown(): Main extraction from markdown text
- build_hierarchy(): Converts flat entries to nested structure
See Also:
- utils.toc_extractor: Main entry point (routes to visual by default)
- utils.toc_extractor_visual: More accurate X-position based extraction
"""
from __future__ import annotations
import logging
import re
from typing import Any, Dict, List, Optional, TypedDict, Union
from pathlib import Path
logger = logging.getLogger(__name__)
# Type definitions for internal data structures
class MarkdownTOCEntryRaw(TypedDict):
    """Raw TOC entry extracted from markdown with indentation info.

    Produced by extract_toc_from_markdown() and consumed by build_hierarchy().
    """
    title: str  # Entry title, without leader dots or the trailing page number
    page_number: int  # Printed book page number parsed from the end of the line
    level: int  # Hierarchy level derived from indentation (1 = top level)
    leading_spaces: int  # Raw count of leading spaces on the original line
class MarkdownTOCNode(TypedDict):
    """Hierarchical TOC node with children.

    Recursive structure built by build_hierarchy(); each node nests its
    descendants under ``children``.
    """
    title: str  # Entry title
    page: int  # Printed book page number
    level: int  # Hierarchy level (1 = top level)
    type: str  # Always "section" for markdown-derived entries
    children: List[MarkdownTOCNode]  # Nested sub-entries (empty for leaves)
class MarkdownTOCFlatEntry(TypedDict):
    """Flat TOC entry with parent information."""
    title: str  # Entry title
    page_number: int  # Printed book page number
    level: int  # Hierarchy level (1 = top level)
    entry_type: str  # Always "section" for markdown-derived entries
    parent_title: Optional[str]  # Always None here; hierarchy lives in the nested `toc`
class MarkdownTOCResultSuccess(TypedDict):
    """Successful TOC extraction result."""
    success: bool  # Always True
    metadata: Dict[str, Any]  # Always empty dict here (parity with other extractors)
    toc: List[MarkdownTOCNode]  # Hierarchical (nested) TOC
    toc_flat: List[MarkdownTOCFlatEntry]  # Flat entries with levels
    cost_ocr_annotated: float  # Always 0.0 — reuses existing OCR, no extra API call
    method: str  # Always "markdown_indentation"
class MarkdownTOCResultError(TypedDict):
    """Failed TOC extraction result."""
    success: bool  # Always False
    error: str  # Human-readable failure reason (French, mirrors the log messages)
# Union type for function return; discriminate on the `success` key.
MarkdownTOCResult = Union[MarkdownTOCResultSuccess, MarkdownTOCResultError]
def extract_toc_from_markdown(
    markdown_text: str,
    max_lines: int = 200,
) -> MarkdownTOCResult:
    """Extract table of contents by analyzing raw markdown text.

    Detects hierarchy by counting leading spaces (indentation) at the
    beginning of each line. This is a cost-free alternative to OCR
    annotation-based extraction.

    Args:
        markdown_text: Complete markdown text of the document.
        max_lines: Maximum number of lines to analyze (searches TOC at start).

    Returns:
        Dictionary with hierarchical TOC structure. On success, includes:
            - success: True
            - metadata: Empty dict (for consistency with other extractors)
            - toc: Hierarchical nested TOC structure
            - toc_flat: Flat list of entries with levels
            - cost_ocr_annotated: 0.0 (no additional cost)
            - method: "markdown_indentation"
        On failure, includes:
            - success: False
            - error: Error message string

    Example:
        >>> markdown = '''
        ... # Table des matières
        ... Introduction.....5
        ... Part One........10
        ... Chapter 1.....15
        ... '''
        >>> result = extract_toc_from_markdown(markdown)
        >>> if result["success"]:
        ...     print(len(result["toc_flat"]))
        3
    """
    logger.info("Extraction TOC depuis markdown (analyse indentation)")
    lines: List[str] = markdown_text.split('\n')[:max_lines]
    # Find "Table des matières" section header
    toc_start: Optional[int] = None
    for i, line in enumerate(lines):
        if re.search(r'table\s+des\s+mati[èe]res', line, re.IGNORECASE):
            toc_start = i + 1
            logger.info(f"TOC trouvée à la ligne {i}")
            break
    if toc_start is None:
        logger.warning("Aucune table des matières trouvée dans le markdown")
        return MarkdownTOCResultError(
            success=False,
            error="Table des matières introuvable"
        )
    # Compile the two line patterns once (they run on up to 100 lines):
    # "Title....12" with dotted leaders first, then plain "Title 12" fallback.
    dotted_re: re.Pattern[str] = re.compile(r'^(.+?)\s*\.{2,}\s*(\d+)\s*$')
    plain_re: re.Pattern[str] = re.compile(r'^(.+?)\s+(\d+)\s*$')
    # Extract TOC entries
    entries: List[MarkdownTOCEntryRaw] = []
    for line in lines[toc_start:toc_start + 100]:  # Max 100 lines of TOC
        line_stripped: str = line.strip()
        if not line_stripped or line_stripped.startswith(('#', '---')):
            continue
        # `line` still carries its original leading whitespace, so the
        # indentation is measured on it directly. (The previous code
        # re-located the line via lines.index(line) — an O(n) scan per line
        # that could only ever resolve to an equal string anyway.)
        leading_spaces: int = len(line) - len(line.lstrip())
        match: Optional[re.Match[str]] = dotted_re.match(line_stripped)
        if not match:
            # Try without dotted leaders
            match = plain_re.match(line_stripped)
        if match:
            title: str = match.group(1).strip()
            page: int = int(match.group(2))
            # Ignore lines too short or that don't look like titles
            if len(title) < 3 or title.isdigit():
                continue
            # Determine level from indentation:
            # 0-2 spaces = level 1, 3-6 spaces = level 2, 7+ spaces = level 3
            level: int
            if leading_spaces <= 2:
                level = 1
            elif leading_spaces <= 6:
                level = 2
            else:
                level = 3
            entries.append(MarkdownTOCEntryRaw(
                title=title,
                page_number=page,
                level=level,
                leading_spaces=leading_spaces,
            ))
            logger.debug(f"  '{title}' → {leading_spaces} espaces → level {level} (page {page})")
    if not entries:
        logger.warning("Aucune entrée TOC extraite")
        return MarkdownTOCResultError(
            success=False,
            error="Aucune entrée TOC trouvée"
        )
    logger.info(f"{len(entries)} entrées extraites depuis markdown")
    # Build hierarchy
    toc: List[MarkdownTOCNode] = build_hierarchy(entries)
    return MarkdownTOCResultSuccess(
        success=True,
        metadata={},
        toc=toc,
        toc_flat=[
            MarkdownTOCFlatEntry(
                title=e["title"],
                page_number=e["page_number"],
                level=e["level"],
                entry_type="section",
                parent_title=None,
            )
            for e in entries
        ],
        cost_ocr_annotated=0.0,  # No additional cost, uses existing OCR
        method="markdown_indentation",
    )
def build_hierarchy(entries: List[MarkdownTOCEntryRaw]) -> List[MarkdownTOCNode]:
    """Convert flat, level-annotated TOC entries into a nested tree.

    Walks the entries in order while keeping a stack of the currently open
    ancestors. An entry becomes a child of the nearest preceding entry with a
    strictly smaller level, or a root node when no such ancestor exists.

    Args:
        entries: Flat TOC entries carrying title, page_number and level.

    Returns:
        List of root nodes; each node nests its descendants under "children".

    Example:
        >>> entries = [
        ...     {"title": "Part 1", "page_number": 1, "level": 1, "leading_spaces": 0},
        ...     {"title": "Chapter 1", "page_number": 5, "level": 2, "leading_spaces": 4},
        ... ]
        >>> build_hierarchy(entries)[0]["children"][0]["title"]
        'Chapter 1'
    """
    roots: List[MarkdownTOCNode] = []
    ancestors: List[MarkdownTOCNode] = []  # Chain of currently open nodes
    for raw in entries:
        node: MarkdownTOCNode = MarkdownTOCNode(
            title=raw["title"],
            page=raw["page_number"],
            level=raw["level"],
            type="section",
            children=[],
        )
        # Close every open ancestor that is at the same depth or deeper.
        while ancestors and ancestors[-1]["level"] >= raw["level"]:
            ancestors.pop()
        # Attach under the innermost remaining ancestor, or at the root.
        (ancestors[-1]["children"] if ancestors else roots).append(node)
        ancestors.append(node)
    return roots

View File

@@ -0,0 +1,512 @@
"""Visual TOC extraction using bounding box X-coordinate analysis.
This module provides the **most accurate** TOC extraction strategy for
philosophical texts by analyzing the horizontal position (X-coordinate)
of each TOC entry. This approach is more reliable than text indentation
analysis because it directly measures visual layout.
How It Works:
1. OCR with annotations extracts text + bounding box positions
2. Pydantic schema (TocEntryBbox) captures title, page, and x_position
3. X-coordinates are clustered to identify distinct indentation levels
4. Hierarchy is built based on relative X-positions
X-Position Interpretation:
The x_position is normalized between 0.0 (left edge) and 1.0 (right edge):
- x ≈ 0.05-0.12: Level 1 (no indentation, main parts/chapters)
- x ≈ 0.13-0.22: Level 2 (small indentation, sections)
- x ≈ 0.23-0.35: Level 3 (double indentation, subsections)
Positions within 0.03 tolerance are grouped into the same level.
Advantages over Markdown Analysis:
- Works regardless of OCR whitespace accuracy
- More reliable for complex hierarchies
- Handles both printed and handwritten indentation
Cost:
- Uses OCR with annotations: ~0.003€/page
- Only processes first N pages (default: 8)
Pydantic Schemas:
- TocEntryBbox: Single TOC entry with text, page_number, x_position
- DocumentTocBbox: Container for list of entries
Output Structure:
{
"success": bool,
"metadata": {...},
"toc": [...], # Hierarchical TOC
"toc_flat": [...], # Flat entries with levels
"cost_ocr_annotated": float,
"method": "visual_x_position"
}
Example:
>>> from pathlib import Path
>>> from utils.toc_extractor_visual import extract_toc_with_visual_analysis
>>>
>>> result = extract_toc_with_visual_analysis(
... pdf_path=Path("input/philosophy_book.pdf"),
... max_toc_pages=8
... )
>>> if result["success"]:
... for entry in result["toc"]:
... indent = " " * (entry["level"] - 1)
... print(f"{indent}{entry['title']} (p.{entry['page']})")
Algorithm Details:
1. Collect all x_position values from OCR response
2. Sort and cluster positions (tolerance: 0.03)
3. Compute cluster centroids as level thresholds
4. Assign level to each entry based on nearest centroid
5. Build hierarchy using stack-based approach
Functions:
- extract_toc_with_visual_analysis(): Main extraction function
- build_hierarchy_from_bbox(): Converts entries with X-positions to hierarchy
- flatten_toc(): Flattens hierarchical TOC for storage
See Also:
- utils.toc_extractor: Main entry point (routes here by default)
- utils.toc_extractor_markdown: Alternative cost-free extraction
"""
from __future__ import annotations
import json
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Type, TypedDict, Union
from pydantic import BaseModel, Field
from .mistral_client import create_client
from .ocr_processor import run_ocr_with_annotations
logger: logging.Logger = logging.getLogger(__name__)
class TocEntryBbox(BaseModel):
    """TOC entry with bounding box for visual detection.

    Attributes:
        text: Complete entry text as it appears in the table of contents.
            Example: 'Presentation' or 'What is virtue?' or 'Meno or on virtue'.
            DO NOT include leader dots or page number in this field.
        page_number: Actual page number as printed in the book (the visible number
            on the right in the TOC). Example: if the line says 'Presentation.....3',
            extract the number 3. This is the BOOK page number, not the PDF index.
        x_position: Horizontal position (X coordinate) of the text start, normalized
            between 0 and 1. This is the CRUCIAL COORDINATE for detecting indentation:
            - x ≈ 0.05-0.12 = left-aligned title, NOT indented (hierarchical level 1)
            - x ≈ 0.13-0.22 = title with SMALL indentation (hierarchical level 2)
            - x ≈ 0.23-0.35 = title with DOUBLE indentation (hierarchical level 3)
            Measure precisely where the first character of the title begins.
    """
    # NOTE(review): the French Field descriptions below are serialized into the
    # JSON schema handed to the OCR annotation API (this model is passed as
    # document_annotation_format), so they act as extraction instructions.
    # Presumably they must stay in French and in sync with the docstring above
    # — confirm against the API behavior before rewording them.
    text: str = Field(..., description="""Texte COMPLET de l'entrée tel qu'il apparaît dans la table des matières.
Exemple: 'Présentation' ou 'Qu'est-ce que la vertu ?' ou 'Ménon ou de la vertu'.
NE PAS inclure les points de suite ni le numéro de page dans ce champ.""")
    page_number: int = Field(..., description="""Numéro de page réel tel qu'imprimé dans le livre (le numéro visible à droite dans la TOC).
Exemple: si la ligne dit 'Présentation.....3', extraire le nombre 3.
C'est le numéro de page du LIVRE, pas l'index PDF.""")
    x_position: float = Field(..., description="""Position horizontale (coordonnée X) du début du texte, normalisée entre 0 et 1.
C'est LA COORDONNÉE CRUCIALE pour détecter l'indentation:
- x ≈ 0.05-0.12 = titre aligné à gauche, NON indenté (niveau hiérarchique 1)
- x ≈ 0.13-0.22 = titre avec PETITE indentation (niveau hiérarchique 2)
- x ≈ 0.23-0.35 = titre avec DOUBLE indentation (niveau hiérarchique 3)
Mesurer précisément où commence le premier caractère du titre.""")
class DocumentTocBbox(BaseModel):
    """Schema for extracting all TOC entries with their positions.

    Attributes:
        entries: Complete list of ALL entries found in the table of contents.
            For EACH line in the TOC, extract:
            1. The title text (without leader dots)
            2. The page number (the number on the right)
            3. The exact horizontal X position of the title start (to detect indentation)
            Include ALL entries, even those that appear to be at the same visual level.
    """
    # NOTE(review): the Field description is part of the JSON schema sent to the
    # OCR annotation API and guides extraction — keep it intact.
    entries: List[TocEntryBbox] = Field(
        ...,
        description="""Complete list of ALL entries found in the table of contents.
For EACH line in the TOC, extract:
1. The title text (without leader dots)
2. The page number (the number on the right)
3. The exact horizontal X position of the title start (to detect indentation)
Include ALL entries, even those that appear to be at the same visual level.""",
    )
# TypedDict classes for structured return types
class VisualTOCMetadata(TypedDict):
    """Metadata extracted from the document.

    Attributes:
        title: Document title.
        author: Document author.
        languages: List of languages present in the document.
        summary: Brief document summary.
    """
    title: str  # Falls back to the PDF filename stem in visual mode
    author: str  # "Unknown author" in visual mode (no metadata pass is run)
    languages: List[str]  # Empty in visual mode
    summary: str  # Empty in visual mode
class VisualTOCNode(TypedDict):
    """Hierarchical TOC node.

    Attributes:
        title: Entry title text.
        page: Page number in the book.
        level: Hierarchical level (1 = top level, 2 = subsection, etc.).
        type: Entry type (e.g., "section", "chapter").
        children: List of child nodes.
    """
    title: str
    page: int  # Printed book page number
    level: int  # 1 = top level
    type: str  # Always "section" when built by build_hierarchy_from_bbox
    children: List[VisualTOCNode]  # Recursive: nested VisualTOCNode dicts (empty for leaves)
class VisualTOCFlatEntry(TypedDict):
    """Flattened TOC entry for storage.

    Attributes:
        title: Entry title text.
        page_number: Page number in the book.
        level: Hierarchical level.
        entry_type: Entry type (e.g., "section", "chapter").
        parent_title: Title of the parent entry, if any.
    """
    title: str
    page_number: int  # Printed book page number
    level: int  # 1 = top level
    entry_type: str  # Mirrors VisualTOCNode["type"]
    parent_title: Optional[str]  # None for top-level entries (see flatten_toc)
class VisualTOCResultSuccess(TypedDict):
    """Successful TOC extraction result.

    Attributes:
        success: Always True for success case.
        metadata: Document metadata.
        toc: Hierarchical TOC structure.
        toc_flat: Flattened TOC entries.
        cost_ocr_annotated: OCR processing cost in euros.
        method: Extraction method identifier.
    """
    success: bool  # Always True in this variant
    metadata: VisualTOCMetadata
    toc: List[VisualTOCNode]
    toc_flat: List[VisualTOCFlatEntry]
    cost_ocr_annotated: float  # Computed as max_toc_pages * 0.003 (euros)
    method: str  # Always "visual_x_position"
class VisualTOCResultError(TypedDict):
    """Failed TOC extraction result.

    Attributes:
        success: Always False for error case.
        error: Error message describing the failure.
    """
    success: bool  # Always False in this variant
    error: str  # Human-readable failure description
# Union type for the function return; discriminate on the `success` key.
VisualTOCResult = Union[VisualTOCResultSuccess, VisualTOCResultError]
class VisualTOCEntryInternal(TypedDict):
    """Internal representation of TOC entry during processing.

    Attributes:
        text: Entry title text.
        page_number: Page number in the book.
        x_position: Normalized X position (0.0 to 1.0).
        x_start: Same as x_position (for processing).
        page: Same as page_number (for processing).
        level: Computed hierarchical level.
    """
    text: str
    page_number: int
    x_position: float  # Normalized: 0.0 = left edge, 1.0 = right edge
    x_start: float  # Duplicate of x_position used during clustering
    page: int  # Duplicate of page_number used during tree building
    level: int  # Filled in by build_hierarchy_from_bbox after clustering
def extract_toc_with_visual_analysis(
    pdf_path: Path,
    api_key: Optional[str] = None,
    max_toc_pages: int = 8,
) -> VisualTOCResult:
    """Extract the TOC by analyzing the bounding boxes of its entries.

    Runs OCR with structured annotations over the first ``max_toc_pages``
    pages, asking the model for each TOC line's text, printed page number and
    normalized X coordinate, then derives the hierarchy from those X
    positions via build_hierarchy_from_bbox.

    Args:
        pdf_path: Path to the PDF file.
        api_key: Mistral API key; falls back to the environment when omitted.
        max_toc_pages: Number of leading pages to analyze (default: 8).

    Returns:
        On success: metadata, hierarchical TOC, flat TOC, cost and method.
        On failure: ``{"success": False, "error": ...}``. Never raises;
        all errors are reported through the result dictionary.
    """
    try:
        client = create_client(api_key)
        pdf_bytes: bytes = pdf_path.read_bytes()
    except Exception as e:
        logger.error(f"Initialization error: {e}")
        return {"success": False, "error": str(e)}
    logger.info(f"Visual TOC extraction on {max_toc_pages} pages")
    # Request document-level annotations so the model returns TocEntryBbox data
    try:
        ocr_response = run_ocr_with_annotations(
            client=client,
            file_bytes=pdf_bytes,
            filename=pdf_path.name,
            include_images=False,
            document_annotation_format=DocumentTocBbox,
            pages=list(range(max_toc_pages)),
        )
    except Exception as e:
        logger.error(f"OCR with annotations error: {e}")
        return {"success": False, "error": f"OCR failed: {str(e)}"}
    raw_annotation: Any = getattr(ocr_response, "document_annotation", None)
    if not raw_annotation:
        return {"success": False, "error": "No annotation returned"}
    try:
        # The SDK may hand back either a JSON string or an already-parsed object
        parsed: Any = (
            json.loads(raw_annotation)
            if isinstance(raw_annotation, str)
            else raw_annotation
        )
        entry_dicts: List[Dict[str, Any]] = (
            parsed.get("entries", []) if isinstance(parsed, dict) else parsed
        )
        # Derive the hierarchy from the X coordinates
        hierarchy: List[VisualTOCNode] = build_hierarchy_from_bbox(entry_dicts)
        logger.info(f"TOC extracted visually: {len(hierarchy)} entries")
        # Visual mode carries no enriched metadata; fill in neutral defaults
        metadata: VisualTOCMetadata = {
            "title": pdf_path.stem,
            "author": "Unknown author",
            "languages": [],
            "summary": "",
        }
        return {
            "success": True,
            "metadata": metadata,
            "toc": hierarchy,
            "toc_flat": flatten_toc(hierarchy),
            "cost_ocr_annotated": max_toc_pages * 0.003,
            "method": "visual_x_position",
        }
    except Exception as e:
        logger.error(f"Bbox parsing error: {e}")
        return {"success": False, "error": f"Parsing failed: {str(e)}"}
def build_hierarchy_from_bbox(entries: List[Dict[str, Any]]) -> List[VisualTOCNode]:
    """Build TOC hierarchy from X positions (indentation).

    Clusters the normalized X coordinates of the entries (tolerance 0.03) to
    identify distinct indentation levels, assigns each entry the level of its
    nearest cluster centroid, then builds the tree with a stack.

    Unlike earlier versions, the caller's dicts are NOT mutated: entries are
    copied into internal records before any level assignment.

    Args:
        entries: List of entries where each dict should have:
            - text: Entry title
            - page_number: Page number
            - x_position: Normalized X coordinate (0.0 to 1.0)

    Returns:
        Hierarchical TOC structure. Each node contains title, page, level,
        type (always "section") and children.

    Example:
        >>> entries = [
        ...     {"text": "Chapter 1", "page_number": 1, "x_position": 0.1},
        ...     {"text": "Section 1.1", "page_number": 2, "x_position": 0.2},
        ... ]
        >>> build_hierarchy_from_bbox(entries)[0]["children"][0]["title"]
        'Section 1.1'
    """
    log: logging.Logger = logging.getLogger(__name__)
    if not entries:
        return []
    # Copy into internal records instead of writing x_start/page/level back
    # into the caller's dicts — the previous in-place mutation was a
    # surprising side effect for callers reusing their input.
    records: List[VisualTOCEntryInternal] = []
    for raw in entries:
        x_start: float = raw.get("x_position", 0.1)
        records.append({
            "text": raw.get("text", ""),
            "page_number": raw.get("page_number", 0),
            "x_position": x_start,
            "x_start": x_start,
            "page": raw.get("page_number", 0),
            "level": 0,  # Assigned below once the clusters are known
        })
    # Unique X positions, sorted, to detect the indentation thresholds
    x_positions: List[float] = sorted({r["x_start"] for r in records})
    if not x_positions:
        log.warning("No X position detected")
        return []
    # Group nearby positions (tolerance 0.03 to normalize small variations)
    # and keep each group's centroid as a level threshold.
    x_levels: List[float] = []
    group: List[float] = [x_positions[0]]
    for x in x_positions[1:]:
        if x - group[-1] < 0.03:
            group.append(x)
        else:
            x_levels.append(sum(group) / len(group))
            group = [x]
    if group:
        x_levels.append(sum(group) / len(group))
    log.info(
        f"Indentation levels detected (X positions): {[f'{x:.3f}' for x in x_levels]}"
    )
    # Assign each record the 1-based index of its nearest centroid
    for rec in records:
        x_val: float = rec["x_start"]
        level: int = min(range(len(x_levels)), key=lambda i: abs(x_levels[i] - x_val)) + 1
        rec["level"] = level
        log.debug(f"  '{rec['text']}' -> X={x_val:.3f} -> level {level}")
    # Stack-based tree construction: pop until the top is strictly shallower
    toc: List[VisualTOCNode] = []
    stack: List[VisualTOCNode] = []
    for rec in records:
        node: VisualTOCNode = {
            "title": rec["text"].strip(),
            "page": rec["page"],
            "level": rec["level"],
            "type": "section",
            "children": [],
        }
        while stack and stack[-1]["level"] >= node["level"]:
            stack.pop()
        if stack:
            stack[-1]["children"].append(node)
        else:
            toc.append(node)
        stack.append(node)
    return toc
def flatten_toc(toc: List[VisualTOCNode]) -> List[VisualTOCFlatEntry]:
    """Flatten a hierarchical TOC into a list in pre-order.

    Each entry keeps a reference to its parent through ``parent_title``
    (None for top-level entries). Uses an explicit stack instead of
    recursion; the traversal order matches a depth-first, parent-first walk.

    Args:
        toc: Hierarchical TOC structure (list of VisualTOCNode).

    Returns:
        Flat list of TOC entries with parent references.

    Example:
        >>> toc = [{
        ...     "title": "Chapter 1", "page": 1, "level": 1, "type": "section",
        ...     "children": [{
        ...         "title": "Section 1.1", "page": 2, "level": 2,
        ...         "type": "section", "children": []
        ...     }]
        ... }]
        >>> flat = flatten_toc(toc)
        >>> len(flat)
        2
        >>> flat[1]["parent_title"]
        'Chapter 1'
    """
    flat: List[VisualTOCFlatEntry] = []
    # Seed the stack with root nodes (reversed so pop() yields original order)
    pending: List[Any] = [(node, None) for node in reversed(toc)]
    while pending:
        node, parent_title = pending.pop()
        flat.append({
            "title": node["title"],
            "page_number": node["page"],
            "level": node["level"],
            "entry_type": node["type"],
            "parent_title": parent_title,
        })
        # Push children reversed so they are emitted left-to-right
        for child in reversed(node.get("children") or []):
            pending.append((child, node["title"]))
    return flat

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,815 @@
"""Weaviate document ingestion module for the Library RAG pipeline.
This module handles the ingestion of processed documents (chunks, metadata,
summaries) into the Weaviate vector database. It supports the V3.0 schema
with nested objects for efficient semantic search.
Architecture:
The module uses four Weaviate collections:
- **Work**: Represents a literary/philosophical work (title, author, year)
- **Document**: A specific edition/version of a work (sourceId, pages, TOC)
- **Chunk**: Text chunks with vectorized content for semantic search
- **Summary**: Section summaries with vectorized concepts
Chunks and Summaries use nested objects to reference their parent
Work and Document, avoiding data duplication while enabling
efficient filtering.
Batch Operations:
The module uses Weaviate insert_many() for efficient batch insertion.
Chunks are prepared as a list and inserted in a single operation,
which is significantly faster than individual insertions.
Nested Objects:
Each Chunk contains nested work and document objects::
{
"text": "La justice est une vertu...",
"work": {"title": "La Republique", "author": "Platon"},
"document": {"sourceId": "platon_republique", "edition": "GF"}
}
This enables filtering like: document.sourceId == "platon_republique"
Typical Usage:
>>> from utils.weaviate_ingest import ingest_document, delete_document_chunks
>>>
>>> # Ingest a processed document
>>> result = ingest_document(
... doc_name="platon_republique",
... chunks=[{"text": "La justice est...", "section": "Livre I"}],
... metadata={"title": "La Republique", "author": "Platon"},
... language="fr",
... )
>>> print(f"Ingested {result['count']} chunks")
Connection:
The module connects to a local Weaviate instance using:
- HTTP port: 8080
- gRPC port: 50051
Ensure Weaviate is running via: docker-compose up -d
See Also:
- schema.py: Weaviate schema definitions
- pdf_pipeline.py: Document processing pipeline
- flask_app.py: Web interface for search
"""
from __future__ import annotations
import json
import logging
from contextlib import contextmanager
from datetime import datetime, timezone
from typing import Any, Dict, Generator, List, Optional, TypedDict
import weaviate
from weaviate import WeaviateClient
from weaviate.collections import Collection
import weaviate.classes.query as wvq
# Import type definitions from central types module
from utils.types import WeaviateIngestResult as IngestResult
# Import TOC enrichment functions
from .toc_enricher import enrich_chunks_with_toc
# =============================================================================
# Type Definitions (module-specific, not exported to utils.types)
# =============================================================================
class SummaryObject(TypedDict):
    """Weaviate Summary object structure for section summaries.

    This TypedDict defines the structure of Summary objects stored in Weaviate.
    Summaries are vectorized and can be searched semantically.

    Attributes:
        sectionPath: Full hierarchical path (e.g., "Livre I > Chapitre 2").
        title: Section title.
        level: Hierarchy level (1 = top level, 2 = subsection, etc.).
        text: Summary text content (vectorized for search).
        concepts: List of key concepts extracted from the section.
        chunksCount: Number of chunks in this section.
        document: Nested object with document reference (sourceId).
    """
    sectionPath: str  # e.g. "Livre I > Chapitre 2"
    title: str
    level: int  # 1 = top level
    text: str  # Vectorized for semantic search
    concepts: List[str]
    chunksCount: int
    document: Dict[str, str]  # Nested document reference, e.g. {"sourceId": ...}
class ChunkObject(TypedDict, total=False):
    """Weaviate Chunk object structure for text chunks.

    This TypedDict defines the structure of Chunk objects stored in Weaviate.
    The text and keywords fields are vectorized for semantic search.

    Attributes:
        text: Chunk text content (vectorized for search).
        sectionPath: Full hierarchical path (e.g., "Livre I > Chapitre 2").
        sectionLevel: Hierarchy level (1 = top level).
        chapterTitle: Title of the containing chapter.
        canonicalReference: Canonical academic reference (e.g., "CP 1.628", "Ménon 80a").
        unitType: Type of argumentative unit (main_content, exposition, etc.).
        keywords: List of keywords/concepts (vectorized for search).
        language: Language code (e.g., "fr", "en").
        orderIndex: Position in document for ordering.
        work: Nested object with work metadata (title, author).
        document: Nested object with document reference (sourceId, edition).

    Note:
        Uses total=False because some fields are optional during creation.
    """
    text: str  # Vectorized for semantic search
    sectionPath: str  # e.g. "Livre I > Chapitre 2"
    sectionLevel: int  # 1 = top level
    chapterTitle: str
    canonicalReference: str  # e.g. "CP 1.628", "Ménon 80a"
    unitType: str  # e.g. main_content, exposition, ...
    keywords: List[str]  # Vectorized for semantic search
    language: str  # e.g. "fr", "en"
    orderIndex: int  # Position within the document, used for ordering
    work: Dict[str, str]  # Nested work metadata (title, author)
    document: Dict[str, str]  # Nested document reference (sourceId, edition)
class InsertedChunkSummary(TypedDict):
    """Summary of an inserted chunk for display purposes.

    This TypedDict provides a preview of inserted chunks, useful for
    displaying ingestion results to users.

    Attributes:
        chunk_id: Generated chunk identifier.
        sectionPath: Hierarchical path of the chunk.
        work: Title of the work.
        author: Author name.
        text_preview: First 150 characters of chunk text.
        unitType: Type of argumentative unit.
    """
    chunk_id: str  # Generated chunk identifier
    sectionPath: str
    work: str  # Work title (not the nested work object)
    author: str
    text_preview: str  # First 150 characters of the chunk text
    unitType: str
# Note: IngestResult is imported from utils.types as WeaviateIngestResult
class DeleteResult(TypedDict, total=False):
    """Result from document deletion operation.

    This TypedDict contains the result of a deletion operation,
    including counts of deleted objects from each collection.

    Attributes:
        success: Whether deletion succeeded.
        error: Error message if deletion failed.
        deleted_chunks: Number of chunks deleted from Chunk collection.
        deleted_summaries: Number of summaries deleted from Summary collection.
        deleted_document: Whether the Document object was deleted.

    Example:
        >>> result = delete_document_chunks("platon_republique")
        >>> print(f"Deleted {result['deleted_chunks']} chunks")
    """
    success: bool
    error: str  # Present only on failure (total=False allows omission)
    deleted_chunks: int
    deleted_summaries: int
    deleted_document: bool  # True if the Document object itself was removed
class DocumentStats(TypedDict, total=False):
    """Document statistics from Weaviate.

    This TypedDict contains statistics about a document stored in Weaviate,
    retrieved by querying the Chunk collection.

    Attributes:
        success: Whether stats retrieval succeeded.
        error: Error message if retrieval failed.
        sourceId: Document identifier.
        chunks_count: Total number of chunks for this document.
        work: Title of the work (from first chunk).
        author: Author name (from first chunk).

    Example:
        >>> stats = get_document_stats("platon_republique")
        >>> print(f"Document has {stats['chunks_count']} chunks")
    """
    success: bool
    error: str  # Present only on failure (total=False allows omission)
    sourceId: str  # Document identifier
    chunks_count: int
    work: Optional[str]  # Taken from the first chunk, if any
    author: Optional[str]  # Taken from the first chunk, if any
# Logger
logger: logging.Logger = logging.getLogger(__name__)
@contextmanager
def get_weaviate_client() -> Generator[Optional[WeaviateClient], None, None]:
    """Context manager for a Weaviate connection with automatic cleanup.

    Connects to the local Weaviate instance and guarantees the client is
    closed when the context exits. Connection errors are handled
    gracefully by yielding None instead of raising. Exceptions raised by
    the caller's ``with`` body, however, propagate unchanged — they are
    not swallowed — and the connection is still closed.

    Yields:
        Connected WeaviateClient instance, or None if connection failed.

    Example:
        >>> with get_weaviate_client() as client:
        ...     if client is not None:
        ...         chunks = client.collections.get("Chunk")
        ...         # Perform operations...
        ...     else:
        ...         print("Connection failed")

    Note:
        Connects to localhost:8080 (HTTP) and localhost:50051 (gRPC).
        Ensure Weaviate is running via docker-compose up -d.
    """
    from weaviate.classes.init import AdditionalConfig, Timeout
    client: Optional[WeaviateClient] = None
    try:
        # Increased timeout for long text vectorization (e.g., Peirce CP 3.403,
        # CP 8.388, Menon chunk 10). Default is 60s; raised to 600s (10 min)
        # for exceptionally large texts.
        client = weaviate.connect_to_local(
            host="localhost",
            port=8080,
            grpc_port=50051,
            additional_config=AdditionalConfig(
                timeout=Timeout(init=30, query=600, insert=600)  # 10 min for insert/query
            )
        )
    except Exception as e:
        # Only connection setup is guarded. The previous version also wrapped
        # the yield in this except, so an exception raised inside the caller's
        # `with` body was thrown back into the generator, caught here, and
        # answered with a second `yield None` — contextlib then masked the
        # real error with RuntimeError("generator didn't stop after throw()").
        logger.error(f"Erreur connexion Weaviate: {e}")
        yield None
        return
    try:
        yield client
    finally:
        client.close()
def ingest_document_metadata(
    client: WeaviateClient,
    doc_name: str,
    metadata: Dict[str, Any],
    toc: List[Dict[str, Any]],
    hierarchy: Dict[str, Any],
    chunks_count: int,
    pages: int,
) -> Optional[str]:
    """Store document-level metadata in the Document collection.

    Builds a single Document object — title, author, JSON-serialized TOC
    and hierarchy, page/chunk counters, language and a UTC creation
    timestamp — and inserts it.

    Args:
        client: Active Weaviate client connection.
        doc_name: Unique document identifier (sourceId).
        metadata: Extracted metadata; the "title", "author" and
            "language" keys are read, with fallbacks when absent.
        toc: Hierarchical table of contents (stored as a JSON string).
        hierarchy: Complete document hierarchy (stored as a JSON string).
        chunks_count: Total number of chunks in the document.
        pages: Number of pages in the source PDF.

    Returns:
        UUID of the created Document object as a string, or None when
        the collection is missing or the insertion fails.
    """
    try:
        collection: Collection[Any, Any] = client.collections.get("Document")
    except Exception as e:
        logger.warning(f"Collection Document non trouvée: {e}")
        return None
    try:
        # Payload construction stays inside the guard: json.dumps can raise
        # on non-serializable TOC/hierarchy content.
        payload: Dict[str, Any] = {
            "sourceId": doc_name,
            "title": metadata.get("title") or doc_name,
            "author": metadata.get("author") or "Inconnu",
            "toc": json.dumps(toc, ensure_ascii=False) if toc else "[]",
            "hierarchy": json.dumps(hierarchy, ensure_ascii=False) if hierarchy else "{}",
            "pages": pages,
            "chunksCount": chunks_count,
            "language": metadata.get("language", "fr"),
            "createdAt": datetime.now(timezone.utc).isoformat(),
        }
        inserted_uuid = collection.data.insert(payload)
        logger.info(f"Document metadata ingéré: {doc_name}")
        return str(inserted_uuid)
    except Exception as e:
        logger.warning(f"Erreur ingestion document metadata: {e}")
        return None
def ingest_summaries(
    client: WeaviateClient,
    doc_name: str,
    toc: List[Dict[str, Any]],
    summaries_content: Dict[str, str],
) -> int:
    """Store one Summary object per table-of-contents entry.

    Walks the (possibly nested) TOC depth-first — each parent is emitted
    before its children — building a Summary object per entry. The
    summary text is looked up in ``summaries_content`` by section title;
    entries with no provided text fall back to the title itself.
    Objects are inserted in batches to avoid request timeouts.

    Args:
        client: Active Weaviate client connection.
        doc_name: Document identifier stored on each summary.
        toc: Hierarchical table of contents.
        summaries_content: Mapping of section title to summary text.

    Returns:
        Number of summaries actually inserted (a failed batch is logged
        and skipped, not retried).
    """
    try:
        summary_collection: Collection[Any, Any] = client.collections.get("Summary")
    except Exception as e:
        logger.warning(f"Collection Summary non trouvée: {e}")
        return 0
    pending: List[SummaryObject] = []

    def walk(entries: List[Dict[str, Any]], prefix: str = "") -> None:
        # Depth-first pre-order traversal of the TOC tree.
        for entry in entries:
            entry_title: str = entry.get("title", "")
            full_path: str = f"{prefix} > {entry_title}" if prefix else entry_title
            pending.append({
                "sectionPath": full_path,
                "title": entry_title,
                "level": entry.get("level", 1),
                "text": summaries_content.get(entry_title, entry_title),
                "concepts": entry.get("concepts", []),
                "chunksCount": 0,
                "document": {
                    "sourceId": doc_name,
                },
            })
            if "children" in entry:
                walk(entry["children"], full_path)

    walk(toc)
    if not pending:
        return 0
    # Insert in small batches to avoid vectorization timeouts.
    BATCH_SIZE = 50
    total_inserted = 0
    try:
        logger.info(f"Ingesting {len(pending)} summaries in batches of {BATCH_SIZE}...")
        for batch_number, start in enumerate(range(0, len(pending), BATCH_SIZE), start=1):
            batch = pending[start:start + BATCH_SIZE]
            try:
                summary_collection.data.insert_many(batch)
            except Exception as batch_error:
                logger.warning(f" Batch {batch_number} failed: {batch_error}")
                continue
            total_inserted += len(batch)
            logger.info(f" Batch {batch_number}: Inserted {len(batch)} summaries ({total_inserted}/{len(pending)})")
        logger.info(f"{total_inserted} résumés ingérés pour {doc_name}")
        return total_inserted
    except Exception as e:
        logger.warning(f"Erreur ingestion résumés: {e}")
        return 0
def ingest_document(
    doc_name: str,
    chunks: List[Dict[str, Any]],
    metadata: Dict[str, Any],
    language: str = "fr",
    toc: Optional[List[Dict[str, Any]]] = None,
    hierarchy: Optional[Dict[str, Any]] = None,
    pages: int = 0,
    ingest_document_collection: bool = True,
    ingest_summary_collection: bool = False,
) -> IngestResult:
    """Ingest document chunks into Weaviate with nested objects.

    Main ingestion function that inserts chunks into the Chunk collection
    with nested Work and Document references. Optionally also creates
    entries in the Document and Summary collections.

    This function uses batch insertion for optimal performance and
    constructs proper nested objects for filtering capabilities.

    Args:
        doc_name: Unique document identifier (used as sourceId).
        chunks: List of chunk dicts, each containing at minimum:
            - text: The chunk text content
            - section (optional): Section path string
            - hierarchy (optional): Dict with part/chapter/section
            - type (optional): Argumentative unit type
            - concepts/keywords (optional): List of keywords
        metadata: Document metadata dict with keys:
            - title: Work title
            - author: Author name
            - edition (optional): Edition identifier
        language: ISO language code. Defaults to "fr".
        toc: Optional table of contents for Document/Summary collections.
        hierarchy: Optional complete document hierarchy structure.
        pages: Number of pages in source document. Defaults to 0.
        ingest_document_collection: If True, also insert into Document
            collection. Defaults to True.
        ingest_summary_collection: If True, also insert into Summary
            collection (requires toc). Defaults to False.

    Returns:
        IngestResult dict containing:
            - success: True if ingestion succeeded
            - count: Number of chunks inserted
            - inserted: Preview of first 10 inserted chunks
            - work: Work title
            - author: Author name
            - document_uuid: UUID of Document object (if created)
            - all_objects: Complete list of inserted ChunkObjects
            - error: Error message (if failed)

    Raises:
        No exceptions are raised; errors are returned in the result dict.

    Example:
        >>> result = ingest_document(
        ...     doc_name="platon_republique",
        ...     chunks=[{"text": "La justice est...", "section": "Livre I"}],
        ...     metadata={"title": "La Republique", "author": "Platon"},
        ...     language="fr",
        ...     pages=450,
        ... )
        >>> if result["success"]:
        ...     print(f"Ingested {result['count']} chunks")

    Note:
        Empty chunks (no text or whitespace-only) are automatically skipped.
        The function logs progress and errors using the module logger.
    """
    try:
        with get_weaviate_client() as client:
            if client is None:
                return IngestResult(
                    success=False,
                    error="Connexion Weaviate impossible",
                    inserted=[],
                )
            # Fail fast if the Chunk collection is absent from the schema.
            try:
                chunk_collection: Collection[Any, Any] = client.collections.get("Chunk")
            except Exception as e:
                return IngestResult(
                    success=False,
                    error=f"Collection Chunk non trouvée: {e}",
                    inserted=[],
                )
            # Optionally store document-level metadata (Document collection).
            doc_uuid: Optional[str] = None
            if ingest_document_collection:
                doc_uuid = ingest_document_metadata(
                    client, doc_name, metadata, toc or [], hierarchy or {},
                    len(chunks), pages
                )
            # Optionally store section summaries (Summary collection).
            if ingest_summary_collection and toc:
                ingest_summaries(client, doc_name, toc, {})
            # Enrich chunks with TOC metadata when both TOC and hierarchy exist.
            if toc and hierarchy:
                logger.info(f"Enriching {len(chunks)} chunks with TOC metadata...")
                chunks = enrich_chunks_with_toc(chunks, toc, hierarchy)
            else:
                logger.info("No TOC/hierarchy available, using basic metadata")
            # Build the Chunk objects to insert, with nested Work/Document refs.
            objects_to_insert: List[ChunkObject] = []
            title: str = metadata.get("title") or metadata.get("work") or doc_name
            author: str = metadata.get("author") or "Inconnu"
            edition: str = metadata.get("edition", "")
            for idx, chunk in enumerate(chunks):
                # Skip chunks with empty or whitespace-only text.
                text: str = chunk.get("text", "")
                if not text or not text.strip():
                    continue
                # Prefer the enriched sectionPath; otherwise fall back to the
                # legacy "section" key, then rebuild from the hierarchy dict.
                section_path: str = chunk.get("sectionPath", "")
                if not section_path:
                    section_path = chunk.get("section", "")
                if not section_path:
                    chunk_hierarchy: Dict[str, Any] = chunk.get("hierarchy", {})
                    section_parts: List[str] = []
                    if chunk_hierarchy.get("part"):
                        section_parts.append(chunk_hierarchy["part"])
                    if chunk_hierarchy.get("chapter"):
                        section_parts.append(chunk_hierarchy["chapter"])
                    if chunk_hierarchy.get("section"):
                        section_parts.append(chunk_hierarchy["section"])
                    section_path = " > ".join(section_parts) if section_parts else chunk.get("title", f"Section {idx}")
                # Prefer the enriched chapter title (either key spelling).
                chapter_title: str = chunk.get("chapterTitle", chunk.get("chapter_title", ""))
                # Prefer the enriched canonical reference when available.
                canonical_ref: str = chunk.get("canonicalReference", "")
                # Assemble the Chunk object with nested work/document objects.
                chunk_obj: ChunkObject = {
                    "text": text,
                    "sectionPath": section_path,
                    "sectionLevel": chunk.get("section_level", chunk.get("level", 1)),
                    "chapterTitle": chapter_title,
                    "canonicalReference": canonical_ref,
                    "unitType": chunk.get("type", "main_content"),
                    "keywords": chunk.get("concepts", chunk.get("keywords", [])),
                    "language": language,
                    "orderIndex": idx,
                    "work": {
                        "title": title,
                        "author": author,
                    },
                    "document": {
                        "sourceId": doc_name,
                        "edition": edition,
                    },
                }
                objects_to_insert.append(chunk_obj)
            if not objects_to_insert:
                return IngestResult(
                    success=True,
                    message="Aucun chunk à insérer",
                    inserted=[],
                    count=0,
                )
            # Insert in small batches to avoid vectorization timeouts.
            BATCH_SIZE = 50  # Process 50 chunks at a time
            total_inserted = 0
            logger.info(f"Ingesting {len(objects_to_insert)} chunks in batches of {BATCH_SIZE}...")
            for batch_start in range(0, len(objects_to_insert), BATCH_SIZE):
                batch_end = min(batch_start + BATCH_SIZE, len(objects_to_insert))
                batch = objects_to_insert[batch_start:batch_end]
                try:
                    _response = chunk_collection.data.insert_many(objects=batch)
                    total_inserted += len(batch)
                    logger.info(f" Batch {batch_start//BATCH_SIZE + 1}: Inserted {len(batch)} chunks ({total_inserted}/{len(objects_to_insert)})")
                except Exception as batch_error:
                    logger.error(f" Batch {batch_start//BATCH_SIZE + 1} failed: {batch_error}")
                    # Continue with next batch instead of failing completely
                    continue
            # Build a short preview (first 10 objects) for display to the caller.
            inserted_summary: List[InsertedChunkSummary] = []
            for i, obj in enumerate(objects_to_insert[:10]):
                text_content: str = obj.get("text", "")
                work_obj: Dict[str, str] = obj.get("work", {})
                inserted_summary.append(InsertedChunkSummary(
                    chunk_id=f"chunk_{i:05d}",
                    sectionPath=obj.get("sectionPath", ""),
                    work=work_obj.get("title", ""),
                    author=work_obj.get("author", ""),
                    text_preview=text_content[:150] + "..." if len(text_content) > 150 else text_content,
                    unitType=obj.get("unitType", ""),
                ))
            logger.info(f"Ingestion réussie: {total_inserted} chunks insérés pour {doc_name}")
            return IngestResult(
                success=True,
                count=total_inserted,
                inserted=inserted_summary,
                work=title,
                author=author,
                document_uuid=doc_uuid,
                all_objects=objects_to_insert,
            )
    except Exception as e:
        logger.error(f"Erreur ingestion: {e}")
        return IngestResult(
            success=False,
            error=str(e),
            inserted=[],
        )
def delete_document_chunks(doc_name: str) -> DeleteResult:
    """Remove every trace of a document from Weaviate.

    Deletes, in order: the document's chunks (Chunk collection), its
    section summaries (Summary collection), and finally the Document
    metadata object itself. Each collection is handled independently — a
    failure on one is logged as a warning and does not stop the others.
    Useful for re-processing a document after pipeline changes or for
    cleaning up test data.

    Args:
        doc_name: Document identifier (sourceId) whose data is removed.

    Returns:
        DeleteResult with per-collection deletion counters. ``success``
        is True even when nothing matched; it is False only when the
        connection failed or an unexpected error occurred.

    Example:
        >>> result = delete_document_chunks("platon_republique")
        >>> if result["success"]:
        ...     print(f"Deleted {result['deleted_chunks']} chunks")
        ...     # Now safe to re-ingest
        ...     ingest_document("platon_republique", new_chunks, metadata)
    """
    try:
        with get_weaviate_client() as client:
            if client is None:
                return DeleteResult(success=False, error="Connexion Weaviate impossible")

            def purge_by_source(collection_name: str, label: str) -> int:
                # Chunk and Summary both reference the parent document via the
                # nested document.sourceId property.
                try:
                    coll: Collection[Any, Any] = client.collections.get(collection_name)
                    outcome = coll.data.delete_many(
                        where=wvq.Filter.by_property("document.sourceId").equal(doc_name)
                    )
                    return outcome.successful
                except Exception as e:
                    logger.warning(f"Erreur suppression {label}: {e}")
                    return 0

            chunk_count: int = purge_by_source("Chunk", "chunks")
            summary_count: int = purge_by_source("Summary", "summaries")
            # The Document object carries sourceId as a top-level property.
            document_removed: bool = False
            try:
                doc_collection: Collection[Any, Any] = client.collections.get("Document")
                doc_outcome = doc_collection.data.delete_many(
                    where=wvq.Filter.by_property("sourceId").equal(doc_name)
                )
                document_removed = doc_outcome.successful > 0
            except Exception as e:
                logger.warning(f"Erreur suppression document: {e}")
            logger.info(f"Suppression: {chunk_count} chunks, {summary_count} summaries pour {doc_name}")
            return DeleteResult(
                success=True,
                deleted_chunks=chunk_count,
                deleted_summaries=summary_count,
                deleted_document=document_removed,
            )
    except Exception as e:
        logger.error(f"Erreur suppression: {e}")
        return DeleteResult(success=False, error=str(e))
def get_document_stats(doc_name: str) -> DocumentStats:
    """Retrieve statistics for a document from Weaviate.

    Counts the document's chunks with a server-side aggregation (accurate
    for documents of any size) and reads work/author metadata from a
    single sample chunk.

    Args:
        doc_name: Document identifier (sourceId) to query.

    Returns:
        DocumentStats dict containing:
            - success: True if query succeeded
            - sourceId: The queried document identifier
            - chunks_count: Number of chunks found
            - work: Work title (from first chunk, if any)
            - author: Author name (from first chunk, if any)
            - error: Error message (if failed)

    Example:
        >>> stats = get_document_stats("platon_republique")
        >>> if stats["success"]:
        ...     print(f"Document: {stats['work']} by {stats['author']}")
        ...     print(f"Chunks: {stats['chunks_count']}")

    Note:
        If the aggregate query is unavailable, falls back to fetching
        objects (capped at 1000, as in the previous implementation).
    """
    try:
        with get_weaviate_client() as client:
            if client is None:
                return DocumentStats(success=False, error="Connexion Weaviate impossible")
            chunk_collection: Collection[Any, Any] = client.collections.get("Chunk")
            source_filter = wvq.Filter.by_property("document.sourceId").equal(doc_name)
            # Prefer a server-side count: the previous fetch-based approach
            # silently capped chunks_count at its limit of 1000.
            chunks_count: int
            try:
                agg = chunk_collection.aggregate.over_all(
                    filters=source_filter,
                    total_count=True,
                )
                chunks_count = int(agg.total_count or 0)
            except Exception as agg_error:
                logger.warning(f"Aggregate count failed, falling back to fetch: {agg_error}")
                fetched = chunk_collection.query.fetch_objects(
                    filters=source_filter,
                    limit=1000,
                )
                chunks_count = len(fetched.objects)
            # A single chunk is enough to read the nested work metadata.
            work: Optional[str] = None
            author: Optional[str] = None
            sample = chunk_collection.query.fetch_objects(
                filters=source_filter,
                limit=1,
            )
            if sample.objects:
                first: Dict[str, Any] = sample.objects[0].properties
                work_obj: Any = first.get("work", {})
                work = work_obj.get("title") if isinstance(work_obj, dict) else None
                author = work_obj.get("author") if isinstance(work_obj, dict) else None
            return DocumentStats(
                success=True,
                sourceId=doc_name,
                chunks_count=chunks_count,
                work=work,
                author=author,
            )
    except Exception as e:
        logger.error(f"Erreur stats document: {e}")
        return DocumentStats(success=False, error=str(e))