Add Library RAG project and cleanup root directory

- Add complete Library RAG application (Flask + MCP server)
  - PDF processing pipeline with OCR and LLM extraction
  - Weaviate vector database integration (BGE-M3 embeddings)
  - Flask web interface with search and document management
  - MCP server for Claude Desktop integration
  - Comprehensive test suite (134 tests)

- Clean up root directory
  - Remove obsolete documentation files
  - Remove backup and temporary files
  - Update autonomous agent configuration

- Update prompts
  - Enhance initializer bis prompt with better instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-30 11:57:12 +01:00
parent 48470236da
commit d2f7165120
84 changed files with 26517 additions and 2 deletions

View File

@@ -0,0 +1,74 @@
"""Utils - Pipeline de parsing PDF avec OCR Mistral et structuration LLM.
Version 2.0 : Pipeline intelligent avec extraction LLM des métadonnées,
TOC, classification des sections, chunking sémantique et validation.
"""
from .mistral_client import create_client, get_api_key, estimate_ocr_cost
from .pdf_uploader import upload_pdf
from .ocr_processor import run_ocr, serialize_ocr_response
from .markdown_builder import build_markdown
from .image_extractor import extract_images, create_image_writer
from .hierarchy_parser import build_hierarchy
from .llm_structurer import structure_with_llm, LLMStructureError
# Nouveaux modules LLM v2
from .llm_metadata import extract_metadata
from .llm_toc import extract_toc
from .llm_classifier import classify_sections, filter_indexable_sections
from .llm_cleaner import clean_chunk, clean_page_markers, is_chunk_valid
from .llm_chunker import chunk_section_with_llm, simple_chunk_by_paragraphs, extract_concepts_from_chunk, extract_paragraph_number
from .llm_validator import validate_document, apply_corrections, enrich_chunks_with_concepts
# Pipeline
from .pdf_pipeline import process_pdf, process_pdf_v2, process_pdf_bytes
from .weaviate_ingest import ingest_document, delete_document_chunks
__all__ = [
    # Mistral client
    "create_client",
    "get_api_key",
    "estimate_ocr_cost",
    # Upload
    "upload_pdf",
    # OCR
    "run_ocr",
    "serialize_ocr_response",
    # Markdown
    "build_markdown",
    # Images
    "extract_images",
    "create_image_writer",
    # Hierarchy
    "build_hierarchy",
    # Legacy LLM structuring
    "structure_with_llm",
    "LLMStructureError",
    # LLM v2 - Metadata
    "extract_metadata",
    # LLM v2 - Table of contents
    "extract_toc",
    # LLM v2 - Classification
    "classify_sections",
    "filter_indexable_sections",
    # LLM v2 - Cleaning
    "clean_chunk",
    "clean_page_markers",
    "is_chunk_valid",
    # LLM v2 - Chunking
    "chunk_section_with_llm",
    "simple_chunk_by_paragraphs",
    "extract_concepts_from_chunk",
    "extract_paragraph_number",
    # LLM v2 - Validation
    "validate_document",
    "apply_corrections",
    "enrich_chunks_with_concepts",
    # Pipeline
    "process_pdf",
    "process_pdf_v2",
    "process_pdf_bytes",
    # Weaviate
    "ingest_document",
    "delete_document_chunks",
]

View File

@@ -0,0 +1,267 @@
"""Hierarchical Markdown document parser for semantic chunking.
This module provides utilities for parsing Markdown documents into
hierarchical structures based on heading levels (# to ######). It is
a key component of the RAG pipeline, enabling:
1. **Structure Extraction**: Parse Markdown into a tree of sections
2. **Context Preservation**: Maintain hierarchical context (part > chapter > section)
3. **Semantic Chunking**: Flatten hierarchy into chunks with full path context
The parser uses a stack-based algorithm to build nested section trees,
preserving the document's logical structure for downstream processing.
Architecture:
Input: Raw Markdown text with headings
build_hierarchy() → DocumentHierarchy (tree structure)
flatten_hierarchy() → List[FlatChunk] (with hierarchical context)
TypedDict Definitions:
- HierarchyPath: Hierarchical path (part/chapter/section/subsection)
- HierarchyNode: Tree node with title, level, content, children
- DocumentHierarchy: Complete document structure
- FlatChunk: Flattened chunk with context for RAG ingestion
Algorithm:
The build_hierarchy() function uses a stack-based approach:
1. Initialize a virtual root node at level 0
2. For each line in the document:
- If heading: pop stack until parent level found, then push new node
- If content: append to current node's content
3. Finalize nodes by joining content lines
Example:
>>> markdown = '''
... # Introduction
... This is the intro.
...
... ## Background
... Some background text.
...
... ## Methodology
... Methods used here.
... '''
>>> hierarchy = build_hierarchy(markdown)
>>> print(hierarchy["sections"][0]["title"])
'Introduction'
>>> chunks = flatten_hierarchy(hierarchy)
>>> for chunk in chunks:
... print(f"{chunk['chunk_id']}: {chunk['title']}")
chunk_00001: Introduction
chunk_00002: Background
chunk_00003: Methodology
See Also:
- utils.llm_chunker: Semantic chunking using LLM
- utils.markdown_builder: Markdown generation from OCR
- utils.weaviate_ingest: Ingestion of chunks into Weaviate
"""
from __future__ import annotations
import re
from typing import List, Optional, Pattern, TypedDict
# Import type definitions from central types module
from utils.types import (
DocumentHierarchy,
FlatChunk,
HierarchyNode,
HierarchyPath,
)
class _BuildNode(TypedDict):
    """Internal mutable node used while building the hierarchy tree."""

    # Section title; None only for the virtual root node.
    title: Optional[str]
    # Heading level: 0 for the virtual root, 1-6 for Markdown headings.
    level: int
    # Content lines collected so far (joined into a string at finalize time).
    content: List[str]
    # Nested child sections, in document order.
    children: List[_BuildNode]
def build_hierarchy(markdown_text: str) -> DocumentHierarchy:
    """Build a hierarchical structure from Markdown headings.

    Scans heading lines (# through ######) and assembles a tree of
    sections, each carrying its own textual content.

    Args:
        markdown_text: Markdown text to analyse.

    Returns:
        Dictionary with:
            - preamble: text appearing before the first heading
            - sections: list of nested sections
        Each section holds:
            - title: section title
            - level: heading level (1-6)
            - content: textual content
            - children: nested subsections
    """
    heading_pattern: Pattern[str] = re.compile(r"^(#{1,6})\s+(.*)$")

    # Virtual root at level 0 anchors the stack; real headings are 1-6,
    # so the root is never popped.
    root_node: _BuildNode = {
        "title": None,
        "level": 0,
        "content": [],
        "children": [],
    }
    open_nodes: List[_BuildNode] = [root_node]

    for raw_line in markdown_text.splitlines():
        line_text: str = raw_line.rstrip()
        heading: Optional[re.Match[str]] = heading_pattern.match(line_text)
        if heading is None:
            # Plain content line: attach it to the innermost open section.
            open_nodes[-1]["content"].append(line_text)
            continue
        depth: int = len(heading.group(1))
        # Unwind to the closest ancestor strictly shallower than this heading.
        while open_nodes and open_nodes[-1]["level"] >= depth:
            open_nodes.pop()
        fresh: _BuildNode = {
            "title": heading.group(2).strip(),
            "level": depth,
            "content": [],
            "children": [],
        }
        open_nodes[-1]["children"].append(fresh)
        open_nodes.append(fresh)

    def _to_final(node: _BuildNode) -> HierarchyNode:
        """Convert a mutable build node into its finalized, joined form."""
        return HierarchyNode(
            title=node["title"],
            level=node["level"],
            content="\n".join(node["content"]).strip(),
            children=[_to_final(child) for child in node["children"]],
        )

    return DocumentHierarchy(
        preamble="\n".join(root_node["content"]).strip(),
        sections=[_to_final(child) for child in root_node["children"]],
    )
def flatten_hierarchy(hierarchy: DocumentHierarchy) -> List[FlatChunk]:
    """Flatten the hierarchy into an ordered list of chunks.

    Args:
        hierarchy: Hierarchical structure (output of build_hierarchy).

    Returns:
        List of chunks, each annotated with its hierarchical context.
    """
    flattened: List[FlatChunk] = []

    # Text before the first heading becomes chunk 0, with an empty path.
    if hierarchy.get("preamble"):
        flattened.append(
            {
                "chunk_id": "chunk_00000",
                "text": hierarchy["preamble"],
                "hierarchy": HierarchyPath(
                    part=None,
                    chapter=None,
                    section=None,
                    subsection=None,
                ),
                "type": "preamble",
                "level": 0,
                "title": None,
            }
        )

    def _walk(node: HierarchyNode, context: HierarchyPath, next_index: int) -> int:
        """Recursively emit chunks for one section subtree.

        Args:
            node: Section node to process.
            context: Hierarchical path inherited from the parent.
            next_index: Index to assign to the next chunk.

        Returns:
            The updated next chunk index.
        """
        depth: int = node["level"]
        heading: Optional[str] = node["title"]

        # Refresh the path for this depth; deeper slots are reset.
        path: HierarchyPath = context.copy()
        if depth == 1:
            path = HierarchyPath(
                part=heading,
                chapter=None,
                section=None,
                subsection=None,
            )
        elif depth == 2:
            path["chapter"] = heading
            path["section"] = None
            path["subsection"] = None
        elif depth == 3:
            path["section"] = heading
            path["subsection"] = None
        elif depth >= 4:
            # Headings deeper than level 4 all map onto "subsection".
            path["subsection"] = heading

        # Only sections that actually carry text produce a chunk.
        if node["content"]:
            flattened.append(
                {
                    "chunk_id": f"chunk_{next_index:05d}",
                    "text": node["content"],
                    "hierarchy": path.copy(),
                    "type": "main_content",
                    "level": depth,
                    "title": heading,
                }
            )
            next_index += 1

        for child in node["children"]:
            next_index = _walk(child, path, next_index)
        return next_index

    counter: int = 1
    blank_path: HierarchyPath = HierarchyPath(
        part=None,
        chapter=None,
        section=None,
        subsection=None,
    )
    for top_section in hierarchy.get("sections", []):
        counter = _walk(top_section, blank_path, counter)
    return flattened

View File

@@ -0,0 +1,192 @@
"""Image extraction and storage from OCR API responses.
This module provides utilities for extracting and saving images from
Mistral OCR API responses. It is a companion module to markdown_builder,
handling the image-specific aspects of document processing.
Features:
- **Image Writer Factory**: Creates reusable callbacks for image saving
- **Batch Extraction**: Processes all images from an OCR response
- **Protocol-based Design**: Flexible interface for custom implementations
Pipeline Position:
OCR Response → **Image Extractor** → Saved images + paths for Markdown
Components:
1. ImageWriterProtocol: Interface definition for image saving
2. create_image_writer(): Factory for standard file-based writers
3. extract_images(): Batch extraction from OCR responses
Integration:
The image writer is designed to integrate with markdown_builder:
>>> from utils.image_extractor import create_image_writer
>>> from utils.markdown_builder import build_markdown
>>>
>>> writer = create_image_writer(Path("output/doc/images"))
>>> markdown = build_markdown(ocr_response, image_writer=writer)
Standalone Usage:
>>> from pathlib import Path
>>> from utils.image_extractor import extract_images
>>>
>>> # Extract all images from OCR response
>>> paths = extract_images(ocr_response, Path("output/my_doc"))
>>> print(f"Extracted {len(paths)} images")
File Naming Convention:
Images are named: page{N}_img{M}.png
- N: Page number (1-based)
- M: Image index within page (1-based)
- Format: Always PNG (base64 from Mistral is PNG)
Note:
- All indices are 1-based for consistency with page numbering
- The images subdirectory is created automatically if needed
- Base64 data without proper encoding is silently skipped
- Large documents may produce many images; monitor disk space
See Also:
- utils.markdown_builder: Uses ImageWriter for markdown generation
- utils.mistral_client: Source of OCR responses with image data
"""
import base64
from pathlib import Path
from typing import Any, Callable, List, Optional, Protocol
class ImageWriterProtocol(Protocol):
    """Interface for callbacks that persist OCR-extracted images.

    An implementation receives a 1-based page number, a 1-based image
    index within that page, and base64-encoded image data. It saves the
    image somewhere and returns a relative path suitable for use in a
    Markdown image reference.

    Example:
        >>> def my_writer(page_idx: int, img_idx: int, image_b64: str) -> str:
        ...     # Custom saving logic
        ...     return f"images/page{page_idx}_img{img_idx}.png"
    """

    def __call__(self, page_idx: int, img_idx: int, image_b64: str) -> str:
        """Persist the image and return its relative Markdown path."""
        ...
# Type alias for image writer callables: (page_idx, img_idx, image_b64) -> relative path
ImageWriter = Callable[[int, int, str], str]
def create_image_writer(images_dir: Path) -> ImageWriter:
    """Create a function for saving images to disk.

    This factory creates a closure that saves base64-encoded images to
    the given directory and returns relative paths suitable for markdown
    image references.

    Args:
        images_dir: Directory path where images will be saved.
            The directory is created if it doesn't exist.

    Returns:
        A callable that accepts (page_idx, img_idx, image_b64) and
        returns the relative path to the saved image.

    Example:
        >>> from pathlib import Path
        >>> writer = create_image_writer(Path("output/images"))
        >>> path = writer(1, 0, "iVBORw0KGgoAAAANS...")
        >>> print(path)
        'images/page1_img0.png'
    """
    # Create directory if it doesn't exist
    images_dir.mkdir(parents=True, exist_ok=True)

    def writer(page_idx: int, img_idx: int, image_b64: str) -> str:
        """Save an image and return its relative path.

        Args:
            page_idx: Page number (1-based).
            img_idx: Image index within the page (1-based).
            image_b64: Base64-encoded image data.

        Returns:
            Relative path to the saved image file.
        """
        filename: str = f"page{page_idx}_img{img_idx}.png"
        filepath: Path = images_dir / filename
        # Decode and save
        image_data: bytes = base64.b64decode(image_b64)
        filepath.write_bytes(image_data)
        # Bug fix: return the actual relative path of the written file
        # (previously returned a hard-coded placeholder string, so every
        # markdown reference pointed at a nonexistent file).
        return f"images/{filename}"

    return writer
def extract_images(ocr_response: Any, output_dir: Path) -> List[str]:
    """Extract and save every embedded image from an OCR response.

    Walks all pages of the response, decodes each base64-encoded image,
    and writes it under ``output_dir / "images"``.

    Args:
        ocr_response: OCR response object from the Mistral API. Expected
            to expose a ``pages`` attribute; each page may expose an
            ``images`` list whose items carry an ``image_base64``
            attribute.
        output_dir: Base output directory. Images are saved to an
            ``images`` subdirectory beneath it (created automatically).

    Returns:
        Absolute file paths of the extracted images, in document order.

    Note:
        - Pages and images are 1-indexed in filenames.
        - Images without base64 data are silently skipped.
    """
    target_dir: Path = output_dir / "images"
    target_dir.mkdir(parents=True, exist_ok=True)

    saved_paths: List[str] = []
    for page_no, page in enumerate(ocr_response.pages, start=1):
        # Pages without an images attribute (or with an empty one) yield nothing.
        for image_no, image in enumerate(getattr(page, "images", None) or [], start=1):
            encoded: Optional[str] = getattr(image, "image_base64", None)
            if not encoded:
                continue
            destination: Path = target_dir / f"page{page_no}_img{image_no}.png"
            destination.write_bytes(base64.b64decode(encoded))
            saved_paths.append(str(destination))
    return saved_paths

View File

@@ -0,0 +1,319 @@
"""Multi-LLM Integration Module for Chat Conversation.
Provides a unified interface for calling different LLM providers with streaming support:
- Ollama (local, free)
- Mistral API
- Anthropic API (Claude)
- OpenAI API
Example:
>>> for token in call_llm("Hello world", "ollama", "qwen2.5:7b"):
... print(token, end="", flush=True)
"""
import os
import json
import time
import logging
from typing import Iterator, Optional
from dotenv import load_dotenv
load_dotenv()
logger = logging.getLogger(__name__)
class LLMError(Exception):
    """Base exception raised for LLM provider and transport failures."""
def call_llm(
    prompt: str,
    provider: str,
    model: str,
    stream: bool = True,
    temperature: float = 0.7,
    max_tokens: int = 16384,
) -> Iterator[str]:
    """Call an LLM provider with unified interface.

    Args:
        prompt: The prompt to send to the LLM.
        provider: Provider name ("ollama", "mistral", "anthropic", "openai").
        model: Model name (e.g., "qwen2.5:7b", "mistral-small-latest", "claude-sonnet-4-5").
        stream: Whether to stream tokens (default: True).
        temperature: Temperature for generation (0-1).
        max_tokens: Maximum tokens to generate (default 16384 for philosophical discussions).

    Yields:
        Tokens as strings (when streaming).

    Raises:
        LLMError: If provider is invalid or API call fails.

    Example:
        >>> for token in call_llm("Test", "ollama", "qwen2.5:7b"):
        ...     print(token, end="")
    """
    provider = provider.lower()
    logger.info(f"[LLM Call] Provider: {provider}, Model: {model}, Stream: {stream}")
    # Lazily-built dispatch table; note _call_ollama takes no max_tokens.
    backends = {
        "ollama": lambda: _call_ollama(prompt, model, temperature, stream),
        "mistral": lambda: _call_mistral(prompt, model, temperature, max_tokens, stream),
        "anthropic": lambda: _call_anthropic(prompt, model, temperature, max_tokens, stream),
        "openai": lambda: _call_openai(prompt, model, temperature, max_tokens, stream),
    }
    started = time.time()
    try:
        backend = backends.get(provider)
        if backend is None:
            # Raised inside the try so the failure gets timed and logged below.
            raise LLMError(f"Provider '{provider}' non supporté. Utilisez: ollama, mistral, anthropic, openai")
        yield from backend()
    except Exception as e:
        elapsed = time.time() - started
        logger.error(f"[LLM Call] Error after {elapsed:.2f}s: {e}")
        raise
    elapsed = time.time() - started
    logger.info(f"[LLM Call] Completed in {elapsed:.2f}s")
def _call_ollama(prompt: str, model: str, temperature: float, stream: bool) -> Iterator[str]:
    """Call the Ollama HTTP API, optionally streaming tokens.

    Args:
        prompt: The prompt text.
        model: Ollama model name.
        temperature: Temperature (0-1).
        stream: Whether to stream line-delimited JSON events.

    Yields:
        Tokens from the model.

    Raises:
        LLMError: On any network or HTTP failure.
    """
    import requests

    base_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
    endpoint = f"{base_url}/api/generate"
    body = {
        "model": model,
        "prompt": prompt,
        "stream": stream,
        "options": {
            "temperature": temperature,
        }
    }
    try:
        resp = requests.post(endpoint, json=body, stream=stream, timeout=120)
        resp.raise_for_status()
        if not stream:
            # Non-stream mode: a single JSON document holds the completion.
            yield resp.json().get("response", "")
            return
        # Stream mode: one JSON object per line, token text under "response".
        for raw_line in resp.iter_lines():
            if not raw_line:
                continue
            try:
                event = json.loads(raw_line)
            except json.JSONDecodeError:
                continue
            piece = event.get("response", "")
            if piece:
                yield piece
            # The final event carries done=True.
            if event.get("done", False):
                break
    except requests.exceptions.RequestException as e:
        raise LLMError(f"Ollama API error: {e}")
def _call_mistral(prompt: str, model: str, temperature: float, max_tokens: int, stream: bool) -> Iterator[str]:
    """Call the Mistral chat API, optionally streaming tokens.

    Args:
        prompt: The prompt text.
        model: Mistral model name.
        temperature: Temperature (0-1).
        max_tokens: Max tokens to generate.
        stream: Whether to stream.

    Yields:
        Tokens from the model.

    Raises:
        LLMError: If the API key or package is missing, or the call fails.
    """
    api_key = os.getenv("MISTRAL_API_KEY")
    if not api_key:
        raise LLMError("MISTRAL_API_KEY not set in environment")
    try:
        from mistralai import Mistral
    except ImportError:
        raise LLMError("mistralai package not installed. Run: pip install mistralai")

    client = Mistral(api_key=api_key)
    chat_messages = [{"role": "user", "content": prompt}]
    try:
        if not stream:
            # Non-streaming: one complete response object.
            response = client.chat.complete(
                model=model,
                messages=chat_messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            if response.choices:
                yield response.choices[0].message.content or ""
            return
        # Streaming: iterate delta events and surface non-empty content.
        for event in client.chat.stream(
            model=model,
            messages=chat_messages,
            temperature=temperature,
            max_tokens=max_tokens,
        ):
            if not event.data.choices:
                continue
            delta = event.data.choices[0].delta
            if hasattr(delta, 'content') and delta.content:
                yield delta.content
    except Exception as e:
        raise LLMError(f"Mistral API error: {e}")
def _call_anthropic(prompt: str, model: str, temperature: float, max_tokens: int, stream: bool) -> Iterator[str]:
    """Call Anthropic API (Claude) with streaming support.

    Args:
        prompt: The prompt text.
        model: Claude model name.
        temperature: Temperature (0-1).
        max_tokens: Max tokens to generate.
        stream: Whether to stream.

    Yields:
        Tokens from the model.

    Raises:
        LLMError: If the API key or package is missing, or the call fails.
    """
    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        raise LLMError("ANTHROPIC_API_KEY not set in environment")
    try:
        from anthropic import Anthropic
    except ImportError:
        raise LLMError("anthropic package not installed. Run: pip install anthropic")
    client = Anthropic(api_key=api_key)
    messages = [{"role": "user", "content": prompt}]
    try:
        if stream:
            # Streaming mode. The context variable is named `event_stream`
            # to avoid shadowing the `stream` boolean parameter (the
            # previous code bound `as stream:`, hiding the argument).
            with client.messages.stream(
                model=model,
                max_tokens=max_tokens,
                temperature=temperature,
                messages=messages,
            ) as event_stream:
                for text in event_stream.text_stream:
                    yield text
        else:
            # Non-streaming mode
            response = client.messages.create(
                model=model,
                max_tokens=max_tokens,
                temperature=temperature,
                messages=messages,
            )
            if response.content:
                yield response.content[0].text
    except Exception as e:
        raise LLMError(f"Anthropic API error: {e}")
def _call_openai(prompt: str, model: str, temperature: float, max_tokens: int, stream: bool) -> Iterator[str]:
    """Call OpenAI API with streaming support.

    Args:
        prompt: The prompt text.
        model: OpenAI model name.
        temperature: Temperature (0-1).
        max_tokens: Max tokens to generate.
        stream: Whether to stream.

    Yields:
        Tokens from the model.

    Raises:
        LLMError: If the API key or package is missing, or the call fails.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise LLMError("OPENAI_API_KEY not set in environment")
    try:
        from openai import OpenAI
    except ImportError:
        raise LLMError("openai package not installed. Run: pip install openai")
    client = OpenAI(api_key=api_key)

    # Build the request once instead of duplicating four near-identical
    # create() calls. o1 and gpt-5.x models take max_completion_tokens
    # (and no temperature); older chat models take temperature + max_tokens.
    params = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": stream,
    }
    if model.startswith(("o1", "gpt-5")):
        params["max_completion_tokens"] = max_tokens
    else:
        params["temperature"] = temperature
        params["max_tokens"] = max_tokens

    try:
        if stream:
            for chunk in client.chat.completions.create(**params):
                if chunk.choices:
                    delta = chunk.choices[0].delta
                    if hasattr(delta, 'content') and delta.content:
                        yield delta.content
        else:
            response = client.chat.completions.create(**params)
            if response.choices:
                yield response.choices[0].message.content or ""
    except Exception as e:
        raise LLMError(f"OpenAI API error: {e}")

View File

@@ -0,0 +1,495 @@
"""Semantic chunking of documents via LLM.
This module provides intelligent semantic chunking capabilities for academic and
philosophical texts, using Large Language Models (LLM) to identify coherent units
of meaning (argumentative units, definitions, examples, citations, etc.).
Overview:
The module offers two chunking strategies:
1. **LLM-based semantic chunking** (chunk_section_with_llm):
Uses an LLM to identify semantic boundaries and create chunks that preserve
argumentative coherence. Each chunk is annotated with summary, concepts, type.
2. **Simple paragraph-based chunking** (simple_chunk_by_paragraphs):
A fast fallback that splits text by paragraph boundaries.
Semantic Unit Types:
- argument: A logical argument or reasoning sequence
- definition: A definition or conceptual clarification
- example: An illustrative example or case study
- citation: A quoted passage from another source
- exposition: Expository content presenting ideas
- transition: Transitional text between sections
Chunk Size Guidelines:
- Target size: 300-500 words per chunk (configurable)
- Chunks are never split mid-sentence or mid-paragraph
- Short sections (< 80% of target) are kept as single chunks
LLM Provider Support:
- ollama: Local LLM (free, slower, default)
- mistral: Mistral API (faster, requires API key)
See Also:
utils.llm_cleaner: Chunk cleaning and validation
utils.llm_classifier: Section type classification
utils.pdf_pipeline: Main pipeline orchestration
"""
from __future__ import annotations
import json
import logging
import re
from typing import Any, Dict, List, Literal, Optional, TypedDict
from .llm_structurer import (
_clean_json_string,
_get_default_mistral_model,
_get_default_model,
call_llm,
)
from .llm_cleaner import clean_page_markers, is_chunk_valid
from .types import LLMProvider, SemanticChunk
logger: logging.Logger = logging.getLogger(__name__)
# =============================================================================
# Type Definitions for LLM Chunker
# =============================================================================

#: Unit type for semantic chunking (specific to this module's LLM output).
#: "main_content" is the generic fallback used when no finer type applies.
ChunkUnitType = Literal[
    "argument",
    "definition",
    "example",
    "citation",
    "exposition",
    "transition",
    "main_content",
]
class LLMChunkResponse(TypedDict, total=False):
    """Individual chunk structure as returned by the LLM.

    All keys are optional (total=False) because the LLM's JSON output
    may omit fields; consumers use .get() with defaults.

    Attributes:
        text: Chunk text content (exact copy from source)
        summary: Brief one-sentence summary
        concepts: Key concepts extracted (3-5 items)
        type: Semantic unit type
    """

    text: str
    summary: str
    concepts: List[str]
    type: str
class LLMChunksResult(TypedDict):
    """Complete response structure from LLM chunking.

    Attributes:
        chunks: List of chunk objects parsed from the LLM's JSON reply
    """

    chunks: List[LLMChunkResponse]


# Note: SemanticChunk is imported from utils.types
def extract_paragraph_number(text: str) -> Optional[int]:
    """Extract a paragraph number from the beginning of text.

    Many philosophical texts use numbered paragraphs. This function
    detects several numbering formats anchored at the start of the text.

    Args:
        text: Text content that may start with a paragraph number.

    Returns:
        The paragraph number if detected, None otherwise.

    Example:
        >>> extract_paragraph_number("9 On presente...")
        9
        >>> extract_paragraph_number("Normal text")
        None
    """
    text = text.strip()
    # Supported paragraph-number formats, tried in order.
    patterns: List[str] = [
        r'^(\d+)\s+[A-ZÀ-Ü]',  # "9 On présente..."
        r'^(\d+)[A-ZÀ-Ü]',     # "10Dans la classification..."
        # Bug fix: previously r'\s*(\d+)', which could never match "§ 15"
        # (\s* cannot consume '§') and instead matched ANY leading digits,
        # e.g. years at the start of a sentence. Anchor on the § sign.
        r'^§\s*(\d+)',         # "§ 15 ..."
        r'^\[(\d+)\]',         # "[9] ..."
        r'^(\d+)\.',           # "9. ..."
        r'^(\d+)\)',           # "9) ..."
    ]
    for pattern in patterns:
        match: Optional[re.Match[str]] = re.match(pattern, text)
        if match:
            try:
                return int(match.group(1))
            except ValueError:
                continue
    return None
def _extract_json_from_response(text: str) -> Dict[str, Any]:
    """Extract a JSON object from LLM response text.

    Tries, in order: JSON wrapped in <JSON></JSON> tags, then the widest
    brace-delimited substring of the raw text. Falls back to an empty
    chunks list if both parses fail.

    Args:
        text: Response text from LLM containing JSON.

    Returns:
        Parsed JSON as dictionary with 'chunks' key. Returns
        {"chunks": []} if parsing fails.
    """
    tagged: Optional[re.Match[str]] = re.search(
        r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL
    )
    if tagged:
        candidate: str = _clean_json_string(tagged.group(1))
        try:
            parsed: Dict[str, Any] = json.loads(candidate)
            return parsed
        except json.JSONDecodeError:
            # Tagged payload was malformed; fall through to the brace scan.
            pass

    left: int = text.find("{")
    right: int = text.rfind("}")
    if left != -1 and right > left:
        candidate = _clean_json_string(text[left:right + 1])
        try:
            parsed = json.loads(candidate)
            return parsed
        except json.JSONDecodeError as e:
            logger.warning(f"JSON invalide: {e}")
    return {"chunks": []}
def chunk_section_with_llm(
    section_content: str,
    section_title: str,
    chapter_title: Optional[str] = None,
    subsection_title: Optional[str] = None,
    section_level: int = 1,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.2,
    target_chunk_size: int = 400,
) -> List[SemanticChunk]:
    """Split a section into semantically coherent chunks using an LLM.

    This is the main semantic chunking function. It uses an LLM to identify
    natural semantic boundaries in academic/philosophical texts, preserving
    argumentative coherence and annotating each chunk with metadata.

    Args:
        section_content: The text content of the section to chunk.
        section_title: Title of the current section being chunked.
        chapter_title: Title of the parent chapter (level 1) for context.
        subsection_title: Title of parent subsection (level 2) if applicable.
        section_level: Hierarchy level (1=chapter, 2=section, etc.).
        model: LLM model name. If None, uses provider default.
        provider: LLM provider ("ollama" for local, "mistral" for API).
        temperature: LLM temperature (lower = more deterministic).
        target_chunk_size: Target number of words per chunk.

    Returns:
        List of SemanticChunk dictionaries containing text, summary,
        concepts, type, section_level, and optionally paragraph_number.

    Note:
        If section is shorter than 80% of target_chunk_size, it is returned
        as a single chunk. If LLM fails, returns section with error field.
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    # Strip page markers before measuring and chunking the content.
    content: str = clean_page_markers(section_content)
    # Short sections are kept whole: below 80% of the target size,
    # skip the LLM call entirely and return a single chunk.
    word_count: int = len(content.split())
    if word_count < target_chunk_size * 0.8:
        para_num: Optional[int] = extract_paragraph_number(content)
        chunk: SemanticChunk = {
            "text": content,
            "summary": section_title,
            "concepts": [],
            "type": "main_content",
            "section_level": section_level,
        }
        if para_num is not None:
            chunk["paragraph_number"] = para_num
        if subsection_title and subsection_title != section_title:
            chunk["subsection_title"] = subsection_title
        return [chunk]
    chapter_info: str = f"Chapitre: {chapter_title}\n" if chapter_title else ""
    # NOTE: the prompt is deliberately written in French to match the
    # corpus language; it asks the LLM to return chunks as JSON wrapped
    # in <JSON></JSON> tags (parsed by _extract_json_from_response).
    prompt = f"""Tu es un expert en analyse de textes académiques.
TÂCHE: Découper ce texte en unités sémantiques cohérentes.
{chapter_info}Section: {section_title}
RÈGLES DE DÉCOUPAGE:
1. Chaque chunk doit avoir un SENS COMPLET (une idée, un argument)
2. Taille idéale: {target_chunk_size - 100} à {target_chunk_size + 100} mots
3. NE PAS couper au milieu d'une phrase ou d'un paragraphe
4. NE PAS couper au milieu d'une citation
5. Regrouper les paragraphes qui développent la même idée
6. Un chunk peut être plus long si nécessaire pour préserver le sens
POUR CHAQUE CHUNK, INDIQUE:
- text: le texte exact (copié, pas reformulé)
- summary: résumé en 1 phrase courte
- concepts: 3-5 concepts clés (mots ou expressions)
- type: argument | définition | exemple | citation | exposition | transition
TEXTE À DÉCOUPER:
{content}
RÉPONDS avec un JSON entre <JSON></JSON>:
<JSON>
{{
"chunks": [
{{
"text": "Premier paragraphe ou groupe de paragraphes...",
"summary": "Présentation de l'idée principale",
"concepts": ["concept1", "concept2", "concept3"],
"type": "exposition"
}},
{{
"text": "Deuxième partie du texte...",
"summary": "Développement de l'argument",
"concepts": ["concept4", "concept5"],
"type": "argument"
}}
]
}}
</JSON>
"""
    logger.info(f"Chunking sémantique de '{section_title}' ({word_count} mots) via {provider.upper()}")
    try:
        response: str = call_llm(
            prompt, model=model, provider=provider, temperature=temperature, timeout=300
        )
        result: Dict[str, Any] = _extract_json_from_response(response)
        chunks: List[Dict[str, Any]] = result.get("chunks", [])
        # Validate each returned chunk and extract paragraph numbers.
        valid_chunks: List[SemanticChunk] = []
        for raw_chunk in chunks:
            text: str = raw_chunk.get("text", "")
            if is_chunk_valid(text):
                # Extract the paragraph number if one is present.
                para_num = extract_paragraph_number(text)
                chunk_data: SemanticChunk = {
                    "text": text,
                    "summary": raw_chunk.get("summary", ""),
                    "concepts": raw_chunk.get("concepts", []),
                    "type": raw_chunk.get("type", "main_content"),
                    "section_level": section_level,
                }
                # Attach the paragraph number when detected.
                if para_num is not None:
                    chunk_data["paragraph_number"] = para_num
                # Attach the full hierarchy context.
                if subsection_title and subsection_title != section_title:
                    chunk_data["subsection_title"] = subsection_title
                valid_chunks.append(chunk_data)
        # If no chunk survived validation, fall back to the whole content.
        if not valid_chunks:
            logger.warning(f"Aucun chunk valide pour '{section_title}', retour contenu complet")
            para_num = extract_paragraph_number(content)
            fallback: SemanticChunk = {
                "text": content,
                "summary": section_title,
                "concepts": [],
                "type": "main_content",
                "section_level": section_level,
            }
            if para_num is not None:
                fallback["paragraph_number"] = para_num
            return [fallback]
        logger.info(f"Section '{section_title}' découpée en {len(valid_chunks)} chunks")
        return valid_chunks
    except Exception as e:
        logger.error(f"Erreur chunking LLM: {e}")
        # Fallback: return the whole content, recording the error.
        para_num = extract_paragraph_number(content)
        fallback_err: SemanticChunk = {
            "text": content,
            "summary": section_title,
            "concepts": [],
            "type": "main_content",
            "section_level": section_level,
            "error": str(e),
        }
        if para_num is not None:
            fallback_err["paragraph_number"] = para_num
        return [fallback_err]
def simple_chunk_by_paragraphs(
    content: str,
    max_words: int = 500,
    min_words: int = 100,
) -> List[str]:
    """Chunk text on paragraph boundaries without any LLM call.

    Fast, deterministic fallback used when semantic (LLM) chunking is not
    wanted. Paragraphs (separated by blank lines) are packed together until
    ``max_words`` is reached; a paragraph that alone exceeds the budget is
    split at sentence boundaries instead.

    Args:
        content: Text to split into chunks.
        max_words: Upper bound of words per chunk. Defaults to 500.
        min_words: Chunks below this size are discarded, unless only a
            single chunk was produced. Defaults to 100.

    Returns:
        List of chunk strings, in document order.

    Example:
        >>> chunks = simple_chunk_by_paragraphs(text, max_words=400)
        >>> len(chunks)
        3
    """
    content = clean_page_markers(content)
    pieces: List[str] = []        # completed chunks
    buffer: List[str] = []        # fragments of the chunk under construction
    buffer_words: int = 0

    def flush() -> None:
        # Close the current chunk, if any, and reset the accumulator.
        nonlocal buffer, buffer_words
        if buffer:
            pieces.append('\n\n'.join(buffer))
            buffer = []
            buffer_words = 0

    # Paragraphs are delimited by one or more blank lines.
    for paragraph in re.split(r'\n\n+', content):
        paragraph = paragraph.strip()
        if not paragraph:
            continue
        n_words: int = len(paragraph.split())
        if n_words > max_words:
            # Oversized paragraph: close the running chunk, then pack it
            # sentence by sentence.
            flush()
            for sentence in re.split(r'(?<=[.!?])\s+', paragraph):
                s_words: int = len(sentence.split())
                if buffer and buffer_words + s_words > max_words:
                    pieces.append('\n\n'.join(buffer))
                    buffer = [sentence]
                    buffer_words = s_words
                else:
                    buffer.append(sentence)
                    buffer_words += s_words
        elif buffer_words + n_words > max_words:
            # Paragraph fits a chunk on its own but not the current one.
            flush()
            buffer = [paragraph]
            buffer_words = n_words
        else:
            buffer.append(paragraph)
            buffer_words += n_words
    flush()

    # Keep everything when a single chunk came out; otherwise drop runts.
    if len(pieces) == 1:
        return pieces
    return [chunk for chunk in pieces if len(chunk.split()) >= min_words]
def extract_concepts_from_chunk(
    chunk_text: str,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
) -> List[str]:
    """Ask an LLM for the 3-5 key concepts contained in a text chunk.

    Intended to enrich chunks produced without LLM assistance, or to pull
    extra concepts out of existing chunks.

    Args:
        chunk_text: Text to analyse.
        model: LLM model name; resolved to the provider default when None.
        provider: LLM provider ("ollama" or "mistral").

    Returns:
        At most five concepts (short words or phrases). Empty list when
        the text is too short (< 100 chars) or extraction fails.

    Example:
        >>> concepts = extract_concepts_from_chunk("L'etre-pour-la-mort...")
        >>> concepts
        ['etre-pour-la-mort', 'structure existentiale', 'Dasein']
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    if len(chunk_text) < 100:
        # Not enough material to extract meaningful concepts.
        return []
    prompt: str = f"""Extrait les 3-5 concepts clés de ce texte.
Un concept = un mot ou une expression courte (2-3 mots max).
Texte:
{chunk_text[:1500]}
Réponds avec une liste JSON simple:
["concept1", "concept2", "concept3"]
"""
    try:
        reply: str = call_llm(prompt, model=model, provider=provider, temperature=0.1, timeout=60)
        # The reply should contain a bare JSON array; grab the first one.
        found: Optional[re.Match[str]] = re.search(r'\[.*?\]', reply, re.DOTALL)
        if found is None:
            return []
        parsed: List[str] = json.loads(found.group())
        return parsed[:5]  # cap at five concepts
    except Exception as e:
        logger.warning(f"Erreur extraction concepts: {e}")
        return []

View File

@@ -0,0 +1,582 @@
"""LLM-based section classification module for document structure analysis.
This module provides functionality to classify document sections by type
(front_matter, chapter, appendix, etc.) using Large Language Models and
determine which sections should be indexed for semantic search.
Key Features:
- Section classification via LLM (classify_sections)
- Automatic TOC/metadata section exclusion (is_excluded_section)
- Post-classification validation (validate_classified_sections)
- Filtering for indexable content (filter_indexable_sections)
Section Types:
The following section types are recognized:
**Indexable Content (should_index=True):**
- chapter: Main document content, essays, articles, book reviews
- introduction: Document introductions
- conclusion: Document conclusions
- preface: Prefaces, forewords, warnings (intellectual content)
- abstract: Summaries, abstracts
**Non-Indexable Content (should_index=False):**
- front_matter: Title pages, copyright, credits, colophon
- toc_display: Table of contents display (not content)
- appendix: Document appendices
- bibliography: References, bibliography
- index: Document index
- notes: End notes
- ignore: Ads, empty pages, technical metadata
Classification Strategy:
1. LLM analyzes section titles and content previews
2. Automatic exclusion rules catch common TOC/metadata patterns
3. Post-classification validation detects false positives
4. Filtering extracts only indexable content
Typical Usage:
>>> from utils.llm_classifier import classify_sections, filter_indexable_sections
>>> sections = [
... {"title": "Table of Contents", "content": "...", "level": 1},
... {"title": "Introduction", "content": "...", "level": 1},
... {"title": "Chapter 1", "content": "...", "level": 1}
... ]
>>> classified = classify_sections(sections, provider="ollama")
>>> indexable = filter_indexable_sections(classified)
>>> print([s["title"] for s in indexable])
['Introduction', 'Chapter 1']
LLM Provider Options:
- "ollama": Local processing, free but slower
- "mistral": Cloud API, faster but incurs costs
Note:
The classifier is designed to handle edge cases like:
- Book reviews with analytical content (classified as chapter)
- Editor's notes without analysis (classified as front_matter)
- TOC fragments embedded in content (detected and excluded)
See Also:
- llm_toc: Table of contents extraction
- llm_chunker: Semantic chunking of classified sections
- llm_metadata: Document metadata extraction
"""
from __future__ import annotations
import json
import logging
import re
from typing import cast, Any, Dict, Final
from .llm_structurer import (
_clean_json_string,
_get_default_mistral_model,
_get_default_model,
call_llm,
)
from .types import LLMProvider
logger: logging.Logger = logging.getLogger(__name__)
# Section types the LLM classifier may assign. Keys are the machine labels
# used throughout the pipeline; values are French descriptions injected
# verbatim into the classification prompt, so they must stay in French.
SECTION_TYPES: Final[dict[str, str]] = {
    "front_matter": "Métadonnées, page de titre, copyright, crédits, NOTE DE L'ÉDITEUR, colophon",
    "toc_display": "Table des matières affichée (pas le contenu)",
    "preface": "Préface, avant-propos, avertissement (contenu intellectuel à indexer)",
    "abstract": "Résumé, abstract",
    "introduction": "Introduction de l'œuvre",
    "chapter": "Chapitre principal du document",
    "conclusion": "Conclusion de l'œuvre",
    "appendix": "Annexes",
    "bibliography": "Bibliographie, références",
    "index": "Index",
    "notes": "Notes de fin",
    "ignore": "À ignorer (publicités, pages vides, métadonnées techniques)",
}
def _extract_json_from_response(text: str) -> dict[str, Any]:
    """Parse the JSON payload out of a raw LLM reply.

    Two layouts are tried in order: an explicit <JSON>...</JSON> envelope,
    then the outermost {...} span found anywhere in the reply.

    Args:
        text: Raw LLM response text.

    Returns:
        Parsed JSON as dictionary. Returns {"classifications": []} when
        neither layout yields valid JSON.
    """
    tagged: re.Match[str] | None = re.search(
        r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL
    )
    if tagged:
        candidate: str = _clean_json_string(tagged.group(1))
        try:
            parsed: Dict[str, Any] = json.loads(candidate)
            return parsed
        except json.JSONDecodeError:
            pass  # fall through to the brace-scan heuristic
    opening: int = text.find("{")
    closing: int = text.rfind("}")
    if opening != -1 and closing > opening:
        candidate = _clean_json_string(text[opening:closing + 1])
        try:
            parsed = json.loads(candidate)
            return parsed
        except json.JSONDecodeError as e:
            logger.warning(f"JSON invalide: {e}")
    return {"classifications": []}
def classify_sections(
    sections: list[dict[str, Any]],
    document_title: str | None = None,
    model: str | None = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.1,
) -> list[dict[str, Any]]:
    """Classify document sections by type using LLM.

    Uses an LLM to analyze section titles and content previews to determine
    the type of each section (chapter, front_matter, toc_display, etc.) and
    whether it should be indexed for semantic search.

    Args:
        sections: List of section dictionaries with keys:
            - title: Section title
            - content: Section content (preview used)
            - level: Hierarchy level (1=chapter, 2=section, etc.)
        document_title: Optional document title for context.
        model: LLM model name. If None, uses provider default.
        provider: LLM provider ("ollama" or "mistral").
        temperature: Model temperature (0.0-1.0). Lower = more deterministic.

    Returns:
        Same sections list with added classification fields:
        - type: Section type (SectionType literal)
        - should_index: Whether to include in vector index
        - chapter_number: Chapter number if applicable
        - classification_reason: Explanation for the classification

    Example:
        >>> sections = [{"title": "Introduction", "content": "...", "level": 1}]
        >>> classified = classify_sections(sections, provider="ollama")
        >>> classified[0]["type"]
        'introduction'
        >>> classified[0]["should_index"]
        True
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    # Build a compact view of each section (index, title, short preview,
    # hierarchy level) to keep the prompt within a reasonable size.
    sections_for_prompt: list[dict[str, Any]] = []
    for i, section in enumerate(sections[:50]):  # cap at 50 sections to bound the prompt
        sections_for_prompt.append({
            "index": i,
            "title": section.get("title", ""),
            "preview": section.get("content", "")[:200] if section.get("content") else "",
            "level": section.get("level", 1),
        })
    types_description: str = "\n".join([f"- {k}: {v}" for k, v in SECTION_TYPES.items()])
    title_context: str = f"Titre du document: {document_title}\n" if document_title else ""
    # NOTE: the prompt is deliberately in French — the corpus and the
    # SECTION_TYPES descriptions it embeds are French. Do not translate it.
    prompt: str = f"""Tu es un expert en analyse de structure documentaire.
TÂCHE: Classifier chaque section selon son type.
{title_context}
TYPES DISPONIBLES:
{types_description}
RÈGLES:
1. "front_matter": UNIQUEMENT pages de titre SANS contenu, copyright, colophon (métadonnées pures)
2. "toc_display": la TABLE DES MATIÈRES elle-même (pas son contenu)
3. "preface": préface, avant-propos, avertissement (À INDEXER car contenu intellectuel)
4. "chapter": TOUT contenu principal - chapitres, sections, articles, revues de livre, essais
5. "ignore": publicités, pages vides, métadonnées techniques sans valeur
IMPORTANT - REVUES DE LIVRE ET ARTICLES:
- Une REVUE DE LIVRE ("Book Review") avec analyse critique → chapter, should_index = true
- Un ARTICLE académique avec contenu substantiel → chapter, should_index = true
- Les métadonnées éditoriales (auteur, affiliation, journal) au début d'un article NE sont PAS un motif pour classer comme "front_matter"
- Si le document contient un TEXTE ANALYTIQUE développé → chapter
CAS PARTICULIERS:
- "NOTE DE L'ÉDITEUR" (infos édition, réimpression, SANS analyse) → front_matter, should_index = false
- "PRÉFACE" ou "AVANT-PROPOS" (texte intellectuel) → preface, should_index = true
- "Book Review" ou "Article" avec paragraphes d'analyse → chapter, should_index = true
INDEXATION:
- should_index = true pour: preface, introduction, chapter, conclusion, abstract
- should_index = false pour: front_matter, toc_display, ignore
⚠️ ATTENTION AUX FAUX POSITIFS - LISTE DE TITRES VS CONTENU RÉEL:
LISTE DE TITRES (toc_display, should_index=false):
- Suite de titres courts sans texte explicatif
- Lignes commençant par "Comment...", "Où...", "Les dispositions à..."
- Énumération de sections sans phrase complète
- Exemple: "Comment fixer la croyance?\\nOù la croyance s'oppose au savoir\\nL'idéal de rationalité"
CONTENU RÉEL (chapter, should_index=true):
- Texte avec phrases complètes et verbes conjugués
- Paragraphes développés avec arguments
- Explications, définitions, raisonnements
- Exemple: "Comment fixer la croyance? Cette question se pose dès lors que..."
SECTIONS À CLASSIFIER:
{json.dumps(sections_for_prompt, ensure_ascii=False, indent=2)}
RÉPONDS avec un JSON entre <JSON></JSON>:
<JSON>
{{
  "classifications": [
    {{
      "index": 0,
      "type": "front_matter",
      "should_index": false,
      "chapter_number": null,
      "reason": "Page de titre avec métadonnées éditeur"
    }},
    {{
      "index": 1,
      "type": "chapter",
      "should_index": true,
      "chapter_number": 1,
      "reason": "Premier chapitre du document"
    }}
  ]
}}
</JSON>
"""
    logger.info(f"Classification de {len(sections_for_prompt)} sections via {provider.upper()} ({model})")
    try:
        response: str = call_llm(prompt, model=model, provider=provider, temperature=temperature, timeout=300)
        result: dict[str, Any] = _extract_json_from_response(response)
        classifications: list[dict[str, Any]] = result.get("classifications", [])
        # Map each returned classification back to its section index.
        class_map: dict[int, dict[str, Any]] = {
            c["index"]: c for c in classifications if "index" in c
        }
        # Apply the classifications in place, with safe defaults for any
        # section the LLM did not cover.
        for i, section in enumerate(sections):
            if i in class_map:
                c: dict[str, Any] = class_map[i]
                section["type"] = c.get("type", "chapter")
                section["should_index"] = c.get("should_index", True)
                section["chapter_number"] = c.get("chapter_number")
                section["classification_reason"] = c.get("reason", "")
            else:
                # Default: treat unclassified sections as indexable content.
                section["type"] = "chapter"
                section["should_index"] = True
                section["chapter_number"] = None
        # Log a per-type histogram for observability.
        types_count: dict[str, int] = {}
        for s in sections:
            t: str = s.get("type", "unknown")
            types_count[t] = types_count.get(t, 0) + 1
        logger.info(f"Classification terminée: {types_count}")
        return sections
    except Exception as e:
        logger.error(f"Erreur classification sections: {e}")
        # Fail open: on LLM failure, mark everything indexable rather than
        # silently dropping content.
        for section in sections:
            section["type"] = "chapter"
            section["should_index"] = True
        return sections
# Section titles that are always excluded from indexing (matched
# case-insensitively, as substrings of the section or chapter title).
EXCLUDED_SECTION_TITLES: Final[list[str]] = [
    "table des matières",
    "table des matieres",
    "sommaire",
    "table of contents",
    "contents",
    "toc",
    "index",
    "liste des figures",
    "liste des tableaux",
    "list of figures",
    "list of tables",
    "note de l'éditeur",
    "note de l'editeur",
    "note de la rédaction",
    "copyright",
    "mentions légales",
    "crédits",
    "colophon",
    "achevé d'imprimer",
]


def is_excluded_section(section: dict[str, Any]) -> bool:
    """Decide whether a section must be kept out of the index.

    A section is excluded when its own title (or its parent chapter's
    title) matches a known TOC/metadata pattern, or when its content
    looks like a bare list of headings rather than running prose.

    Args:
        section: Section dictionary; the optional keys "title",
            "chapterTitle" and "content" are inspected.

    Returns:
        True when the section should be excluded from indexing.

    Example:
        >>> is_excluded_section({"title": "Table des matières"})
        True
        >>> is_excluded_section({"title": "Introduction", "content": "..."})
        False
    """
    title: str = section.get("title", "").lower().strip()
    parent_title: str = section.get("chapterTitle", "").lower().strip()

    # Title-based exclusion: substring match against the known patterns.
    if any(pattern in title for pattern in EXCLUDED_SECTION_TITLES):
        return True
    if any(pattern in parent_title for pattern in EXCLUDED_SECTION_TITLES):
        return True

    body: str = section.get("content", "")
    if not body:
        return False

    lines: list[str] = [ln.strip() for ln in body.split("\n") if ln.strip()]
    if len(lines) < 3:
        # Too little text for a reliable TOC-shape diagnosis.
        return False

    # Heuristic 1: average line length (TOC lines are short).
    mean_length: float = sum(len(ln) for ln in lines) / len(lines)
    # Heuristic 2: every inspected line is short (< 100 chars).
    head_all_short: bool = all(len(ln) < 100 for ln in lines[:10])
    # Heuristic 3: lines shaped like French section headings.
    heading_patterns: list[str] = [
        r'^Comment\s+.+\?',            # "Comment fixer la croyance?"
        r'^Où\s+.+',                   # "Où la croyance s'oppose"
        r'^Les?\s+\w+\s+à\s+',         # "Les dispositions à penser"
        r'^Que\s+.+\?',                # "Que peut-on savoir?"
        r'^L[ae]\s+\w+\s+(de|du)\s+',  # "La critique de l'intuition"
        r'^Entre\s+.+\s+et\s+',        # "Entre nature et norme"
    ]
    heading_hits: int = 0
    for ln in lines[:10]:
        if any(re.match(p, ln, re.IGNORECASE) for p in heading_patterns):
            heading_hits += 1
    # Heuristic 4: conjugated verbs typical of narrative prose.
    narrative_verbs: list[str] = [
        r'\best\b', r'\bsont\b', r'\bétait\b', r'\bsera\b',
        r'\ba\b', r'\bont\b', r'\bavait\b', r'\bavaient\b',
        r'\bfait\b', r'\bdit\b', r'\bpense\b', r'\bexplique\b'
    ]
    has_narrative: bool = any(
        re.search(verb, ln, re.IGNORECASE)
        for ln in lines[:5]
        for verb in narrative_verbs
    )

    # Verdict: looks like a TOC when the lines are uniformly short AND
    # either many of them are heading-shaped or none read as prose.
    if len(lines) >= 5 and mean_length < 50 and head_all_short:
        if heading_hits >= len(lines) * 0.4 or not has_narrative:
            logger.debug(f"Section '{title}' exclue: ressemble à une TOC (lignes courtes, {heading_hits}/{len(lines)} titres)")
            return True
    return False
def filter_indexable_sections(sections: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Keep only the sections worth indexing.

    Three exclusion gates are applied, in order:
    1. Automatic title/content exclusion (is_excluded_section).
    2. Parent-chapter exclusion when the chapter title matches a
       TOC/metadata pattern.
    3. The LLM-assigned should_index flag (missing flag means keep).

    Args:
        sections: List of classified section dictionaries.

    Returns:
        New list with only the indexable sections, original order kept.

    Example:
        >>> sections = [
        ...     {"title": "TOC", "should_index": False},
        ...     {"title": "Chapter 1", "should_index": True}
        ... ]
        >>> filtered = filter_indexable_sections(sections)
        >>> len(filtered)
        1
    """
    keep: list[dict[str, Any]] = []
    dropped: int = 0
    for section in sections:
        # Gate 1: automatic exclusion rules.
        if is_excluded_section(section):
            logger.info(f"Section exclue automatiquement: '{section.get('title', 'Sans titre')}'")
            dropped += 1
            continue
        # Gate 2: parent chapter is itself a TOC/metadata section.
        parent: str = section.get("chapterTitle", "").lower().strip()
        if any(pattern in parent for pattern in EXCLUDED_SECTION_TITLES):
            logger.info(f"Section exclue (chapitre TOC): '{section.get('title', 'Sans titre')}' dans '{parent}'")
            dropped += 1
            continue
        # Gate 3: LLM classification verdict.
        if section.get("should_index", True):
            keep.append(section)
        else:
            dropped += 1
    if dropped > 0:
        logger.info(f"Sections exclues: {dropped}, indexables: {len(keep)}")
    return keep
def validate_classified_sections(sections: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Post-classification validation to detect false positives.

    Performs additional checks on sections marked should_index=True to catch
    TOC fragments that escaped initial classification:

    1. Parent chapter is a TOC -> exclude (reclassified as toc_display).
    2. Content is mostly short title-like lines -> reclassify as toc_display.

    Args:
        sections: List of already-classified section dictionaries. Sections
            are mutated in place when a correction applies.

    Returns:
        Validated sections with corrections applied. Corrections are logged
        and recorded in the 'validation_correction' field.

    Note:
        The parent-chapter title is read from both "chapter_title" and
        "chapterTitle": the rest of this module (is_excluded_section,
        filter_indexable_sections) uses the camelCase key, so checking
        only the snake_case key would make gate 1 a no-op on that data.

    Example:
        >>> sections = [{"title": "Part 1", "should_index": True, "content": "..."}]
        >>> validated = validate_classified_sections(sections)
        >>> # May reclassify sections with TOC-like content
    """
    validated: list[dict[str, Any]] = []
    fixed_count: int = 0
    for section in sections:
        # Accept both key spellings for the parent chapter title (see Note).
        chapter_title: str = (
            section.get("chapter_title") or section.get("chapterTitle") or ""
        ).lower().strip()
        # Check 1: exclude any section whose parent chapter is a TOC.
        is_toc_chapter: bool = False
        for excluded in EXCLUDED_SECTION_TITLES:
            if excluded in chapter_title:
                logger.warning(f"Section '{section.get('title', 'Sans titre')}' exclue: chapitre parent est '{chapter_title}'")
                section["should_index"] = False
                section["type"] = "toc_display"
                section["validation_correction"] = f"Exclue car chapitre parent = {chapter_title}"
                fixed_count += 1
                is_toc_chapter = True
                break
        if is_toc_chapter:
            validated.append(section)
            continue
        # Already marked non-indexable: keep as is.
        if not section.get("should_index", True):
            validated.append(section)
            continue
        content: str = section.get("content", "")
        # Check 2: content that is mostly short, heading-shaped lines is a
        # TOC fragment, not real prose.
        if content:
            lines: list[str] = [l.strip() for l in content.split("\n") if l.strip()]
            if len(lines) < 3:
                # Too little text to diagnose; keep the section.
                validated.append(section)
                continue
            title_question_pattern: str = r'^(Comment|Où|Que|Quelle|Quel|Les?\s+\w+\s+(de|du|à)|Entre\s+.+\s+et)\s+'
            title_like: int = sum(1 for l in lines if re.match(title_question_pattern, l, re.IGNORECASE))
            avg_len: float = sum(len(l) for l in lines) / len(lines)
            if len(lines) >= 4 and title_like >= len(lines) * 0.5 and avg_len < 55:
                # Probably a list of headings lifted from the TOC.
                logger.warning(f"Section '{section.get('title', 'Sans titre')}' reclassée: détectée comme liste de titres TOC")
                section["should_index"] = False
                section["type"] = "toc_display"
                section["validation_correction"] = "Reclassée comme toc_display (liste de titres)"
                fixed_count += 1
        validated.append(section)
    if fixed_count > 0:
        logger.info(f"Validation post-classification: {fixed_count} section(s) reclassée(s)")
    return validated
def get_chapter_sections(sections: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Return only the sections that carry main document content.

    Content-bearing types are: chapter, introduction, conclusion, abstract
    and preface. Everything else (TOC, bibliography, index, notes, front
    matter, ...) is dropped.

    Args:
        sections: List of classified section dictionaries.

    Returns:
        New list with the content-bearing sections, original order kept.

    Example:
        >>> sections = [
        ...     {"title": "TOC", "type": "toc_display"},
        ...     {"title": "Chapter 1", "type": "chapter"}
        ... ]
        >>> chapters = get_chapter_sections(sections)
        >>> len(chapters)
        1
    """
    content_types: set[str] = {"introduction", "preface", "chapter", "abstract", "conclusion"}
    selected: list[dict[str, Any]] = []
    for section in sections:
        if section.get("type") in content_types:
            selected.append(section)
    return selected

View File

@@ -0,0 +1,389 @@
"""Text cleaning and validation for OCR-extracted content.
This module provides utilities for cleaning OCR artifacts from extracted text,
validating chunk content, and optionally using LLM for intelligent corrections.
It handles common OCR issues like page markers, isolated page numbers,
repeated headers/footers, and character recognition errors.
Overview:
The module offers three levels of cleaning:
1. **Basic cleaning** (clean_page_markers, clean_ocr_artifacts):
Fast regex-based cleaning for common issues. Always applied.
2. **LLM-enhanced cleaning** (clean_content_with_llm):
Uses an LLM to correct subtle OCR errors while preserving meaning.
Only applied when explicitly requested and for medium-length texts.
3. **Validation** (is_chunk_valid):
Checks if a text chunk contains meaningful content.
Cleaning Operations:
- Remove page markers (<!-- Page X -->)
- Remove isolated page numbers
- Remove short/repetitive header/footer lines
- Normalize multiple spaces and blank lines
- Correct obvious OCR character errors (LLM mode)
- Preserve citations, technical vocabulary, paragraph structure
Validation Criteria:
- Minimum character count (default: 20)
- Minimum word count (default: 5)
- Not pure metadata (URLs, ISBNs, DOIs, copyright notices)
LLM Provider Support:
- ollama: Local LLM (free, slower, default)
- mistral: Mistral API (faster, requires API key)
Example:
>>> from utils.llm_cleaner import clean_chunk, is_chunk_valid
>>>
>>> # Clean a chunk with basic cleaning only
>>> text = "<!-- Page 42 --> Some philosophical content..."
>>> cleaned = clean_chunk(text)
>>> print(cleaned)
'Some philosophical content...'
>>>
>>> # Validate chunk before processing
>>> if is_chunk_valid(cleaned):
... process_chunk(cleaned)
See Also:
utils.llm_chunker: Semantic chunking of sections
utils.llm_validator: Document validation and concept extraction
utils.pdf_pipeline: Main pipeline orchestration
"""
from __future__ import annotations
import logging
import re
from typing import List, Optional, Pattern
from .llm_structurer import call_llm, _get_default_model, _get_default_mistral_model
from .types import LLMProvider
logger: logging.Logger = logging.getLogger(__name__)
# Type alias for compiled regex patterns
RegexPattern = Pattern[str]
def clean_page_markers(text: str) -> str:
    r"""Strip OCR page-boundary markers and collapse excess blank lines.

    The OCR stage inserts HTML comments of the form '<!-- Page 42 -->' to
    record page boundaries. This removes them, then squeezes any run of
    three or more newlines down to a single blank line.

    Args:
        text: Text possibly containing page markers and stacked newlines.

    Returns:
        Cleaned text, stripped of leading/trailing whitespace, with at most
        two consecutive newlines anywhere.

    Example:
        >>> text = "<!-- Page 1 -->\nContent here\n\n\n\n<!-- Page 2 -->"
        >>> clean_page_markers(text)
        'Content here'
    """
    without_markers: str = re.sub(r'<!--\s*Page\s*\d+\s*-->', '', text)
    collapsed: str = re.sub(r'\n{3,}', '\n\n', without_markers)
    return collapsed.strip()
def clean_ocr_artifacts(text: str) -> str:
    r"""Rule-based removal of common OCR artifacts (no LLM involved).

    Performs fast cleaning of typical OCR noise:
    - isolated page numbers (a line holding only 1-4 digits);
    - stray short lines (<= 3 chars) that are usually header/footer
      residue — except markdown headings starting with '#';
    - runs of spaces (collapsed to one);
    - runs of 3+ newlines (collapsed to a single blank line).

    Single blank lines are preserved as paragraph breaks.

    Args:
        text: Raw OCR-extracted text.

    Returns:
        Cleaned text with artifacts removed and spacing normalized,
        stripped of leading/trailing whitespace.

    Example:
        >>> text = "42\n\nActual content here\n\n\n\n\nMore text"
        >>> clean_ocr_artifacts(text)
        'Actual content here\n\nMore text'

    Note:
        Always applied by clean_chunk(), so a baseline of cleaning happens
        even when LLM cleaning is disabled.
    """
    # Blank out isolated page numbers first.
    text = re.sub(r'^\d{1,4}\s*$', '', text, flags=re.MULTILINE)
    kept: List[str] = []
    for raw_line in text.split('\n'):
        trimmed: str = raw_line.strip()
        if not trimmed:
            kept.append('')  # keep single blank lines as paragraph breaks
        elif len(trimmed) > 3 or trimmed.startswith('#'):
            kept.append(raw_line)
        # else: short stray line (likely header/footer residue) -> drop
    text = '\n'.join(kept)
    text = re.sub(r' {2,}', ' ', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
def clean_content_with_llm(
    text: str,
    context: Optional[str] = None,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.1,
) -> str:
    """Correct subtle OCR errors with an LLM, with rule-based fallbacks.

    The model is told to fix obvious technical errors (misrecognized
    characters, leftover page artifacts) while leaving the intellectual
    content untouched: no rephrasing, no summarizing, no additions.

    Safeguards:
    - texts shorter than 50 chars (stripped) get only page-marker cleanup;
    - texts longer than 3000 chars get only rule-based cleanup
      (LLM timeout risk);
    - if the LLM output differs in length by more than 50% from the
      input, it is discarded in favor of the rule-based result.

    Args:
        text: Text to clean (50-3000 chars for LLM processing).
        context: Optional document context (title, subject) to help the
            model correct ambiguous characters.
        model: LLM model name; resolved to the provider default when None.
        provider: LLM provider ("ollama" or "mistral").
        temperature: Sampling temperature; low values give deterministic
            corrections. Defaults to 0.1.

    Returns:
        Cleaned text. Never raises: any LLM failure falls back to the
        rule-based cleaning functions.
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    # Guard: too little text to bother the LLM with.
    if len(text.strip()) < 50:
        return clean_page_markers(text)
    # Guard: too much text for one LLM call — rule-based cleaning only.
    if len(text) > 3000:
        return clean_page_markers(clean_ocr_artifacts(text))
    context_info: str = f"Contexte: {context}\n" if context else ""
    prompt: str = f"""Tu es un expert en correction de textes OCRisés.
TÂCHE: Nettoyer ce texte extrait par OCR.
{context_info}
ACTIONS À EFFECTUER:
1. Supprimer les marqueurs de page (<!-- Page X -->)
2. Corriger les erreurs OCR ÉVIDENTES (caractères mal reconnus)
3. Supprimer les artefacts (numéros de page isolés, en-têtes répétés)
4. Normaliser la ponctuation et les espaces
RÈGLES STRICTES:
- NE PAS modifier le sens ou le contenu intellectuel
- NE PAS reformuler ou résumer
- NE PAS ajouter de contenu
- Préserver les citations et le vocabulaire technique
- Garder la structure des paragraphes
TEXTE À NETTOYER:
{text}
RÉPONDS UNIQUEMENT avec le texte nettoyé, sans commentaires ni balises."""
    try:
        cleaned: str = call_llm(
            prompt, model=model, provider=provider, temperature=temperature, timeout=120
        ).strip()
        # Sanity check: reject outputs whose length drifted too far from
        # the input — the model probably rewrote instead of cleaning.
        if not (len(text) * 0.5 <= len(cleaned) <= len(text) * 1.5):
            logger.warning("LLM a trop modifié le texte, utilisation du nettoyage basique")
            return clean_page_markers(clean_ocr_artifacts(text))
        return cleaned
    except Exception as e:
        logger.warning(f"Erreur nettoyage LLM: {e}, utilisation du nettoyage basique")
        return clean_page_markers(clean_ocr_artifacts(text))
def clean_chunk(
    chunk_text: str,
    use_llm: bool = False,
    context: Optional[str] = None,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
) -> str:
    r"""Clean a text chunk, optionally finishing with an LLM pass.

    Main entry point for chunk cleaning. Page markers and OCR artifacts
    are always removed; when ``use_llm`` is True and the cleaned text is
    at least 50 characters long, an LLM pass corrects subtler errors.

    Args:
        chunk_text: Raw chunk text.
        use_llm: Enable the (slower, higher-quality) LLM pass.
            Defaults to False.
        context: Document context (title, chapter) forwarded to the LLM;
            ignored when use_llm is False.
        model: LLM model name (provider default when None); ignored when
            use_llm is False.
        provider: LLM provider ("ollama" or "mistral"); ignored when
            use_llm is False.

    Returns:
        Cleaned chunk text, ready for indexing.

    Example:
        >>> chunk = "<!-- Page 5 -->\n42\n\nThe concept of being..."
        >>> clean_chunk(chunk)
        'The concept of being...'

    See Also:
        is_chunk_valid: Validate cleaned chunks before processing.
    """
    # Rule-based cleaning is always applied.
    cleaned: str = clean_ocr_artifacts(clean_page_markers(chunk_text))
    # Optional LLM pass for texts with enough material.
    if use_llm and len(cleaned) >= 50:
        cleaned = clean_content_with_llm(cleaned, context=context, model=model, provider=provider)
    return cleaned
def is_chunk_valid(chunk_text: str, min_chars: int = 20, min_words: int = 5) -> bool:
    """Check if a text chunk contains meaningful content.

    Validates that a chunk has sufficient length and is not purely
    metadata or boilerplate content. Used to filter out non-content
    chunks before indexing.

    Validation criteria:
        1. Character count >= min_chars (after page marker removal)
        2. Word count >= min_words
        3. Not matching metadata patterns (URLs, ISBNs, DOIs, dates, copyright)

    Args:
        chunk_text: Text content of the chunk to validate. Page markers
            are removed before validation.
        min_chars: Minimum number of characters required. Defaults to 20.
            Chunks shorter than this are considered invalid.
        min_words: Minimum number of words required. Defaults to 5.
            Chunks with fewer words are considered invalid.

    Returns:
        True if the chunk passes all validation criteria and contains
        meaningful content suitable for indexing. False otherwise.

    Example:
        >>> is_chunk_valid("The concept of Dasein is central to Heidegger.")
        True
        >>> is_chunk_valid("42")  # Too short
        False
        >>> is_chunk_valid("ISBN 978-0-123456-78-9")  # Metadata
        False
        >>> is_chunk_valid("https://example.com/page")  # URL
        False

    Note:
        Metadata patterns checked:
        - URLs (http://, https://)
        - Dates (YYYY-MM-DD format)
        - ISBN numbers
        - DOI identifiers
        - Copyright notices (©)
    """
    text: str = clean_page_markers(chunk_text).strip()
    # Length check.
    if len(text) < min_chars:
        return False
    # Word-count check.
    words: List[str] = text.split()
    if len(words) < min_words:
        return False
    # Reject chunks that are pure metadata.
    # BUGFIX: the last entry used to be the empty pattern r'', and
    # re.match(r'', text) matches EVERY string, so every chunk was
    # rejected. Restored the copyright pattern documented above.
    metadata_patterns: List[str] = [
        r'^https?://',
        r'^\d{4}-\d{2}-\d{2}$',
        r'^ISBN',
        r'^DOI',
        r'^©',
    ]
    pattern: str
    for pattern in metadata_patterns:
        if re.match(pattern, text, re.IGNORECASE):
            return False
    return True

View File

@@ -0,0 +1,294 @@
r"""LLM-based bibliographic metadata extraction from documents.
This module extracts bibliographic metadata (title, author, publisher, year, etc.)
from document text using Large Language Models. It supports both local (Ollama)
and cloud-based (Mistral API) LLM providers.
The extraction process:
1. Takes the first N characters of the document markdown (typically first pages)
2. Sends a structured prompt to the LLM requesting JSON-formatted metadata
3. Parses the LLM response to extract the JSON data
4. Applies default values and cleanup for missing/invalid fields
Supported metadata fields:
- title: Document title (including subtitle if present)
- author: Primary author name
- collection: Series or collection name
- publisher: Publisher name
- year: Publication year
- doi: Digital Object Identifier
- isbn: ISBN number
- language: ISO 639-1 language code (default: "fr")
- confidence: Dict of confidence scores per field (0.0-1.0)
LLM Provider Differences:
- **Ollama** (local): Free, slower, requires local installation.
Uses models like "mistral", "llama2", "mixtral".
- **Mistral API** (cloud): Fast, paid (~0.002€/call for small prompts).
Uses models like "mistral-small-latest", "mistral-medium-latest".
Cost Implications:
- Ollama: No API cost, only local compute resources
- Mistral API: ~0.002€ per metadata extraction call (small prompt)
Example:
>>> from utils.llm_metadata import extract_metadata
>>>
>>> markdown = '''
... # La technique et le temps
... ## Tome 1 : La faute d'Épiméthée
...
... Bernard Stiegler
...
... Éditions Galilée, 1994
... '''
>>>
>>> metadata = extract_metadata(markdown, provider="ollama")
>>> print(metadata)
{
'title': 'La technique et le temps. Tome 1 : La faute d\'Épiméthée',
'author': 'Bernard Stiegler',
'publisher': 'Éditions Galilée',
'year': 1994,
'language': 'fr',
'confidence': {'title': 0.95, 'author': 0.98}
}
See Also:
- llm_toc: Table of contents extraction via LLM
- llm_structurer: Core LLM call infrastructure
- pdf_pipeline: Orchestration using this module (Step 4)
"""
import json
import logging
import re
from typing import Any, Dict, Optional
from .llm_structurer import (
_clean_json_string,
_get_default_mistral_model,
_get_default_model,
call_llm,
)
from .types import LLMProvider
logger: logging.Logger = logging.getLogger(__name__)
def _extract_json_from_response(text: str) -> Dict[str, Any]:
    """Extract JSON data from an LLM response string.

    Two strategies are tried in order:
        1. Content between <JSON></JSON> tags (preferred format).
        2. The widest {...} span found in the response.

    JSON string cleaning is applied before parsing to cope with common
    LLM quirks (control characters, stray whitespace).

    Args:
        text: Raw LLM response text that may contain JSON data.

    Returns:
        Parsed JSON as a dictionary, or an empty dict when no valid
        JSON could be extracted.

    Example:
        >>> _extract_json_from_response('<JSON>{"title": "Test"}</JSON>')
        {'title': 'Test'}
        >>> _extract_json_from_response('metadata: {"title": "Test"}')
        {'title': 'Test'}
    """
    # Strategy 1: explicit <JSON></JSON> tags.
    tagged: Optional[re.Match[str]] = re.search(r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL)
    if tagged is not None:
        candidate: str = _clean_json_string(tagged.group(1))
        try:
            parsed: Dict[str, Any] = json.loads(candidate)
            return parsed
        except json.JSONDecodeError:
            pass  # fall through to the brace-based strategy
    # Strategy 2: widest brace-delimited span.
    opening: int = text.find("{")
    closing: int = text.rfind("}")
    if opening != -1 and closing > opening:
        candidate = _clean_json_string(text[opening:closing + 1])
        try:
            parsed = json.loads(candidate)
            return parsed
        except json.JSONDecodeError as e:
            logger.warning(f"JSON invalide: {e}")
    return {}
def extract_metadata(
    markdown: str,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.1,
    max_chars: int = 6000,
) -> Dict[str, Any]:
    """Extract bibliographic metadata from a document using an LLM.

    Analyzes the beginning of a document (typically first few pages) to extract
    bibliographic metadata including title, author, publisher, year, and more.
    Uses a structured prompt that guides the LLM to distinguish between
    document title vs. collection name vs. publisher name.

    The LLM is instructed to return confidence scores for extracted fields,
    allowing downstream processing to handle uncertain extractions appropriately.

    Args:
        markdown: Document text in Markdown format. For best results, provide
            at least the first 2-3 pages containing title page and colophon.
        model: LLM model name to use. If None, uses the default model for the
            selected provider.
        provider: LLM provider, "ollama" (local, free) or "mistral" (API, paid).
        temperature: Model temperature. Lower values (0.0-0.3) produce more
            consistent, deterministic results. Default 0.1.
        max_chars: Maximum number of characters sent to the LLM. Longer
            documents are truncated. Default 6000 (~2 pages).

    Returns:
        Dictionary with keys: title, author, collection, publisher, year,
        doi, isbn (all possibly None), language (default "fr"),
        confidence (dict of per-field scores, always present, possibly
        empty), and error (message string, only on failure).

    Raises:
        No exceptions are raised; errors are captured in the return dict.

    Note:
        - Cost for Mistral API: ~0.002€ per call (6000 chars input)
        - The prompt is in French as most processed documents are French texts
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    # Metadata usually sits on the first pages: keep only the head of the text.
    content: str = markdown[:max_chars]
    if len(markdown) > max_chars:
        content += "\n\n[... document tronqué ...]"
    prompt: str = f"""Tu es un expert en bibliographie et édition scientifique.
TÂCHE: Extraire les métadonnées bibliographiques de ce document.
ATTENTION - PIÈGES COURANTS:
- Le titre n'est PAS forcément le premier titre H1 (peut être le nom de la collection)
- Le sous-titre fait partie du titre
- L'auteur peut apparaître sous le titre, dans les métadonnées éditeur, ou ailleurs
- Distingue bien: titre de l'œuvre ≠ nom de la collection/série ≠ nom de l'éditeur
INDICES POUR TROUVER LE VRAI TITRE:
- Souvent en plus grand / plus visible
- Accompagné du nom de l'auteur juste après
- Répété sur la page de garde et la page de titre
- Peut contenir un sous-titre après ":"
IMPORTANT - FORMAT DES DONNÉES:
- N'ajoute JAMAIS d'annotations comme "(correct)", "(à confirmer)", "(possiblement)", etc.
- Retourne uniquement les noms propres et titres sans commentaires
- NE METS PAS de phrases comme "À confirmer avec...", "Vérifier si...", "Possiblement..."
- Le champ "confidence" sert à exprimer ton niveau de certitude
- Si tu n'es pas sûr du titre, mets le titre le plus probable ET un confidence faible
- EXEMPLE CORRECT: "title": "La pensée-signe" avec "confidence": {{"title": 0.6}}
- EXEMPLE INCORRECT: "title": "À confirmer avec le titre exact"
RÉPONDS UNIQUEMENT avec un JSON entre balises <JSON></JSON>:
<JSON>
{{
"title": "Le vrai titre de l'œuvre (avec sous-titre si présent)",
"author": "Prénom Nom de l'auteur principal",
"collection": "Nom de la collection ou série (null si absent)",
"publisher": "Nom de l'éditeur",
"year": 2023,
"doi": "10.xxxx/xxxxx (null si absent)",
"isbn": "978-x-xxxx-xxxx-x (null si absent)",
"language": "fr",
"confidence": {{
"title": 0.95,
"author": 0.90
}}
}}
</JSON>
DOCUMENT À ANALYSER:
{content}
Réponds UNIQUEMENT avec le JSON."""
    logger.info(f"Extraction métadonnées via {provider.upper()} ({model})")
    try:
        response: str = call_llm(prompt, model=model, provider=provider, temperature=temperature)
        metadata: Dict[str, Any] = _extract_json_from_response(response)
        # Fill in defaults for fields the LLM omitted or left empty.
        # FIX: annotated Dict[str, Any] (was Dict[str, Optional[str]]) —
        # values are heterogeneous (str, int, None).
        defaults: Dict[str, Any] = {
            "title": None,
            "author": None,
            "collection": None,
            "publisher": None,
            "year": None,
            "doi": None,
            "isbn": None,
            "language": "fr",
        }
        for key, default in defaults.items():
            if key not in metadata or metadata[key] == "":
                metadata[key] = default
        # Normalize literal "null"/"None" strings produced by some models.
        for key in metadata:
            if metadata[key] == "null" or metadata[key] == "None":
                metadata[key] = None
        # FIX: guarantee the documented "confidence" mapping is always present
        # (previously absent when the LLM omitted it).
        if not isinstance(metadata.get("confidence"), dict):
            metadata["confidence"] = {}
        logger.info(f"Métadonnées extraites: titre='{metadata.get('title')}', auteur='{metadata.get('author')}'")
        return metadata
    except Exception as e:
        logger.error(f"Erreur extraction métadonnées: {e}")
        return {
            "title": None,
            "author": None,
            "collection": None,
            "publisher": None,
            "year": None,
            "doi": None,
            "isbn": None,
            "language": "fr",
            "confidence": {},
            "error": str(e),
        }

View File

@@ -0,0 +1,583 @@
"""Structuration de documents via LLM (Ollama ou Mistral API)."""
from __future__ import annotations
import json
import logging
import os
import re
import time
from typing import Any, Dict, List, Optional, TypedDict, Union, cast
import requests
from dotenv import load_dotenv
import threading
# Import type definitions from central types module
from utils.types import LLMCostStats
# Charger les variables d'environnement
load_dotenv()
# Module logger; install a basic root configuration only when the host
# application has not configured logging itself (hasHandlers() is False),
# so library users keep full control over handlers.
logger: logging.Logger = logging.getLogger(__name__)
if not logging.getLogger().hasHandlers():
    logging.basicConfig(
        level=logging.INFO,
        format="[%(asctime)s] %(levelname)s %(message)s"
    )
class LLMStructureError(RuntimeError):
    """Raised when LLM-based document structuring fails (call, timeout, or parse errors)."""
    pass
# ═══════════════════════════════════════════════════════════════════════════════
# TypedDict Definitions
# ═══════════════════════════════════════════════════════════════════════════════
class MistralPricingEntry(TypedDict):
    """Mistral API pricing per million tokens (EUR)."""
    input: float  # price per 1M prompt (input) tokens
    output: float  # price per 1M completion (output) tokens
class LLMHierarchyPath(TypedDict, total=False):
    """Hierarchy path in structured output.

    All keys are optional (total=False); a missing or None level means
    the element is not nested that deep.
    """
    part: Optional[str]  # top-level part title
    chapter: Optional[str]  # chapter title within the part
    section: Optional[str]  # section title within the chapter
    subsection: Optional[str]  # subsection title within the section
class LLMChunkOutput(TypedDict, total=False):
    """Single chunk in LLM structured output."""
    chunk_id: str  # identifier such as "chunk_00001" (see prompt format)
    text: str  # chunk content
    hierarchy: LLMHierarchyPath  # position of the chunk in the document
    type: str  # e.g. "main_content"
    is_toc: bool  # True when the chunk belongs to a table of contents
class LLMDocumentSection(TypedDict, total=False):
    """Document section in structured output."""
    path: LLMHierarchyPath  # hierarchical location of the section
    type: str  # e.g. "main_content"
    page_start: int  # first page of the section
    page_end: int  # last page of the section
class LLMStructuredResult(TypedDict, total=False):
    """Result from LLM document structuring."""
    document_structure: List[LLMDocumentSection]  # hierarchical outline
    chunks: List[LLMChunkOutput]  # flat list of content chunks
class OllamaResultContainer(TypedDict):
    """Container for an Ollama call result shared with a worker thread (internal use)."""
    response: Optional[str]  # LLM response text, None until available
    error: Optional[Exception]  # exception raised by the worker, if any
    done: bool  # True once the worker thread has finished
# ═══════════════════════════════════════════════════════════════════════════════
# Configuration
# ═══════════════════════════════════════════════════════════════════════════════
def _get_ollama_url() -> str:
"""Retourne l'URL de base d'Ollama."""
return os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
def _get_default_model() -> str:
"""Retourne le modèle LLM par défaut."""
return os.getenv("STRUCTURE_LLM_MODEL", "qwen2.5:7b")
def _get_mistral_api_key() -> Optional[str]:
"""Retourne la clé API Mistral."""
return os.getenv("MISTRAL_API_KEY")
def _get_default_mistral_model() -> str:
"""Retourne le modèle Mistral par défaut pour les tâches LLM."""
return os.getenv("MISTRAL_LLM_MODEL", "mistral-small-latest")
# ═══════════════════════════════════════════════════════════════════════════════
# Appel Mistral API (rapide, cloud) avec tracking des coûts
# ═══════════════════════════════════════════════════════════════════════════════
# Mistral API prices in EUR per million tokens.
MISTRAL_PRICING: Dict[str, MistralPricingEntry] = {
    "mistral-small-latest": {"input": 0.2, "output": 0.6},
    "mistral-medium-latest": {"input": 0.8, "output": 2.4},
    "mistral-large-latest": {"input": 2.0, "output": 6.0},
    # Fallback entry used for any model not listed above.
    "default": {"input": 0.5, "output": 1.5},
}
# Global cost accumulator; thread-local so concurrent pipelines do not
# corrupt each other's running totals.
_cost_tracker: threading.local = threading.local()
def reset_llm_cost() -> None:
    """Reset the accumulated LLM cost statistics for the current thread to zero."""
    _cost_tracker.total_cost = 0.0
    # Integer counters all reset the same way.
    for counter in ("total_input_tokens", "total_output_tokens", "calls_count"):
        setattr(_cost_tracker, counter, 0)
def get_llm_cost() -> LLMCostStats:
    """Return a snapshot of the LLM cost statistics accumulated for this thread.

    Missing attributes (tracker never initialised) fall back to zero values.
    """
    snapshot: Dict[str, Any] = {
        "total_cost": getattr(_cost_tracker, "total_cost", 0.0),
        "total_input_tokens": getattr(_cost_tracker, "total_input_tokens", 0),
        "total_output_tokens": getattr(_cost_tracker, "total_output_tokens", 0),
        "calls_count": getattr(_cost_tracker, "calls_count", 0),
    }
    return cast(LLMCostStats, snapshot)
def _calculate_mistral_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Return the cost in euros of a single Mistral API call.

    Unknown models fall back to the "default" pricing entry.
    """
    rates: MistralPricingEntry = MISTRAL_PRICING.get(model, MISTRAL_PRICING["default"])
    weighted_tokens: float = input_tokens * rates["input"] + output_tokens * rates["output"]
    return weighted_tokens / 1_000_000
def _call_mistral_api(
    prompt: str,
    model: str = "mistral-small-latest",
    temperature: float = 0.2,
    max_tokens: int = 4096,
    timeout: int = 120,
) -> str:
    """Call the Mistral chat-completions API and return the response text.

    Available models (fastest to most powerful):
    - mistral-small-latest: fast, cheap (~0.2 EUR/M input tokens)
    - mistral-medium-latest: balanced (~0.8 EUR/M input tokens)
    - mistral-large-latest: powerful (~2 EUR/M input tokens)

    Args:
        prompt: Prompt to send (single user message).
        model: Mistral model name.
        temperature: Sampling temperature (0-1).
        max_tokens: Maximum number of completion tokens.
        timeout: HTTP timeout in seconds.

    Returns:
        Text content of the LLM response.

    Raises:
        LLMStructureError: On missing API key, timeout, HTTP error, or any
            other failure while calling the API.
    """
    api_key: Optional[str] = _get_mistral_api_key()
    if not api_key:
        raise LLMStructureError("MISTRAL_API_KEY non définie dans .env")
    logger.info(f"Appel Mistral API - modèle: {model}")
    url: str = "https://api.mistral.ai/v1/chat/completions"
    headers: Dict[str, str] = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    payload: Dict[str, Any] = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    try:
        start: float = time.time()
        response: requests.Response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        elapsed: float = time.time() - start
        response.raise_for_status()
        data: Dict[str, Any] = response.json()
        # NOTE(review): assumes at least one entry in "choices"; an empty list
        # raises IndexError here, converted to LLMStructureError below.
        content: str = data.get("choices", [{}])[0].get("message", {}).get("content", "")
        usage: Dict[str, Any] = data.get("usage", {})
        input_tokens: int = usage.get("prompt_tokens", 0)
        output_tokens: int = usage.get("completion_tokens", 0)
        # Compute this call's cost (EUR, per MISTRAL_PRICING) and accumulate it.
        call_cost: float = _calculate_mistral_cost(model, input_tokens, output_tokens)
        # Lazily initialise the thread-local tracker on first use in this thread.
        if not hasattr(_cost_tracker, "total_cost"):
            reset_llm_cost()
        _cost_tracker.total_cost += call_cost
        _cost_tracker.total_input_tokens += input_tokens
        _cost_tracker.total_output_tokens += output_tokens
        _cost_tracker.calls_count += 1
        # call_cost is in EUR; the log line prints the bare number.
        logger.info(f"Mistral API terminé en {elapsed:.1f}s - {input_tokens}+{output_tokens} tokens = {call_cost:.6f}")
        return content
    except requests.exceptions.Timeout:
        raise LLMStructureError(f"Timeout Mistral API ({timeout}s)")
    except requests.exceptions.HTTPError as e:
        raise LLMStructureError(f"Erreur HTTP Mistral: {e}")
    except Exception as e:
        raise LLMStructureError(f"Erreur Mistral API: {e}")
def _prepare_prompt(
    markdown: str,
    hierarchy: Dict[str, Any],
    max_chars: int = 8000,
) -> str:
    """Build the structuring prompt sent to the LLM.

    Args:
        markdown: Markdown text of the document (truncated to ``max_chars``).
        hierarchy: Initial hierarchical structure (from build_hierarchy).
        max_chars: Maximum number of Markdown characters to embed.

    Returns:
        Prompt string instructing the LLM to answer with JSON between
        <JSON></JSON> tags ("document_structure" + "chunks").
    """
    # Truncate the Markdown when it exceeds the character budget.
    truncated: str = markdown[:max_chars]
    if len(markdown) > max_chars:
        truncated += f"\n\n... [tronqué à {max_chars} caractères]"
    # Serialize the initial hierarchy as pretty-printed JSON for the prompt.
    outline_json: str = json.dumps(hierarchy, ensure_ascii=False, indent=2)
    prompt: str = f"""Tu es un expert en édition scientifique chargé d'analyser la structure logique d'un document.
IMPORTANT: Réponds UNIQUEMENT avec un objet JSON valide. Pas de texte avant ou après.
À partir du Markdown OCRisé et d'un premier découpage hiérarchique, tu dois :
1. Identifier les parties liminaires (préface, introduction...), le corps du document (parties, chapitres, sections) et les parties finales (conclusion, annexes, bibliographie...).
2. Reconstruire l'organisation réelle du texte.
3. Produire un JSON avec :
- "document_structure": vue hiérarchique du document
- "chunks": liste des chunks avec chunk_id, text, hierarchy, type
FORMAT DE RÉPONSE (entre balises <JSON></JSON>):
<JSON>
{{
"document_structure": [
{{
"path": {{"part": "Titre"}},
"type": "main_content",
"page_start": 1,
"page_end": 10
}}
],
"chunks": [
{{
"chunk_id": "chunk_00001",
"text": "Contenu...",
"hierarchy": {{
"part": "Titre partie",
"chapter": "Titre chapitre",
"section": null,
"subsection": null
}},
"type": "main_content",
"is_toc": false
}}
]
}}
</JSON>
### Hiérarchie initiale
{outline_json}
### Markdown OCR
{truncated}
Réponds UNIQUEMENT avec le JSON entre <JSON> et </JSON>."""
    return prompt.strip()
def _call_ollama(
    prompt: str,
    model: str,
    base_url: Optional[str] = None,
    temperature: float = 0.2,
    timeout: int = 300,
) -> str:
    """Call Ollama to generate a response.

    Tries the ``ollama`` Python SDK first (with a thread-based timeout,
    since the SDK exposes none), then falls back to the raw HTTP API with
    retry and exponential backoff.

    Args:
        prompt: Prompt to send.
        model: Ollama model name.
        base_url: Ollama base URL (env/localhost default when None; only
            used on the HTTP fallback path).
        temperature: Model temperature.
        timeout: Timeout in seconds, applied to the SDK join and to each
            HTTP attempt.

    Returns:
        Text response from the LLM.

    Raises:
        LLMStructureError: When both the SDK path and the HTTP fallback fail.
    """
    # First try the ollama SDK.
    try:
        import ollama
        logger.info(f"Appel Ollama SDK - modèle: {model}, timeout: {timeout}s")
        # The ollama SDK has no timeout parameter, so the call runs in a
        # daemon thread joined with a deadline; results come back through
        # result_container.
        result_container: OllamaResultContainer = {"response": None, "error": None, "done": False}
        def _run_ollama_call() -> None:
            # Worker executed in the background thread.
            try:
                resp: Any
                if hasattr(ollama, "generate"):
                    resp = ollama.generate(
                        model=model,
                        prompt=prompt,
                        stream=False,
                        options={"temperature": temperature}
                    )
                    # Normalise the various SDK return shapes to a string.
                    if isinstance(resp, dict):
                        result_container["response"] = resp.get("response", json.dumps(resp))
                    elif hasattr(resp, "response"):
                        result_container["response"] = resp.response
                    else:
                        result_container["response"] = str(resp)
                else:
                    # Older SDKs: fall back to the chat endpoint.
                    resp = ollama.chat(
                        model=model,
                        messages=[{"role": "user", "content": prompt}],
                        options={"temperature": temperature}
                    )
                    if isinstance(resp, dict):
                        result_container["response"] = resp.get("message", {}).get("content", str(resp))
                    else:
                        result_container["response"] = str(resp)
                result_container["done"] = True
            except Exception as e:
                result_container["error"] = e
                result_container["done"] = True
        thread: threading.Thread = threading.Thread(target=_run_ollama_call, daemon=True)
        thread.start()
        thread.join(timeout=timeout)
        # NOTE: the daemon thread keeps running after a timeout; only the
        # join gives up. The raises below are caught by the broad
        # ``except Exception`` handler, so SDK failures (including this
        # timeout) fall through to the HTTP path instead of propagating.
        if not result_container["done"]:
            raise LLMStructureError(f"Timeout Ollama SDK après {timeout}s (modèle: {model})")
        if result_container["error"]:
            raise result_container["error"]
        if result_container["response"]:
            return result_container["response"]
        raise LLMStructureError("Aucune réponse du SDK Ollama")
    except ImportError:
        logger.info("SDK ollama non disponible, utilisation de l'API HTTP")
    except Exception as e:
        logger.warning(f"Erreur SDK ollama: {e}, fallback HTTP")
    # HTTP fallback.
    base: str = base_url or _get_ollama_url()
    url: str = f"{base.rstrip('/')}/api/generate"
    payload: Dict[str, Any] = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": temperature},
    }
    # Retry with exponential backoff (1s, then 2s).
    max_retries: int = 2
    backoff: float = 1.0
    for attempt in range(max_retries + 1):
        try:
            logger.info(f"Appel HTTP Ollama (tentative {attempt + 1})")
            response: requests.Response = requests.post(url, json=payload, timeout=timeout)
            if response.status_code != 200:
                # Non-200 is not retried: raised as LLMStructureError, which
                # the except below (requests.RequestException) does not catch.
                raise LLMStructureError(
                    f"Erreur Ollama ({response.status_code}): {response.text}"
                )
            data: Dict[str, Any] = response.json()
            if "response" not in data:
                raise LLMStructureError(f"Réponse Ollama inattendue: {data}")
            return cast(str, data["response"])
        except requests.RequestException as e:
            if attempt < max_retries:
                time.sleep(backoff)
                backoff *= 2
                continue
            raise LLMStructureError(f"Impossible de contacter Ollama: {e}") from e
    raise LLMStructureError("Échec après plusieurs tentatives")
# ═══════════════════════════════════════════════════════════════════════════════
# Fonction générique d'appel LLM
# ═══════════════════════════════════════════════════════════════════════════════
def call_llm(
    prompt: str,
    model: Optional[str] = None,
    provider: str = "ollama",
    temperature: float = 0.2,
    timeout: int = 300,
) -> str:
    """Dispatch a prompt to an LLM backend (local Ollama or Mistral API).

    Args:
        prompt: Prompt to send.
        model: Model name; the provider's default is used when None.
        provider: "ollama" (local, slow, free) or "mistral" (API, fast, paid).
        temperature: Model temperature.
        timeout: Timeout in seconds.

    Returns:
        Text response from the LLM.
    """
    if provider == "mistral":
        # Cloud path: fast, billed per token.
        return _call_mistral_api(
            prompt,
            model=model or _get_default_mistral_model(),
            temperature=temperature,
            timeout=timeout,
        )
    # Local path: slower but free.
    return _call_ollama(
        prompt,
        model=model or _get_default_model(),
        temperature=temperature,
        timeout=timeout,
    )
def _clean_json_string(json_str: str) -> str:
"""Nettoie une chaîne JSON des caractères de contrôle invalides.
Stratégie robuste : Remplace TOUS les caractères de contrôle (x00-x1f)
par des espaces, puis réduit les espaces multiples. Cela évite les erreurs
"Invalid control character" de json.loads().
"""
# Remplacer tous les caractères de contrôle par des espaces
cleaned: str = re.sub(r'[\x00-\x1f]', ' ', json_str)
# Réduire les espaces multiples
cleaned = re.sub(r'\s+', ' ', cleaned)
return cleaned
def _extract_json(text: str) -> LLMStructuredResult:
    """Extract the structured JSON payload from an LLM response.

    Args:
        text: Raw textual response from the LLM.

    Returns:
        Parsed JSON dictionary containing at least a "chunks" key.

    Raises:
        LLMStructureError: If no JSON is found, if the JSON is invalid, or
            if the parsed object lacks a "chunks" key.
    """
    # Preferred path: payload between <JSON> and </JSON> tags.
    json_start: int = text.find("<JSON>")
    json_end: int = text.find("</JSON>")
    if json_start != -1 and json_end != -1 and json_end > json_start:
        json_content: str = text[json_start + 6:json_end].strip()  # 6 == len("<JSON>")
        json_content = _clean_json_string(json_content)
        try:
            result: Dict[str, Any] = json.loads(json_content)
            if "chunks" not in result:
                # Propagates: the except below only catches JSONDecodeError.
                raise LLMStructureError(
                    f"JSON sans clé 'chunks'. Clés: {list(result.keys())}"
                )
            return cast(LLMStructuredResult, result)
        except json.JSONDecodeError:
            pass  # fall through to the brace-based fallback below
    # Fallback: widest {...} span in the response.
    start: int = text.find("{")
    end: int = text.rfind("}")
    if start == -1 or end == -1 or end <= start:
        raise LLMStructureError(
            f"Pas de JSON trouvé dans la réponse.\nDébut: {text[:500]}"
        )
    json_str: str = _clean_json_string(text[start:end + 1])
    try:
        result = json.loads(json_str)
        if "chunks" not in result:
            raise LLMStructureError(
                f"JSON sans clé 'chunks'. Clés: {list(result.keys())}"
            )
        return cast(LLMStructuredResult, result)
    except json.JSONDecodeError as e:
        raise LLMStructureError(f"JSON invalide: {e}\nContenu: {json_str[:500]}") from e
def structure_with_llm(
    markdown: str,
    hierarchy: Dict[str, Any],
    model: Optional[str] = None,
    base_url: Optional[str] = None,
    temperature: float = 0.2,
    max_chars: int = 8000,
    timeout: int = 300,
) -> LLMStructuredResult:
    """Refine a document's structure by delegating to an Ollama model.

    Args:
        markdown: Markdown text of the document.
        hierarchy: Initial hierarchical structure (from build_hierarchy).
        model: Ollama model to use (env default when None).
        base_url: Ollama base URL.
        temperature: Model temperature.
        max_chars: Maximum number of Markdown characters embedded in the prompt.
        timeout: Timeout in seconds.

    Returns:
        Improved structure with "document_structure" and "chunks".

    Raises:
        LLMStructureError: On call failure or unparsable response.
    """
    chosen_model: str = model or _get_default_model()
    logger.info(f"Structuration LLM - modèle: {chosen_model}")
    # Build the prompt, query the local LLM, then parse its JSON answer.
    llm_prompt: str = _prepare_prompt(markdown, hierarchy, max_chars)
    raw_answer: str = _call_ollama(
        llm_prompt,
        model=chosen_model,
        base_url=base_url,
        temperature=temperature,
        timeout=timeout,
    )
    return _extract_json(raw_answer)

View File

@@ -0,0 +1,420 @@
"""LLM-based Table of Contents (TOC) extraction module.
This module provides functionality to extract hierarchical table of contents
from markdown documents using Large Language Models. It intelligently parses
document structure and creates both hierarchical and flat representations
of the TOC.
Key Features:
- Hierarchical TOC extraction with chapters, sections, and subsections
- Flat TOC generation with full paths for navigation
- Content-to-TOC matching for associating sections with TOC entries
- Support for multiple LLM providers (Ollama local, Mistral API)
TOC Structure Levels:
- Level 1: Introduction, main chapters, Conclusion, Bibliography
- Level 2: Sections listed under a chapter (same visual level)
- Level 3: Only if explicit indentation or subsection visible
Typical Usage:
>>> from utils.llm_toc import extract_toc
>>> result = extract_toc(
... markdown=document_text,
... document_title="The Republic",
... provider="ollama"
... )
>>> print(result["toc"]) # Hierarchical structure
[
{
"title": "Introduction",
"level": 1,
"children": []
},
{
"title": "Book I: Justice",
"level": 1,
"chapter_number": 1,
"children": [
{"title": "The Nature of Justice", "level": 2, "children": []}
]
}
]
>>> print(result["flat_toc"]) # Flat list with paths
[
{"title": "Introduction", "level": 1, "path": "Introduction"},
{"title": "Book I: Justice", "level": 1, "path": "Book I: Justice"},
{
"title": "The Nature of Justice",
"level": 2,
"path": "Book I: Justice > The Nature of Justice"
}
]
LLM Provider Options:
- "ollama": Local processing, free but slower
- "mistral": Cloud API, faster but incurs costs
Note:
For documents without a clear TOC (short articles, book reviews),
the module returns an empty TOC list rather than inventing structure.
See Also:
- llm_metadata: Document metadata extraction
- llm_classifier: Section classification
- toc_extractor: Non-LLM TOC extraction alternatives
"""
import json
import logging
import re
from typing import cast, Any, Dict, List, Optional
from .llm_structurer import (
_clean_json_string,
_get_default_mistral_model,
_get_default_model,
call_llm,
)
from .types import FlatTOCEntry, LLMProvider, TOCEntry, TOCResult
logger: logging.Logger = logging.getLogger(__name__)
def _extract_json_from_response(text: str) -> Dict[str, Any]:
    """Pull a JSON object out of an LLM reply.

    Two strategies are attempted in order:
        1. Content between <JSON></JSON> tags.
        2. The widest {...} span found in the reply.

    Args:
        text: Raw LLM response text that may contain JSON.

    Returns:
        Parsed JSON dictionary, or ``{"toc": []}`` when neither strategy
        yields valid JSON.
    """
    tag_match: Optional[re.Match[str]] = re.search(r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL)
    if tag_match is not None:
        try:
            result: Dict[str, Any] = json.loads(_clean_json_string(tag_match.group(1)))
            return result
        except json.JSONDecodeError:
            pass  # fall back to the brace-based search
    opening: int = text.find("{")
    closing: int = text.rfind("}")
    if opening != -1 and closing > opening:
        try:
            result = json.loads(_clean_json_string(text[opening:closing + 1]))
            return result
        except json.JSONDecodeError as e:
            logger.warning(f"JSON invalide: {e}")
    return {"toc": []}
def extract_toc(
    markdown: str,
    document_title: Optional[str] = None,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.1,
) -> Dict[str, Any]:
    r"""Extract a structured table of contents from a document using LLM.

    Analyzes markdown content to identify the document's hierarchical
    structure and generates both a nested TOC (with children) and a
    flat TOC (with navigation paths).

    Args:
        markdown: Complete markdown text of the document to analyze.
        document_title: Optional title of the document for context.
            Helps the LLM better understand the document structure.
        model: LLM model name to use. If None, uses the default model
            for the specified provider.
        provider: LLM provider to use. Either "ollama" for local
            processing or "mistral" for cloud API.
        temperature: Model temperature for response generation.
            Lower values (0.1) produce more consistent results.

    Returns:
        A dictionary containing:
            - toc: Hierarchical list of TOC entries, each with:
                - title: Section title
                - level: Hierarchy level (1, 2, or 3)
                - chapter_number: Optional chapter number
                - children: List of nested TOC entries
            - flat_toc: Flat list of all TOC entries with paths:
                - title: Section title
                - level: Hierarchy level
                - path: Full navigation path (e.g., "Chapter 1 > Section 1")
            - error: Error message string (only if extraction failed)

    Raises:
        No exceptions are raised; errors are captured in the return dict.

    Example:
        >>> result = extract_toc(
        ...     markdown="# Introduction\n...\n# Chapter 1\n## Section 1.1",
        ...     document_title="My Book",
        ...     provider="ollama"
        ... )
        >>> len(result["toc"])
        2
        >>> result["toc"][0]["title"]
        'Introduction'

    Note:
        - Documents longer than 12,000 characters are truncated
        - Short articles without clear TOC return empty lists
        - The LLM is instructed to never invent structure
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    # Truncate overly long documents while keeping the leading sections,
    # which typically contain the explicit table of contents.
    max_chars: int = 12000
    content: str = markdown[:max_chars]
    if len(markdown) > max_chars:
        content += "\n\n[... suite du document ...]"
    title_context: str = f"Titre du document: {document_title}\n" if document_title else ""
    # Prompt kept in French to match the corpus language; literal braces in
    # the JSON examples are escaped by doubling them inside the f-string.
    prompt: str = f"""Tu es un expert en structuration de documents académiques.
TÂCHE: Extraire la table des matières FIDÈLE au document fourni.
{title_context}
⚠️ RÈGLES CRITIQUES:
1. **ANALYSER LE DOCUMENT RÉEL** - Ne JAMAIS copier les exemples ci-dessous!
2. **DOCUMENTS SANS TOC** - Si le document est un article court, une revue de livre, ou n'a pas de table des matières explicite, retourner {{"toc": []}}
3. **RESPECTER LA STRUCTURE PLATE** - Ne pas inventer de hiérarchie entre des lignes au même niveau
4. **IGNORER** - Métadonnées éditoriales (DOI, ISBN, éditeur, copyright, numéros de page)
NIVEAUX DE STRUCTURE:
- level 1: Introduction, Chapitres principaux, Conclusion, Bibliographie
- level 2: Sections listées sous un chapitre (même niveau visuel)
- level 3: UNIQUEMENT si indentation ou sous-titre explicite visible
FORMAT DE RÉPONSE (JSON entre balises <JSON></JSON>):
Pour un livre avec TOC:
<JSON>
{{
"toc": [
{{
"title": "Titre Chapitre 1",
"level": 1,
"chapter_number": 1,
"children": [
{{"title": "Section 1.1", "level": 2, "children": []}},
{{"title": "Section 1.2", "level": 2, "children": []}}
]
}}
]
}}
</JSON>
Pour un article SANS TOC (revue de livre, article court, etc.):
<JSON>
{{
"toc": []
}}
</JSON>
⚠️ NE PAS COPIER CES EXEMPLES ! Analyser uniquement le DOCUMENT RÉEL ci-dessous.
DOCUMENT À ANALYSER:
{content}
Réponds UNIQUEMENT avec le JSON correspondant à CE document (pas aux exemples)."""
    logger.info(f"Extraction TOC via {provider.upper()} ({model})")
    try:
        response: str = call_llm(prompt, model=model, provider=provider, temperature=temperature, timeout=360)
        result: Dict[str, Any] = _extract_json_from_response(response)
        toc: List[Dict[str, Any]] = result.get("toc", [])
        # Derive the flat version of the TOC (entries carrying full paths).
        flat_toc: List[Dict[str, Any]] = _flatten_toc(toc)
        logger.info(f"TOC extraite: {len(toc)} entrées niveau 1, {len(flat_toc)} entrées totales")
        return {
            "toc": toc,
            "flat_toc": flat_toc,
        }
    except Exception as e:
        # Fail-soft: callers always receive the same dict shape, with the
        # failure recorded under "error".
        logger.error(f"Erreur extraction TOC: {e}")
        return {
            "toc": [],
            "flat_toc": [],
            "error": str(e),
        }
def _flatten_toc(
toc: List[Dict[str, Any]],
parent_path: str = "",
result: Optional[List[Dict[str, Any]]] = None
) -> List[Dict[str, Any]]:
"""Flatten a hierarchical TOC into a list with navigation paths.
Recursively traverses a nested TOC structure and produces a flat
list where each entry includes its full path from the root.
Args:
toc: Hierarchical TOC list with nested children.
parent_path: Path accumulated from parent entries. Used
internally during recursion.
result: Accumulator list for results. Used internally
during recursion.
Returns:
A flat list of TOC entries, each containing:
- title: The section title
- level: Hierarchy level (1, 2, or 3)
- path: Full navigation path (e.g., "Chapter > Section")
- chapter_number: Optional chapter number if present
Example:
>>> hierarchical_toc = [
... {
... "title": "Chapter 1",
... "level": 1,
... "children": [
... {"title": "Section 1.1", "level": 2, "children": []}
... ]
... }
... ]
>>> flat = _flatten_toc(hierarchical_toc)
>>> flat[0]["path"]
'Chapter 1'
>>> flat[1]["path"]
'Chapter 1 > Section 1.1'
"""
if result is None:
result = []
for item in toc:
title: str = item.get("title", "")
level: int = item.get("level", 1)
# Construire le chemin
path: str
if parent_path:
path = f"{parent_path} > {title}"
else:
path = title
result.append({
"title": title,
"level": level,
"path": path,
"chapter_number": item.get("chapter_number"),
})
# Récursion sur les enfants
children: List[Dict[str, Any]] = item.get("children", [])
if children:
_flatten_toc(children, path, result)
return result
def match_content_to_toc(
    content_sections: List[Dict[str, Any]],
    flat_toc: List[Dict[str, Any]],
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
) -> List[Dict[str, Any]]:
    """Associate extracted content sections with TOC entries via LLM.

    Sends both title lists to the LLM and asks for, per content section,
    the 0-based index of the matching TOC entry (-1 when none). Each
    section then receives a "toc_match" key holding either the matched
    flat-TOC entry dict or None.

    Args:
        content_sections: Sections extracted from the document; each
            should carry a "title" key. Only the first 30 titles are
            submitted to the LLM to bound cost.
        flat_toc: Flat TOC list as returned by extract_toc()["flat_toc"].
        model: LLM model name; defaults to the provider's default model.
        provider: "ollama" (local) or "mistral" (cloud API).

    Returns:
        The same content_sections list, annotated in place with
        "toc_match". On LLM failure the list is returned unannotated.
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    # Compact title lists for the prompt (content capped at 30 titles).
    toc_titles: List[str] = [entry["title"] for entry in flat_toc]
    section_titles: List[str] = [section.get("title", "") for section in content_sections[:30]]
    prompt: str = f"""Tu dois associer les sections de contenu aux entrées de la table des matières.
TABLE DES MATIÈRES:
{json.dumps(toc_titles, ensure_ascii=False, indent=2)}
SECTIONS DE CONTENU:
{json.dumps(section_titles, ensure_ascii=False, indent=2)}
Pour chaque section de contenu, indique l'index (0-based) de l'entrée TOC correspondante.
Si pas de correspondance, indique -1.
RÉPONDS avec un JSON:
<JSON>
{{
"matches": [0, 1, 2, -1, 3, ...]
}}
</JSON>
"""
    try:
        response: str = call_llm(prompt, model=model, provider=provider, temperature=0.1)
        payload: Dict[str, Any] = _extract_json_from_response(response)
        matches: List[int] = payload.get("matches", [])
        # Annotate every section; unmatched or out-of-range indices -> None.
        for idx, section in enumerate(content_sections):
            match_idx: int = matches[idx] if idx < len(matches) else -1
            section["toc_match"] = flat_toc[match_idx] if 0 <= match_idx < len(flat_toc) else None
        return content_sections
    except Exception as exc:
        logger.warning(f"Erreur correspondance TOC: {exc}")
        return content_sections

View File

@@ -0,0 +1,513 @@
"""Document validation and enrichment using Large Language Models.
This module provides comprehensive validation, correction, and enrichment
functionality for parsed documents. It uses LLMs to verify document coherence,
detect inconsistencies, suggest corrections, and extract key concepts from
text chunks.
Overview:
The module performs three main functions:
1. **Document Validation** (validate_document):
Verifies the coherence of parsed documents by checking metadata,
table of contents, and chunk content quality. Returns detailed
validation results with issues, corrections, and confidence scores.
2. **Content Enrichment** (enrich_chunks_with_concepts, generate_section_summary):
Enhances document content by extracting key philosophical concepts
from chunks and generating concise summaries for sections.
3. **Correction Application** (apply_corrections, clean_validation_annotations):
Applies suggested corrections from validation results and cleans
LLM-generated annotation artifacts from text.
Validation Criteria:
The validator checks several aspects of document quality:
- **Metadata Quality**: Verifies title and author are correctly identified
(not collection names, not "Unknown" when visible in text)
- **TOC Coherence**: Checks for duplicates, proper ordering, completeness
- **Chunk Content**: Ensures chunks contain substantive content, not just
metadata fragments or headers
Validation Result Structure:
The ValidationResult TypedDict contains:
- valid (bool): Overall validation pass/fail
- errors (List[str]): Critical issues requiring attention
- warnings (List[str]): Non-critical suggestions
- corrections (Dict[str, str]): Suggested field corrections
- concepts (List[str]): Extracted key concepts
- score (float): Confidence score (0.0 to 1.0)
LLM Provider Support:
- ollama: Local LLM (free, slower, privacy-preserving)
- mistral: Mistral API (faster, requires API key, ~0.001 EUR per validation)
Example:
>>> from utils.llm_validator import validate_document, apply_corrections
>>>
>>> # Validate a parsed document
>>> parsed_doc = {
... "metadata": {"title": "Phenomenologie", "author": "Hegel"},
... "toc": [{"title": "Preface", "level": 1, "page": 1}],
... "chunks": [{"text": "La conscience...", "section_path": "Preface"}]
... }
>>> result = validate_document(parsed_doc, provider="ollama")
>>> print(f"Valid: {result['valid']}, Score: {result['score']}")
Valid: True, Score: 0.85
See Also:
utils.llm_cleaner: Text cleaning and validation
utils.llm_chunker: Semantic chunking of sections
utils.pdf_pipeline: Main pipeline orchestration
"""
from __future__ import annotations
import json
import logging
import re
from typing import Any, Dict, List, Optional, Match
from .llm_structurer import call_llm, _get_default_model, _get_default_mistral_model, _clean_json_string
from .types import LLMProvider, ValidationResult, ParsedDocument, ChunkData
logger: logging.Logger = logging.getLogger(__name__)
def _extract_json_from_response(text: str) -> Dict[str, Any]:
    """Pull a JSON object out of a raw LLM reply.

    Strategy one looks for a payload wrapped in <JSON></JSON> tags;
    strategy two grabs the widest span between the first '{' and the
    last '}'. The first strategy that parses wins.

    Args:
        text: Raw LLM reply, possibly mixing JSON with prose, markdown
            or XML-style tags.

    Returns:
        The decoded dictionary, or an empty dict when nothing parses.

    Example:
        >>> _extract_json_from_response('<JSON>{"valid": true, "score": 0.9}</JSON>')
        {'valid': True, 'score': 0.9}
    """
    tagged: Optional[Match[str]] = re.search(r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL)
    if tagged is not None:
        candidate: str = _clean_json_string(tagged.group(1))
        try:
            decoded: Dict[str, Any] = json.loads(candidate)
            return decoded
        except json.JSONDecodeError:
            pass  # fall through to the brace-span strategy
    opening: int = text.find("{")
    closing: int = text.rfind("}")
    if opening != -1 and closing > opening:
        candidate = _clean_json_string(text[opening:closing + 1])
        try:
            decoded = json.loads(candidate)
            return decoded
        except json.JSONDecodeError as exc:
            logger.warning(f"JSON invalide: {exc}")
    return {}
def validate_document(
    parsed_doc: Dict[str, Any],
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.1,
) -> ValidationResult:
    """Validate a parsed document's coherence and suggest corrections.

    Uses an LLM to analyze the document structure and content, checking
    for common issues like incorrect metadata, inconsistent TOC, or
    low-quality chunk content.

    Args:
        parsed_doc: Dictionary containing the parsed document with keys:
            - metadata: Dict with title, author, year, language
            - toc: List of TOC entries with title, level, page
            - chunks: List of text chunks with content and metadata
        model: LLM model name. If None, uses provider's default model.
        provider: LLM provider, either "ollama" (local) or "mistral" (API).
        temperature: Model temperature for response generation (0.0-1.0).
            Lower values produce more deterministic results.

    Returns:
        ValidationResult TypedDict containing:
            - valid: Overall validation status (True if no critical errors)
            - errors: Error-severity issues as readable strings
            - warnings: Warning-severity issues as readable strings
            - corrections: Dict mapping field names to suggested corrections
            - concepts: Extracted key concepts (empty for this function)
            - score: Confidence score from 0.0 to 1.0

    Note:
        The function always returns a valid result, even on LLM errors.
        Check the 'score' field - a score of 0.0 indicates an error occurred.
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    # Extract the key pieces of the parsed document.
    metadata: Dict[str, Any] = parsed_doc.get("metadata", {})
    toc: List[Dict[str, Any]] = parsed_doc.get("toc", [])
    chunks: List[Dict[str, Any]] = parsed_doc.get("chunks", [])
    # Compact summary sent to the LLM instead of the full document.
    doc_summary: Dict[str, Any] = {
        "title": metadata.get("title"),
        "author": metadata.get("author"),
        "toc_count": len(toc),
        "toc_preview": [t.get("title") for t in toc[:10]] if toc else [],
        "chunks_count": len(chunks),
        "first_chunks_preview": [
            c.get("text", "")[:100] for c in chunks[:5]
        ] if chunks else [],
    }
    prompt: str = f"""Tu es un expert en validation de documents structurés.
TÂCHE: Vérifier la cohérence de ce document parsé et détecter les erreurs.
DOCUMENT PARSÉ:
{json.dumps(doc_summary, ensure_ascii=False, indent=2)}
VÉRIFICATIONS À EFFECTUER:
1. Le titre correspond-il au contenu? (pas le nom d'une collection)
2. L'auteur est-il correctement identifié? (pas "Inconnu" si visible)
3. La TOC est-elle cohérente? (pas de doublons, bon ordre)
4. Les chunks contiennent-ils du vrai contenu? (pas que des métadonnées)
RÉPONDS avec un JSON entre <JSON></JSON>:
<JSON>
{{
"is_valid": true,
"confidence": 0.85,
"issues": [
{{
"field": "title",
"severity": "warning",
"message": "Le titre semble être le nom de la collection",
"suggestion": "Vrai titre suggéré"
}}
],
"corrections": {{
"title": "Titre corrigé si nécessaire",
"author": "Auteur corrigé si nécessaire"
}},
"quality_score": {{
"metadata": 0.8,
"toc": 0.9,
"chunks": 0.7
}}
}}
</JSON>
"""
    logger.info(f"Validation du document parsé via {provider.upper()}")
    try:
        response: str = call_llm(
            prompt, model=model, provider=provider, temperature=temperature, timeout=180
        )
        result: Dict[str, Any] = _extract_json_from_response(response)
        is_valid: bool = result.get("is_valid", True)
        issues: List[Any] = result.get("issues", [])
        corrections: Dict[str, str] = result.get("corrections", {})
        confidence: float = result.get("confidence", 0.5)
        # The prompt asks for issue dicts with field/severity/message keys.
        # Route them by severity instead of dumping raw dict reprs into
        # "errors" (previously "warnings" was always left empty).
        errors: List[str] = []
        warnings: List[str] = []
        for issue in issues:
            if isinstance(issue, dict):
                severity: str = str(issue.get("severity", "error")).lower()
                field: str = str(issue.get("field", "") or "")
                message: str = str(issue.get("message", "") or "")
                text: str = f"{field}: {message}" if field else message
                suggestion: Any = issue.get("suggestion")
                if suggestion:
                    text = f"{text} (suggestion: {suggestion})"
            else:
                # Defensive: tolerate a malformed (non-dict) issue entry.
                severity = "error"
                text = str(issue)
            if severity == "warning":
                warnings.append(text)
            else:
                errors.append(text)
        logger.info(f"Validation terminée: valid={is_valid}, issues={len(issues)}")
        validation_result: ValidationResult = {
            "valid": is_valid,
            "errors": errors,
            "warnings": warnings,
            "corrections": corrections,
            "concepts": [],
            "score": confidence,
        }
        return validation_result
    except Exception as e:
        # Fail-open: an LLM outage must not block the pipeline; the zero
        # score plus the recorded error keep the failure visible.
        logger.error(f"Erreur validation document: {e}")
        error_result: ValidationResult = {
            "valid": True,
            "errors": [str(e)],
            "warnings": [],
            "corrections": {},
            "concepts": [],
            "score": 0.0,
        }
        return error_result
def generate_section_summary(
    section_content: str,
    section_title: str,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    max_words: int = 50,
) -> str:
    """Produce a one-sentence summary of a section via the LLM.

    Sections shorter than 100 characters are not worth an LLM call: the
    title itself is returned. Only the first 2000 characters of the
    content are sent to the LLM, and the reply is hard-capped at
    ``max_words`` words.

    Args:
        section_content: Full text of the section to summarize.
        section_title: Section title; also the fallback return value.
        model: LLM model name; defaults to the provider's default model.
        provider: "ollama" (local) or "mistral" (cloud API).
        max_words: Upper bound on the summary's word count.

    Returns:
        The generated summary, or ``section_title`` when the content is
        too short, the reply is empty, or the LLM call fails.
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    if len(section_content) < 100:
        return section_title
    prompt: str = f"""Résume cette section en maximum {max_words} mots.
Le résumé doit capturer l'idée principale.
Titre: {section_title}
Contenu:
{section_content[:2000]}
Résumé (en une phrase):"""
    try:
        reply: str = call_llm(
            prompt, model=model, provider=provider, temperature=0.2, timeout=60
        )
        summary: str = reply.strip()
        # Enforce the word budget on the model's reply.
        tokens: List[str] = summary.split()
        if len(tokens) > max_words:
            summary = ' '.join(tokens[:max_words]) + '...'
        return summary or section_title
    except Exception as exc:
        logger.warning(f"Erreur génération résumé: {exc}")
        return section_title
def enrich_chunks_with_concepts(
    chunks: List[Dict[str, Any]],
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
) -> List[Dict[str, Any]]:
    """Attach key concepts to each chunk via per-chunk LLM extraction.

    Every chunk receives a "concepts" list of up to 5 strings. Chunks
    that already carry a non-empty "concepts" field are left untouched,
    and chunks with fewer than 100 characters of text get an empty list
    without an LLM call.

    Args:
        chunks: Chunk dicts, each with at least a "text" key. Modified
            in place.
        model: LLM model name; defaults to the provider's default model.
        provider: "ollama" (local) or "mistral" (cloud API).

    Returns:
        The same list of chunks, enriched in place.

    Note:
        Only the first 1000 characters of each chunk are analyzed, and a
        failed extraction yields an empty concepts list for that chunk.
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
    batch_size: int = 10  # granularity of progress logging
    for position, chunk in enumerate(chunks):
        if chunk.get("concepts"):
            continue  # already enriched
        body: str = chunk.get("text", "")
        if len(body) < 100:
            chunk["concepts"] = []
            continue
        if position % batch_size == 0:
            logger.info(f"Enrichissement concepts: chunks {position} à {min(position+batch_size, len(chunks))}")
        prompt: str = f"""Extrait 3-5 concepts clés de ce texte.
Réponds avec une liste JSON: ["concept1", "concept2", ...]
Texte:
{body[:1000]}
Concepts:"""
        try:
            reply: str = call_llm(
                prompt, model=model, provider=provider, temperature=0.1, timeout=30
            )
            # The reply should contain a bare JSON array of strings.
            found: Optional[Match[str]] = re.search(r'\[.*?\]', reply, re.DOTALL)
            if found:
                chunk["concepts"] = json.loads(found.group())[:5]
            else:
                chunk["concepts"] = []
        except Exception as exc:
            logger.warning(f"Erreur extraction concepts chunk {position}: {exc}")
            chunk["concepts"] = []
    return chunks
def clean_validation_annotations(text: str) -> str:
    """Remove LLM-generated validation annotations from text.

    Cleans common annotation patterns that LLMs may add when validating
    or correcting text, such as confidence markers or verification notes.
    Both the accented ("à confirmer") and unaccented ("a confirmer")
    spellings are handled, since LLM and OCR output drop accents
    inconsistently (the previous accent-only pattern missed
    "(a confirmer)" despite the documented example).

    Patterns removed:
        - "(correct)" or "(a confirmer)" / "(à confirmer)" at end of text
        - "(a confirmer comme titre principal)" and similar
        - "(possiblement...)" or "(probablement...)"
        - Isolated "(correct)" or "(a confirmer)" mid-text

    Args:
        text: Text potentially containing LLM annotation artifacts.

    Returns:
        Cleaned text with annotations removed and whitespace normalized.
        Returns the original text if input is None or empty.

    Example:
        >>> clean_validation_annotations("Phenomenologie (a confirmer)")
        'Phenomenologie'
        >>> clean_validation_annotations("G.W.F. Hegel (correct)")
        'G.W.F. Hegel'
    """
    if not text:
        return text
    # Trailing parenthesized annotations, e.g. "Titre (à confirmer ...)".
    # "[aà]" accepts both spellings of "à confirmer".
    text = re.sub(
        r'\s*\([^)]*(?:correct|[aà] confirmer|possiblement|probablement)[^)]*\)\s*$',
        '',
        text,
        flags=re.IGNORECASE
    )
    # Isolated annotations mid-text collapse to a single space.
    text = re.sub(r'\s*\((?:correct|[aà] confirmer)\)\s*', ' ', text, flags=re.IGNORECASE)
    return text.strip()
def apply_corrections(
    parsed_doc: Dict[str, Any],
    validation_result: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Apply validation corrections to a parsed document's metadata.

    Corrected values suggested by validate_document() replace the
    current metadata, with the previous values preserved under
    ``original_<field>`` keys. Existing metadata is also scrubbed of
    LLM annotation artifacts, and the validation result is attached to
    the document.

    Args:
        parsed_doc: Parsed document dict; its "metadata" sub-dict (title,
            author, optionally "work") is updated in place.
        validation_result: Output of validate_document(), whose
            "corrections" mapping drives the updates. When None, only
            annotation cleanup is performed.

    Returns:
        The same parsed_doc, with corrected/cleaned metadata and a
        "validation" key holding validation_result.

    Note:
        When the title still contains validation phrases and a "work"
        field exists, the work value replaces the title; empty
        correction values are ignored.
    """
    corrections: Dict[str, str] = {}
    if validation_result:
        corrections = validation_result.get("corrections", {})
    metadata: Dict[str, Any] = parsed_doc.get("metadata", {})
    # Apply suggested metadata corrections, keeping the previous values.
    if corrections.get("title"):
        previous_title: Optional[str] = metadata.get("title")
        cleaned_title: str = clean_validation_annotations(corrections["title"])
        metadata["title"] = cleaned_title
        metadata["original_title"] = previous_title
        logger.info(f"Titre corrigé: '{previous_title}' -> '{cleaned_title}'")
    if corrections.get("author"):
        previous_author: Optional[str] = metadata.get("author")
        cleaned_author: str = clean_validation_annotations(corrections["author"])
        metadata["author"] = cleaned_author
        metadata["original_author"] = previous_author
        logger.info(f"Auteur corrigé: '{previous_author}' -> '{cleaned_author}'")
    # Scrub existing metadata even when no corrections were suggested.
    current_title: Optional[str] = metadata.get("title")
    if current_title:
        validation_phrases: List[str] = ["à confirmer", "confirmer avec", "vérifier"]
        lowered: str = current_title.lower()
        if any(phrase in lowered for phrase in validation_phrases):
            # A title polluted by validation phrases: prefer the "work" field.
            if metadata.get("work"):
                logger.info(f"Titre remplacé par 'work': '{current_title}' -> '{metadata['work']}'")
                metadata["original_title"] = current_title
                metadata["title"] = metadata["work"]
        else:
            metadata["title"] = clean_validation_annotations(current_title)
    if metadata.get("author"):
        metadata["author"] = clean_validation_annotations(metadata["author"])
    parsed_doc["metadata"] = metadata
    parsed_doc["validation"] = validation_result
    return parsed_doc

View File

@@ -0,0 +1,141 @@
"""Markdown document builder from OCR API responses.
This module transforms Mistral OCR API responses into structured Markdown text.
It handles text extraction, page marker insertion, and image processing
(either base64 embedding or disk-based storage with relative path references).
The builder is a core component of the PDF processing pipeline, sitting between
OCR extraction and hierarchical parsing.
Pipeline Position:
PDF → OCR (mistral_client) → **Markdown Builder** → Hierarchy Parser → Chunks
Features:
- Page markers: Inserts HTML comments (<!-- Page N -->) for traceability
- Image handling: Supports both inline base64 and external file references
- Type safety: Uses Protocol-based typing for OCR response structures
Workflow:
1. Iterate through pages in the OCR response
2. Extract Markdown content from each page
3. Process images (embed as base64 or save via ImageWriter callback)
4. Assemble the complete Markdown document
Image Handling Modes:
1. **No images**: Set embed_images=False and image_writer=None
2. **Inline base64**: Set embed_images=True (large file size)
3. **External files**: Provide image_writer callback (recommended)
Example:
>>> from pathlib import Path
>>> from utils.image_extractor import create_image_writer
>>>
>>> # Create image writer for output directory
>>> writer = create_image_writer(Path("output/my_doc/images"))
>>>
>>> # Build markdown with external image references
>>> markdown = build_markdown(
... ocr_response,
... embed_images=False,
... image_writer=writer
... )
>>> print(markdown[:100])
<!-- Page 1 -->
# Document Title
...
Note:
- Page indices are 1-based for human readability
- The OCR response must follow the Mistral API structure
- Empty pages produce only the page marker comment
See Also:
- utils.mistral_client: OCR API client for obtaining responses
- utils.image_extractor: Image writer factory and extraction
- utils.hierarchy_parser: Next step in pipeline (structure parsing)
"""
from typing import Any, Callable, List, Optional, Protocol
# Type pour le writer d'images
ImageWriterCallable = Callable[[int, int, str], Optional[str]]
class OCRImage(Protocol):
    """Structural type for a single image extracted by the Mistral OCR API."""

    # Base64-encoded image payload; None when the API returned no data.
    image_base64: Optional[str]
class OCRPage(Protocol):
    """Structural type for a single page extracted by the Mistral OCR API."""

    # Markdown content of the page; None when no text was extracted.
    markdown: Optional[str]
    # Images extracted from the page; None when the page has none.
    images: Optional[List[OCRImage]]
class OCRResponseProtocol(Protocol):
    """Structural type for the complete Mistral OCR API response."""

    # Ordered list of extracted pages (first element is the first page).
    pages: List[OCRPage]
def build_markdown(
    ocr_response: OCRResponseProtocol,
    embed_images: bool = False,
    image_writer: Optional[ImageWriterCallable] = None,
) -> str:
    """Assemble the full Markdown document from a Mistral OCR response.

    Each page contributes a page-marker HTML comment, its Markdown body
    (when present), and one reference per extracted image. Images are
    either inlined as base64 data URIs (``embed_images=True``) or
    persisted through ``image_writer``, whose returned relative path is
    referenced instead. With neither option, images are dropped.

    Args:
        ocr_response: Mistral OCR response exposing a ``pages`` sequence.
        embed_images: When True, inline images as base64 data URIs.
        image_writer: Callback ``(page_idx, img_idx, base64) -> rel_path``
            that stores an image and returns the path to reference;
            ignored when ``embed_images`` is True.

    Returns:
        The complete Markdown text, page markers included.

    Example:
        >>> md = build_markdown(
        ...     ocr_response,
        ...     embed_images=False,
        ...     image_writer=lambda p, i, b64: f"images/p{p}_i{i}.png"
        ... )
    """
    pieces: List[str] = []
    for page_no, page in enumerate(ocr_response.pages, start=1):
        # 1-based page marker for traceability.
        pieces.append(f"<!-- Page {page_no} -->\n\n")
        body: Optional[str] = getattr(page, "markdown", None)
        if body:
            pieces.append(body)
            pieces.append("\n\n")
        images: Optional[List[OCRImage]] = getattr(page, "images", None)
        for img_no, image in enumerate(images or [], start=1):
            payload: Optional[str] = getattr(image, "image_base64", None)
            if not payload:
                continue
            if embed_images:
                # Inline the image as a data URI (large output files).
                uri: str = f"data:image/png;base64,{payload}"
                pieces.append(f"![Page {page_no} Image {img_no}]({uri})\n\n")
            elif image_writer:
                # Persist to disk and reference by relative path.
                target: Optional[str] = image_writer(page_no, img_no, payload)
                if target:
                    pieces.append(f"![Page {page_no} Image {img_no}]({target})\n\n")
    return "".join(pieces)

View File

@@ -0,0 +1,169 @@
"""Mistral API Client Management.
This module provides utilities for managing the Mistral API client,
including API key retrieval and OCR cost estimation. It serves as the
foundation for all Mistral API interactions in the Library RAG pipeline.
Key Features:
- Automatic API key discovery from multiple sources
- Client instantiation with proper authentication
- OCR cost estimation for budget planning
API Key Priority:
The module searches for the Mistral API key in this order:
1. Explicit argument passed to functions
2. MISTRAL_API_KEY environment variable
3. .env file in the project root
Cost Estimation:
Mistral OCR pricing (as of 2024):
- Standard OCR: ~1 EUR per 1000 pages (0.001 EUR/page)
- OCR with annotations: ~3 EUR per 1000 pages (0.003 EUR/page)
Example:
Basic client creation and usage::
from utils.mistral_client import create_client, estimate_ocr_cost
# Create authenticated client
client = create_client()
# Estimate cost for a 100-page document
cost = estimate_ocr_cost(100, use_annotations=False)
print(f"Estimated cost: {cost:.2f} EUR") # Output: Estimated cost: 0.10 EUR
Using explicit API key::
client = create_client(api_key="your-api-key-here")
See Also:
- :mod:`utils.ocr_processor`: OCR execution functions using this client
- :mod:`utils.pdf_uploader`: PDF upload utilities for OCR processing
Note:
Ensure MISTRAL_API_KEY is set before using this module in production.
The API key can be obtained from the Mistral AI platform dashboard.
"""
import os
from typing import Optional
from dotenv import load_dotenv
from mistralai import Mistral
def get_api_key(api_key: Optional[str] = None) -> str:
    """Resolve the Mistral API key, trying sources in priority order.

    Sources, in order: the explicit ``api_key`` argument, the
    MISTRAL_API_KEY environment variable, and finally a ``.env`` file
    loaded from the project root.

    Args:
        api_key: Key supplied directly by the caller; when non-empty it
            is used as-is (stripped of surrounding whitespace).

    Returns:
        The resolved Mistral API key.

    Raises:
        RuntimeError: If no API key is found in any source.

    Example:
        >>> get_api_key("my-api-key")
        'my-api-key'
    """
    # 1. An explicit argument always wins.
    if api_key and api_key.strip():
        return api_key.strip()
    # 2. Environment variable, then 3. .env file: load_dotenv() populates
    # os.environ, so the same lookup is simply retried after loading.
    for attempt in range(2):
        candidate: str = os.getenv("MISTRAL_API_KEY", "").strip()
        if candidate:
            return candidate
        if attempt == 0:
            load_dotenv()
    raise RuntimeError(
        "MISTRAL_API_KEY manquante. "
        "Definissez la variable d environnement ou creez un fichier .env"
    )
def create_client(api_key: Optional[str] = None) -> Mistral:
    """Build an authenticated Mistral API client.

    Primary entry point for obtaining a client usable for OCR, chat
    completions, and the other Mistral API features.

    Args:
        api_key: Optional explicit key. When omitted, the key is resolved
            by :func:`get_api_key` (environment variable or .env file).

    Returns:
        An authenticated :class:`mistralai.Mistral` instance ready for
        API calls.

    Raises:
        RuntimeError: Propagated from :func:`get_api_key` when no key
            can be found.

    Example:
        >>> client = create_client()  # doctest: +SKIP
        >>> response = client.ocr.process(...)  # doctest: +SKIP
    """
    return Mistral(api_key=get_api_key(api_key))
def estimate_ocr_cost(nb_pages: int, use_annotations: bool = False) -> float:
    """Estimate the cost, in euros, of OCR processing for a document.

    Based on the Mistral OCR pricing model:

    - standard OCR: 0.001 EUR per page (~1 EUR / 1000 pages)
    - OCR with annotations: 0.003 EUR per page (~3 EUR / 1000 pages)

    Annotation mode costs roughly 3x more but returns additional
    structural information useful for TOC extraction. Use this function
    for budget planning before processing large collections.

    Args:
        nb_pages: Number of pages in the document to process.
        use_annotations: When True, apply the annotated-OCR rate.

    Returns:
        Estimated cost in euros.

    Example:
        >>> estimate_ocr_cost(100)
        0.1
        >>> estimate_ocr_cost(100, use_annotations=True)
        0.3
        >>> estimate_ocr_cost(10000)
        10.0
    """
    # Per-page rate depends only on whether annotations are requested.
    per_page_rate = 0.003 if use_annotations else 0.001
    return nb_pages * per_page_rate

View File

@@ -0,0 +1,312 @@
"""OCR Processing via Mistral API.
This module provides functions for executing OCR (Optical Character Recognition)
on PDF documents using the Mistral API. It handles both standard OCR and advanced
OCR with structured annotations for better document understanding.
Key Features:
- Standard OCR for text extraction with optional image embedding
- Advanced OCR with document and bounding box annotations
- Response serialization for JSON storage and further processing
- Support for page-by-page processing
OCR Modes:
1. **Standard OCR** (run_ocr):
- Extracts text and optionally images
- Cost: ~1 EUR per 1000 pages (0.001 EUR/page)
- Best for: Simple text extraction, content indexing
2. **OCR with Annotations** (run_ocr_with_annotations):
- Extracts text with structural metadata (bounding boxes, document structure)
- Cost: ~3 EUR per 1000 pages (0.003 EUR/page)
- Best for: TOC extraction, layout analysis, structured documents
- Document annotations limited to 8 pages max
- Bounding box annotations have no page limit
Response Structure:
The OCR response contains:
- pages: List of page objects with text content
- images: Optional base64-encoded images (if include_images=True)
- annotations: Structural metadata (if using annotation mode)
Example:
Basic OCR processing::
from utils.mistral_client import create_client
from utils.ocr_processor import run_ocr, serialize_ocr_response
# Create client and read PDF
client = create_client()
with open("document.pdf", "rb") as f:
pdf_bytes = f.read()
# Run OCR
response = run_ocr(client, pdf_bytes, "document.pdf")
# Serialize for storage
ocr_dict = serialize_ocr_response(response)
print(f"Extracted {len(ocr_dict['pages'])} pages")
Cost Considerations:
- Always estimate costs before batch processing with estimate_ocr_cost()
- Use pages parameter to limit processing when full document is not needed
- Annotation mode is 3x more expensive - use only when structure is needed
- Cache OCR results to avoid reprocessing (saved in output/<doc>/<doc>.json)
See Also:
- utils.mistral_client: Client creation and cost estimation
- utils.pdf_uploader: PDF upload utilities
- utils.pdf_pipeline: Full pipeline orchestration
Note:
OCR responses are Pydantic models from the Mistral SDK. Use
serialize_ocr_response() to convert to dictionaries before JSON storage.
"""
import json
from typing import Any, Dict, List, Optional, Type
from mistralai import Mistral
from pydantic import BaseModel
from .pdf_uploader import upload_pdf
from .types import OCRResponse
def run_ocr(
    client: Mistral,
    file_bytes: bytes,
    filename: str,
    include_images: bool = True,
) -> Any:
    """Run standard OCR on a PDF document through the Mistral API.

    The PDF is first uploaded to Mistral storage (the upload is cleaned
    up by Mistral after processing), then sent to the
    ``mistral-ocr-latest`` model. This is the most cost-effective OCR
    mode (~0.001 EUR/page), suitable for plain text extraction and
    content indexing.

    Args:
        client: Authenticated Mistral client created via
            utils.mistral_client.create_client().
        file_bytes: Binary content of the PDF file.
        filename: Original file name, used to identify the upload.
        include_images: When True, the response embeds base64-encoded
            images for each page; disable to shrink the response when
            images are not needed. Defaults to True.

    Returns:
        The raw OCR response (Pydantic model) containing a ``pages`` list
        with extracted text and, optionally, images. Convert it with
        serialize_ocr_response() before JSON storage.

    Raises:
        RuntimeError: If the client is not properly authenticated.
        HTTPError: On network failures or API rate limits.
    """
    # Push the PDF to Mistral storage and get back a signed URL for it.
    signed_url: str = upload_pdf(client, file_bytes, filename)

    # Process the uploaded document with the latest OCR model.
    document_ref: Dict[str, Any] = {
        "type": "document_url",
        "document_url": signed_url,
    }
    return client.ocr.process(
        model="mistral-ocr-latest",
        document=document_ref,
        include_image_base64=include_images,
    )
def run_ocr_with_annotations(
    client: Mistral,
    file_bytes: bytes,
    filename: str,
    include_images: bool = True,
    document_annotation_format: Optional[Type[BaseModel]] = None,
    bbox_annotation_format: Optional[Type[BaseModel]] = None,
    pages: Optional[List[int]] = None,
) -> Any:
    """Run OCR with structured (schema-driven) annotations on a PDF.

    Besides plain text, this mode asks Mistral to return structured
    metadata matching the supplied Pydantic schemas — useful for tables
    of contents, form fields, or document hierarchy. It costs roughly 3x
    the standard OCR rate (~0.003 EUR/page).

    Two annotation channels exist:

    - document annotations: document-level structure (API limit: 8 pages)
    - bounding-box annotations: per-element positions (no page limit)

    Args:
        client: Authenticated Mistral client created via
            utils.mistral_client.create_client().
        file_bytes: Binary content of the PDF file.
        filename: Original file name, used to identify the upload.
        include_images: Embed base64-encoded page images in the response.
            Defaults to True.
        document_annotation_format: Pydantic model describing the expected
            document-level annotation payload; converted to a JSON schema
            for the API. Limited to 8 pages by the API.
        bbox_annotation_format: Pydantic model describing the expected
            bounding-box annotation payload. No page limit.
        pages: Optional 0-indexed page numbers to process; None processes
            the whole document. Restricting pages keeps cost and latency
            down.

    Returns:
        The raw OCR response including any requested annotations. Convert
        it with serialize_ocr_response() before JSON storage.

    Raises:
        RuntimeError: If the client is not properly authenticated.
        HTTPError: On network failures or API rate limits.
        ValueError: If document annotations are requested for more than
            8 pages.
    """
    from mistralai.extra import response_format_from_pydantic_model

    # Upload first: the OCR endpoint consumes a signed document URL.
    signed_url: str = upload_pdf(client, file_bytes, filename)

    call_args: Dict[str, Any] = {
        "model": "mistral-ocr-latest",
        "document": {
            "type": "document_url",
            "document_url": signed_url,
        },
        "include_image_base64": include_images,
    }

    # Optional restriction to a subset of (0-indexed) pages.
    if pages is not None:
        call_args["pages"] = pages

    # Attach whichever annotation schemas were requested, converted to the
    # JSON-schema response format expected by the API.
    for arg_name, schema in (
        ("document_annotation_format", document_annotation_format),
        ("bbox_annotation_format", bbox_annotation_format),
    ):
        if schema is not None:
            call_args[arg_name] = response_format_from_pydantic_model(schema)

    return client.ocr.process(**call_args)
def serialize_ocr_response(response: Any) -> Dict[str, Any]:
    """Turn a Mistral OCR response object into a plain dictionary.

    OCR calls return Pydantic model instances that must be converted to
    ordinary dictionaries before being written to JSON files
    (output/<doc>/<doc>.json) or passed further down the pipeline.
    Several serialization methods are tried so that both Pydantic v2
    (``model_dump``) and v1 (``dict``/``json``) style objects are
    supported.

    Args:
        response: OCR response object exposing ``model_dump()``,
            ``dict()``, or ``json()``.

    Returns:
        A JSON-serializable dictionary, typically containing ``pages``,
        optional ``images``, ``model``, and ``usage`` entries.

    Raises:
        TypeError: When none of the supported serialization methods is
            available on the object.

    Example:
        >>> ocr_dict = serialize_ocr_response(response)  # doctest: +SKIP
        >>> num_pages = len(ocr_dict["pages"])  # doctest: +SKIP
    """
    # Try serializers from most to least preferred.
    converters = (
        ("model_dump", lambda obj: obj.model_dump()),   # Pydantic v2
        ("dict", lambda obj: obj.dict()),               # Pydantic v1
        ("json", lambda obj: json.loads(obj.json())),   # generic fallback
    )
    for attr_name, convert in converters:
        if hasattr(response, attr_name):
            payload: Dict[str, Any] = convert(response)
            return payload
    raise TypeError("Réponse OCR non sérialisable")

View File

@@ -0,0 +1,55 @@
"""Schémas Pydantic pour l'extraction structurée via OCR avec annotations.
Utilisés avec document_annotation_format et bbox_annotation_format de l'API Mistral.
"""
from typing import List, Optional
from pydantic import BaseModel, Field
from enum import Enum
class TocEntryType(str, Enum):
    """Type d'entrée de table des matières."""
    # str-valued Enum: member values serialize directly into the JSON
    # schema sent to the Mistral annotation endpoint (see TocEntry).
    CHAPTER = "chapter"  # main/numbered chapter
    SECTION = "section"  # regular section (default in TocEntry)
    SUBSECTION = "subsection"  # nested under a section
    PREAMBLE = "preamble"  # preface / introduction material
    APPENDIX = "appendix"  # back-matter annexes
class TocEntry(BaseModel):
    """Entrée de table des matières avec hiérarchie."""
    # NOTE: The Field descriptions below are not just documentation — they
    # are emitted into the JSON schema passed as document_annotation_format
    # to the Mistral OCR endpoint and act as extraction instructions for
    # the model. Do not shorten or translate them without re-validating
    # extraction quality.
    title: str = Field(..., description="Titre exact de la section tel qu'il apparaît dans la table des matières")
    page_number: int = Field(..., description="Numéro de page réel tel qu'imprimé/affiché dans le livre (PAS l'index séquentiel du PDF, mais le numéro visible sur la page elle-même)")
    level: int = Field(..., description="""Niveau hiérarchique détecté VISUELLEMENT dans la mise en page de la table des matières:
- level=1 si le titre est aligné à gauche SANS indentation (titres principaux)
- level=2 si le titre a une PETITE indentation ou est légèrement décalé vers la droite
- level=3 si le titre a une DOUBLE indentation ou est très décalé vers la droite
Regardez attentivement l'alignement horizontal et les espaces avant chaque titre pour déterminer le niveau.""")
    entry_type: TocEntryType = Field(default=TocEntryType.SECTION, description="Type d'entrée: 'preamble' pour préfaces/introductions, 'chapter' pour chapitres, 'section' pour sections, 'subsection' pour sous-sections, 'appendix' pour annexes")
    parent_title: Optional[str] = Field(None, description="Si level > 1, indiquer le titre du parent direct (l'entrée de level=1 sous laquelle cette entrée est indentée)")
class DocumentTOC(BaseModel):
    """Table des matières complète du document."""
    # NOTE: Field descriptions double as extraction instructions sent to
    # the Mistral annotation endpoint (see TocEntry) — keep them verbatim.
    entries: List[TocEntry] = Field(..., description="""Liste COMPLÈTE de TOUTES les entrées de la table des matières dans l'ordre d'apparition.
IMPORTANT : Analysez attentivement l'indentation/alignement horizontal de chaque titre pour assigner le bon niveau hiérarchique:
- Les titres alignés à gauche (non indentés) = level 1
- Les titres légèrement indentés/décalés vers la droite = level 2 (sous-sections du titre level 1 précédent)
- Les titres avec double indentation = level 3 (sous-sections du titre level 2 précédent)
Chaque entrée doit avoir son vrai numéro de page tel qu'imprimé dans le livre.""")
    has_explicit_toc: bool = Field(..., description="Le document contient-il une table des matières explicite et visible ? (généralement en début de document)")
    toc_page_numbers: List[int] = Field(..., description="Liste des numéros de pages où se trouve la table des matières (généralement pages 2-5)")
class DocumentMetadata(BaseModel):
    """Métadonnées enrichies du document."""
    # Top-level annotation schema: passed as document_annotation_format to
    # the Mistral OCR endpoint (see utils.toc_extractor). Field descriptions
    # act as extraction instructions for the model — keep them verbatim.
    title: str = Field(..., description="Titre complet du document")
    author: str = Field(..., description="Auteur principal du document")
    languages: List[str] = Field(..., description="Liste des langues présentes dans le document (codes ISO 639-1, ex: ['fr', 'en'])")
    summary: str = Field(..., description="Résumé du document en 2-3 phrases maximum")
    collection: Optional[str] = Field(None, description="Nom de la collection ou série éditoriale")
    publisher: Optional[str] = Field(None, description="Nom de l'éditeur")
    year: Optional[int] = Field(None, description="Année de publication")
    total_pages: int = Field(..., description="Nombre total de pages dans le document")
    toc: DocumentTOC = Field(..., description="Table des matières structurée avec hiérarchie et numéros de page réels")

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,31 @@
"""Upload de fichiers PDF vers l'API Mistral."""
from mistralai import Mistral
def upload_pdf(client: Mistral, file_bytes: bytes, filename: str) -> str:
    """Upload a PDF to Mistral storage and return its signed URL.

    Args:
        client: Authenticated Mistral client.
        file_bytes: Binary content of the PDF file.
        filename: Name under which the file is registered.

    Returns:
        A signed URL usable as a ``document_url`` in OCR requests.
    """
    # Register the file under the OCR purpose, then exchange its id for a
    # signed URL that the OCR endpoint can consume.
    uploaded_file = client.files.upload(
        file={
            "file_name": filename,
            "content": file_bytes,
        },
        purpose="ocr",
    )
    signed_doc = client.files.get_signed_url(file_id=uploaded_file.id)
    return signed_doc.url

View File

@@ -0,0 +1,382 @@
"""TOC Enrichment Module for Chunk Metadata Enhancement.
This module provides functions to enrich chunk metadata with hierarchical
information from the table of contents (TOC). It matches chunks to their
corresponding TOC entries and extracts:
- Full hierarchical paths (e.g., "Peirce: CP 1.628 > 628. It is...")
- Chapter titles
- Canonical academic references (e.g., "CP 1.628", "Ménon 80a")
The enrichment happens before Weaviate ingestion to ensure chunks have
complete metadata for rigorous academic citation.
Usage:
>>> from utils.toc_enricher import enrich_chunks_with_toc
>>> enriched_chunks = enrich_chunks_with_toc(chunks, toc, hierarchy)
See Also:
- utils.types: FlatTOCEntryEnriched type definition
- utils.weaviate_ingest: Integration point for enrichment
"""
from __future__ import annotations
import logging
import re
from typing import Any, Dict, List, Optional
from .types import FlatTOCEntryEnriched
logger = logging.getLogger(__name__)
def flatten_toc_with_paths(
    toc: List[Dict[str, Any]],
    hierarchy: Dict[str, Any],
) -> List[FlatTOCEntryEnriched]:
    """Flatten hierarchical or flat TOC and build full paths with metadata.

    Handles both hierarchical TOCs (with 'children' keys) and flat TOCs
    (where parent-child relationships are inferred from 'level' field).

    Traverses the TOC structure and creates enriched flat entries with:
    - Full hierarchical path (e.g., "Peirce: CP 1.628 > 628. It is...")
    - Canonical reference extraction (e.g., "CP 1.628")
    - Chapter title tracking (first level 1 ancestor)
    - Parent title list for context

    Args:
        toc: TOC structure with 'title' and 'level' fields, optionally 'children'
        hierarchy: Document hierarchy (currently unused, reserved for future)

    Returns:
        List of enriched flat TOC entries with full metadata.

    Example:
        >>> toc = [
        ...     {"title": "Peirce: CP 1.628", "level": 1},
        ...     {"title": "628. It is the instincts...", "level": 2}
        ... ]
        >>> flat = flatten_toc_with_paths(toc, {})
        >>> flat[1]["full_path"]
        'Peirce: CP 1.628 > 628. It is the instincts...'
        >>> flat[1]["canonical_ref"]
        'CP 1.628'
    """
    flat_toc: List[FlatTOCEntryEnriched] = []
    # Check if TOC is hierarchical (has children) or flat (level-based)
    # NOTE(review): only the top-level entries are inspected for a
    # 'children' key; a TOC whose children appear deeper would be treated
    # as flat — confirm upstream always nests from the root.
    is_hierarchical = any("children" in entry for entry in toc if entry)
    if is_hierarchical:
        # Original recursive approach for hierarchical TOCs
        def traverse(
            entries: List[Dict[str, Any]],
            parent_titles: List[str],
            current_chapter: str,
            current_canonical: Optional[str],
        ) -> None:
            """Recursively traverse TOC entries and build flat list."""
            for entry in entries:
                title = entry.get("title", "")
                level = entry.get("level", 0)
                children = entry.get("children", [])
                # Build full path from parents + current title
                full_path_parts = parent_titles + [title]
                full_path = " > ".join(full_path_parts)
                # Extract canonical reference if present in title.
                # Peirce-style "CP 1.628" takes precedence over
                # Stephanus-style "Ménon 80a"; otherwise the parent's
                # reference (current_canonical) is inherited unchanged.
                canonical_ref = current_canonical
                cp_match = re.search(r'CP\s+(\d+\.\d+)', title)
                stephanus_match = re.search(r'(\w+\s+\d+[a-z])', title)
                if cp_match:
                    canonical_ref = f"CP {cp_match.group(1)}"
                elif stephanus_match:
                    canonical_ref = stephanus_match.group(1)
                # Update chapter title when entering level 1
                chapter_title = current_chapter
                if level == 1:
                    chapter_title = title
                # Create enriched entry
                enriched_entry: FlatTOCEntryEnriched = {
                    "title": title,
                    "level": level,
                    "full_path": full_path,
                    "chapter_title": chapter_title,
                    "canonical_ref": canonical_ref,
                    "parent_titles": parent_titles.copy(),
                    "index_in_flat_list": len(flat_toc),
                }
                flat_toc.append(enriched_entry)
                # Recursively process children
                if children:
                    traverse(
                        children,
                        parent_titles + [title],
                        chapter_title,
                        canonical_ref,
                    )
        traverse(toc, [], "", None)
    else:
        # New iterative approach for flat TOCs (infer hierarchy from levels)
        parent_stack: List[Dict[str, Any]] = []  # Stack of (level, title, canonical_ref)
        current_chapter = ""
        current_canonical: Optional[str] = None
        for entry in toc:
            title = entry.get("title", "")
            level = entry.get("level", 1)
            # Pop parents that are at same or deeper level
            while parent_stack and parent_stack[-1]["level"] >= level:
                parent_stack.pop()
            # Build parent titles list
            parent_titles = [p["title"] for p in parent_stack]
            # Build full path
            full_path_parts = parent_titles + [title]
            full_path = " > ".join(full_path_parts)
            # Extract canonical reference if present in title
            cp_match = re.search(r'CP\s+(\d+\.\d+)', title)
            stephanus_match = re.search(r'(\w+\s+\d+[a-z])', title)
            if cp_match:
                current_canonical = f"CP {cp_match.group(1)}"
            elif stephanus_match:
                current_canonical = stephanus_match.group(1)
            elif level == 1:
                # Reset canonical ref at level 1 if none found
                current_canonical = None
            # Inherit canonical ref from parent if not found
            # NOTE(review): current_canonical also persists across sibling
            # entries at level > 1 until a new match or a level-1 reset —
            # confirm this carry-over is intended.
            if not current_canonical and parent_stack:
                current_canonical = parent_stack[-1].get("canonical_ref")
            # Update chapter title when at level 1
            if level == 1:
                current_chapter = title
            # Create enriched entry
            enriched_entry: FlatTOCEntryEnriched = {
                "title": title,
                "level": level,
                "full_path": full_path,
                "chapter_title": current_chapter,
                "canonical_ref": current_canonical,
                "parent_titles": parent_titles.copy(),
                "index_in_flat_list": len(flat_toc),
            }
            flat_toc.append(enriched_entry)
            # Add current entry to parent stack for next iteration
            parent_stack.append({
                "level": level,
                "title": title,
                "canonical_ref": current_canonical,
            })
    return flat_toc
def extract_paragraph_number(section_text: str) -> Optional[str]:
    """Extract the paragraph number from a section title or path.

    Handles the academic paragraph numbering formats found in the corpus:
    - "628. Text..."      -> "628"
    - "§42 Text..."       -> "42"
    - "628 Text..."       -> "628"  (bare leading number, no period)
    - "80a. Text..."      -> "80a"  (Stephanus pagination)
    - "CP 5.628. Text..." -> "628"

    Args:
        section_text: Section title or path text (may be empty).

    Returns:
        Extracted paragraph number or None if not found.

    Example:
        >>> extract_paragraph_number("628. It is the instincts...")
        '628'
        >>> extract_paragraph_number("§42 On the nature of...")
        '42'
        >>> extract_paragraph_number("80a. SOCRATE: Sais-tu...")
        '80a'
    """
    if not section_text:
        return None
    # Pattern 1: Standard paragraph number at start "628. Text"
    match = re.match(r'^(\d+[a-z]?)\.\s', section_text)
    if match:
        return match.group(1)
    # Pattern 2: Section symbol "§42 Text".
    # BUGFIX: the previous regex r'\s*(\d+[a-z]?)\s' could never match the
    # documented "§42" form — re.match anchors at position 0 and '§' is
    # neither whitespace nor a digit. Anchor on the symbol itself instead.
    match = re.match(r'^§\s*(\d+[a-z]?)', section_text)
    if match:
        return match.group(1)
    # Pattern 2b: bare leading number without a period "628 Text"
    # (preserves what the old pattern 2 actually matched).
    match = re.match(r'^\s*(\d+[a-z]?)\s', section_text)
    if match:
        return match.group(1)
    # Pattern 3: CP reference "CP 5.628. Text" → extract paragraph only
    match = re.match(r'^CP\s+\d+\.(\d+)\.\s', section_text)
    if match:
        return match.group(1)
    return None
def find_matching_toc_entry(
    chunk: Dict[str, Any],
    flat_toc: List[FlatTOCEntryEnriched],
) -> Optional[FlatTOCEntryEnriched]:
    """Locate the TOC entry that corresponds to a chunk.

    Three matching strategies are attempted, strongest first:

    1. **Exact title match** — the chunk's section equals a TOC title.
    2. **Paragraph-number match** — extract the paragraph number from both
       sides and compare, restricted to level-2 entries (actual content),
       with a first-significant-word check to disambiguate duplicates.
    3. **Proximity match** — fall back to the TOC entry whose flat index
       is closest to the chunk's order_index (heuristic: TOC and chunks
       follow a similar order).

    Args:
        chunk: Chunk dict with 'section', 'sectionPath', 'order_index'.
        flat_toc: Flattened TOC with enriched metadata.

    Returns:
        The best matching TOC entry, or None when no strategy applies.

    Example:
        >>> chunk = {"section": "628. It is the instincts...", "order_index": 42}
        >>> entry = find_matching_toc_entry(chunk, flat_toc)  # doctest: +SKIP
    """
    if not flat_toc:
        return None

    section_label = chunk.get("section", chunk.get("sectionPath", ""))
    if not section_label:
        return None

    # Strategy 1: exact title match.
    for candidate in flat_toc:
        if candidate["title"] == section_label:
            return candidate

    # Strategy 2: paragraph-number match on level-2 entries.
    section_para = extract_paragraph_number(section_label)
    if section_para:
        for candidate in flat_toc:
            if candidate["level"] != 2:
                continue
            if extract_paragraph_number(candidate["title"]) != section_para:
                continue
            # Disambiguate with a light text-similarity check on the first
            # significant (length > 3) word of the chunk section.
            significant_words = [w for w in section_label.split() if len(w) > 3]
            candidate_words = [w for w in candidate["title"].split() if len(w) > 3]
            if significant_words and candidate_words:
                if significant_words[0].lower() in candidate["title"].lower():
                    return candidate
            else:
                # No text to compare; accept the paragraph-number match.
                return candidate

    # Strategy 3: proximity fallback via order_index.
    order_position = chunk.get("order_index")
    if order_position is not None and flat_toc:
        return min(
            flat_toc,
            key=lambda entry: abs(entry["index_in_flat_list"] - order_position),
        )

    return None
def enrich_chunks_with_toc(
    chunks: List[Dict[str, Any]],
    toc: List[Dict[str, Any]],
    hierarchy: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """Attach hierarchical TOC metadata to every chunk that can be matched.

    Orchestrates the enrichment pass:

    1. Bails out untouched when no TOC is available (graceful fallback).
    2. Flattens the TOC a single time for efficient matching.
    3. Matches each chunk against the flattened TOC.
    4. Writes sectionPath, chapterTitle and canonicalReference back onto
       the matched chunks (modified in place).

    Args:
        chunks: Chunk dicts produced by pdf_pipeline.
        toc: Hierarchical TOC structure (may be empty).
        hierarchy: Document hierarchy dict (may be empty).

    Returns:
        The same chunk list, with matched chunks enriched in place. When
        the TOC is empty or flattening fails, chunks are returned
        unchanged (no regression).

    Example:
        >>> chunks = [{"text": "...", "section": "628. It is..."}]
        >>> enriched = enrich_chunks_with_toc(chunks, toc, {})  # doctest: +SKIP
    """
    # Graceful fallback: nothing to enrich against.
    if not toc:
        logger.info("No TOC available, skipping chunk enrichment")
        return chunks

    logger.info(f"Enriching {len(chunks)} chunks with TOC metadata...")

    # Flatten the TOC once; matching every chunk against the nested
    # structure directly would be needlessly expensive.
    try:
        flattened = flatten_toc_with_paths(toc, hierarchy)
        logger.info(f"Flattened TOC: {len(flattened)} entries")
    except Exception as e:
        logger.error(f"Failed to flatten TOC: {e}")
        return chunks  # Fallback on error

    matched_total = 0
    for chunk in chunks:
        toc_hit = find_matching_toc_entry(chunk, flattened)
        if not toc_hit:
            continue
        # Full hierarchical path, e.g. "Peirce: CP 1.628 > 628. It is..."
        chunk["sectionPath"] = toc_hit["full_path"]
        chunk["chapterTitle"] = toc_hit["chapter_title"]
        # Canonical academic reference (e.g. "CP 1.628"), when known.
        if toc_hit["canonical_ref"]:
            chunk["canonicalReference"] = toc_hit["canonical_ref"]
        matched_total += 1

    if chunks:
        logger.info(
            f"Enriched {matched_total}/{len(chunks)} chunks "
            f"({100 * matched_total / len(chunks):.1f}%)"
        )
    else:
        logger.info("No chunks to enrich")

    return chunks

View File

@@ -0,0 +1,260 @@
"""Table of Contents (TOC) extraction using Mistral OCR with annotations.
This module is the **primary entry point** for TOC extraction in the Library RAG
pipeline. It provides intelligent routing between two extraction strategies:
1. **Visual (bbox) Analysis** (default, recommended): Uses bounding box coordinates
to detect indentation and hierarchy based on horizontal positioning.
2. **Semantic (annotation) Analysis**: Uses Mistral's document_annotation_format
for structured metadata and TOC extraction.
The visual approach is more reliable for philosophical texts with complex
hierarchies (parts, chapters, sections, subsections).
Extraction Strategies:
┌─────────────────────────────────────────────────────────────┐
│ extract_toc_from_annotations(use_visual_bbox=True) │
│ ↓ (default) │
│ toc_extractor_visual.py → X-coordinate based hierarchy │
│ │
│ extract_toc_from_annotations(use_visual_bbox=False) │
│ ↓ │
│ DocumentMetadata Pydantic schema → Structured extraction │
└─────────────────────────────────────────────────────────────┘
Cost Considerations:
- Annotated OCR: ~0.003€/page (3x standard OCR cost)
- Only first N pages are processed (default: 8)
- Total cost: max_toc_pages × 0.003€
Output Structure:
{
"success": bool,
"metadata": {...}, # Document metadata
"toc": [...], # Hierarchical TOC (nested children)
"toc_flat": [...], # Flat list with levels
"cost_ocr_annotated": float
}
Example:
>>> from pathlib import Path
>>> from utils.toc_extractor import extract_toc_from_annotations
>>>
>>> # Extract TOC using visual analysis (recommended)
>>> result = extract_toc_from_annotations(
... pdf_path=Path("input/philosophy_book.pdf"),
... max_toc_pages=8,
... use_visual_bbox=True # default
... )
>>> if result["success"]:
... for entry in result["toc"]:
... print(f"{entry['title']} (p.{entry['page']})")
Functions:
- extract_toc_from_annotations(): Main entry point with strategy routing
- build_hierarchical_toc(): Converts flat TOC entries to nested structure
- map_toc_to_content(): Associates TOC entries with document content
See Also:
- utils.toc_extractor_visual: Visual/bbox-based extraction (default)
- utils.toc_extractor_markdown: Markdown indentation-based extraction
- utils.llm_toc: LLM-based TOC extraction (alternative approach)
"""
import json
import logging
from typing import Any, Dict, List, Optional, Union, cast
from pathlib import Path
from .ocr_schemas import DocumentMetadata, TocEntry
from .ocr_processor import run_ocr_with_annotations
from .mistral_client import create_client
logger: logging.Logger = logging.getLogger(__name__)
# TypedDict for hierarchical TOC nodes
class TOCNode(Dict[str, Any]):
    """Type alias for TOC node structure with title, page, level, type, children."""
    # NOTE(review): despite the docstring, this is a Dict subclass rather
    # than a true alias (``TOCNode = Dict[str, Any]``) or a TypedDict.
    # Instances behave like plain dicts; confirm no isinstance checks rely
    # on the subclass before simplifying.
    pass
def extract_toc_from_annotations(
    pdf_path: Path,
    api_key: Optional[str] = None,
    max_toc_pages: int = 8,
    use_visual_bbox: bool = True,  # NEW: visual (bbox) analysis is the default
) -> Dict[str, Any]:
    """Extract a structured TOC via OCR with annotations.

    Cost: 3 EUR / 1000 annotated pages (vs 1 EUR / 1000 for basic OCR).

    Args:
        pdf_path: Path to the PDF file.
        api_key: Mistral API key (optional; otherwise resolved from .env).
        max_toc_pages: Maximum number of pages to annotate (default 8,
            the API limit for document_annotation).
        use_visual_bbox: When True, use visual bounding-box analysis
            (more reliable) instead of the semantic annotation schema.

    Returns:
        Dict with:
            - success: bool
            - metadata: dict of enriched document metadata
            - toc: hierarchical list [{title, page, level, children}]
            - toc_flat: flat list [{title, page, level, type, parent_title}]
            - cost_ocr_annotated: float (cost in EUR)
            - error: str (on failure)
    """
    # When requested, delegate to the visual (bbox) strategy.
    if use_visual_bbox:
        logger.info("Utilisation de l'analyse visuelle (bbox) pour extraction TOC")
        from .toc_extractor_visual import extract_toc_with_visual_analysis
        return cast(Dict[str, Any], extract_toc_with_visual_analysis(pdf_path, api_key, max_toc_pages))
    # Otherwise continue with the semantic approach (document_annotation_format).
    try:
        client = create_client(api_key)
        pdf_bytes = pdf_path.read_bytes()
    except Exception as e:
        logger.error(f"Erreur initialisation client/lecture PDF : {e}")
        return {"success": False, "error": f"Initialisation échouée : {str(e)}"}
    # Phase 1: annotate the first pages to extract TOC + metadata.
    logger.info(f"Extraction TOC avec annotations sur {max_toc_pages} premières pages")
    try:
        annotated_response = run_ocr_with_annotations(
            client=client,
            file_bytes=pdf_bytes,
            filename=pdf_path.name,
            include_images=False,  # Images are not needed for the TOC
            document_annotation_format=DocumentMetadata,
            pages=list(range(max_toc_pages)),  # Pages 0 to max_toc_pages-1
        )
    except Exception as e:
        logger.error(f"Erreur appel OCR avec annotations : {e}")
        return {"success": False, "error": f"Appel OCR échoué : {str(e)}"}
    # Extract the document-level annotations from the response.
    doc_annotation = getattr(annotated_response, "document_annotation", None)
    if not doc_annotation:
        return {"success": False, "error": "Aucune annotation retournée par l'API"}
    # Convert to a dictionary (the API may return a JSON string).
    try:
        if isinstance(doc_annotation, str):
            metadata_dict = json.loads(doc_annotation)
        else:
            metadata_dict = doc_annotation
    except Exception as e:
        logger.error(f"Erreur parsing annotations : {e}")
        return {"success": False, "error": f"Parsing annotations échoué : {str(e)}"}
    # Validate with Pydantic and assemble the result payload.
    try:
        metadata = DocumentMetadata(**metadata_dict)
        toc_entries = metadata.toc.entries
        logger.info(f"TOC extraite : {len(toc_entries)} entrées")
        # Build the hierarchical TOC from the flat entries.
        hierarchical_toc = build_hierarchical_toc(toc_entries)
        return {
            "success": True,
            "metadata": metadata.model_dump(),
            "toc": hierarchical_toc,
            "toc_flat": [entry.model_dump() for entry in toc_entries],
            "cost_ocr_annotated": max_toc_pages * 0.003,  # 3 EUR / 1000 pages
        }
    except Exception as e:
        logger.error(f"Erreur validation annotations : {e}")
        return {"success": False, "error": f"Validation Pydantic échouée : {str(e)}"}
def build_hierarchical_toc(entries: List[TocEntry]) -> List[Dict[str, Any]]:
    """Turn a flat, level-annotated TOC into a nested tree.

    Walks the entries in document order while tracking the chain of currently
    open ancestors. Each entry becomes a child of the nearest preceding entry
    whose level is strictly smaller, or a root node when no such ancestor
    exists.

    Args:
        entries: Flat TocEntry items with level (1 = root, 2 = child of 1, ...).

    Returns:
        Nested structure [{title, page, level, type, children: [...]}].
    """
    roots: List[Dict[str, Any]] = []
    open_chain: List[Dict[str, Any]] = []  # Ancestors of the node being placed
    for item in entries:
        current: Dict[str, Any] = {
            "title": item.title,
            "page": item.page_number,
            "level": item.level,
            "type": item.entry_type.value,
            "children": [],
        }
        # Drop every open ancestor that is as deep as (or deeper than) this
        # entry: a level-N node must hang under the last node with level < N.
        while open_chain and open_chain[-1]["level"] >= item.level:
            open_chain.pop()
        parent_bucket = open_chain[-1]["children"] if open_chain else roots
        parent_bucket.append(current)
        open_chain.append(current)
    return roots
def map_toc_to_content(
    toc_entries: List[TocEntry],
    all_pages_markdown: str,
) -> Dict[str, str]:
    """Associate each TOC entry with the text of its page range.

    Splits the full markdown on the ``<!-- Page N -->`` markers, then gives
    every entry the pages from its own start page up to (but excluding) the
    next entry's start page; the last entry runs to the end of the document.

    Args:
        toc_entries: TOC entries carrying real printed page numbers.
        all_pages_markdown: Full document markdown with <!-- Page N --> markers.

    Returns:
        Mapping {section_title: content_text}.
    """
    segments: List[str] = all_pages_markdown.split("<!-- Page ")
    # End boundary of each section = start page of the next one; the final
    # section extends to the last split segment.
    boundaries: List[int] = [e.page_number for e in toc_entries[1:]] + [len(segments)]
    mapping: Dict[str, str] = {}
    for entry, stop in zip(toc_entries, boundaries):
        pieces: List[str] = []
        for idx in range(entry.page_number, stop):
            if idx < len(segments):
                # Strip the remainder of the page marker ("N -->") and keep
                # only the page body.
                pieces.append(segments[idx].split("-->", 1)[-1].strip())
        mapping[entry.title] = "\n\n".join(pieces)
    return mapping

View File

@@ -0,0 +1,303 @@
"""TOC extraction via Markdown indentation analysis.
This module provides a **cost-free** TOC extraction strategy that works on
already-generated Markdown text. Unlike the OCR annotation approach, this
method doesn't require additional API calls.
Strategy:
1. Search for "Table des matières" heading in the first N lines
2. Parse lines matching pattern: "Title.....Page" or "Title Page"
3. Detect hierarchy from leading whitespace (indentation)
4. Build nested TOC structure using stack-based algorithm
When to Use:
- When OCR has already been performed (markdown available)
- When cost optimization is critical (no additional API calls)
- For documents with clear indentation in the TOC
Limitations:
- Requires French "Table des matières" header (can be extended)
- Indentation detection may be less accurate than visual/bbox analysis
- Only works if OCR preserved whitespace accurately
Indentation Levels:
- 0-2 spaces: Level 1 (main chapters/parts)
- 3-6 spaces: Level 2 (sections)
- 7+ spaces: Level 3 (subsections)
Output Structure:
{
"success": bool,
"toc": [...], # Hierarchical TOC
"toc_flat": [...], # Flat entries with levels
"cost_ocr_annotated": 0.0, # No additional cost
"method": "markdown_indentation"
}
Example:
>>> from utils.toc_extractor_markdown import extract_toc_from_markdown
>>>
>>> markdown = '''
... # Table des matières
... Introduction.............................5
... Première partie..........................10
... Chapitre 1............................15
... Chapitre 2............................25
... Deuxième partie..........................50
... '''
>>> result = extract_toc_from_markdown(markdown)
>>> if result["success"]:
... print(f"Found {len(result['toc_flat'])} entries")
Found 5 entries
Functions:
- extract_toc_from_markdown(): Main extraction from markdown text
- build_hierarchy(): Converts flat entries to nested structure
See Also:
- utils.toc_extractor: Main entry point (routes to visual by default)
- utils.toc_extractor_visual: More accurate X-position based extraction
"""
from __future__ import annotations
import logging
import re
from typing import Any, Dict, List, Optional, TypedDict, Union
from pathlib import Path
logger = logging.getLogger(__name__)
# Type definitions for internal data structures
class MarkdownTOCEntryRaw(TypedDict):
    """Raw TOC entry extracted from markdown with indentation info.

    Produced by extract_toc_from_markdown() and consumed by build_hierarchy().
    """
    title: str  # Entry title, without leader dots or the trailing page number
    page_number: int  # Printed book page number parsed from the end of the line
    level: int  # Hierarchy level derived from indentation (1 = top level)
    leading_spaces: int  # Raw count of leading spaces on the original line
class MarkdownTOCNode(TypedDict):
    """Hierarchical TOC node with children.

    Recursive structure built by build_hierarchy(); each node nests its
    descendants under ``children``.
    """
    title: str  # Entry title
    page: int  # Printed book page number
    level: int  # Hierarchy level (1 = top level)
    type: str  # Always "section" for markdown-derived entries
    children: List[MarkdownTOCNode]  # Nested sub-entries (empty for leaves)
class MarkdownTOCFlatEntry(TypedDict):
    """Flat TOC entry with parent information."""
    title: str  # Entry title
    page_number: int  # Printed book page number
    level: int  # Hierarchy level (1 = top level)
    entry_type: str  # Always "section" for markdown-derived entries
    parent_title: Optional[str]  # Always None here; hierarchy lives in the nested `toc`
class MarkdownTOCResultSuccess(TypedDict):
    """Successful TOC extraction result."""
    success: bool  # Always True
    metadata: Dict[str, Any]  # Always empty dict here (parity with other extractors)
    toc: List[MarkdownTOCNode]  # Hierarchical (nested) TOC
    toc_flat: List[MarkdownTOCFlatEntry]  # Flat entries with levels
    cost_ocr_annotated: float  # Always 0.0 — reuses existing OCR, no extra API call
    method: str  # Always "markdown_indentation"
class MarkdownTOCResultError(TypedDict):
    """Failed TOC extraction result."""
    success: bool  # Always False
    error: str  # Human-readable failure reason (French, mirrors the log messages)
# Union type for function return; discriminate on the `success` key.
MarkdownTOCResult = Union[MarkdownTOCResultSuccess, MarkdownTOCResultError]
def extract_toc_from_markdown(
    markdown_text: str,
    max_lines: int = 200,
) -> MarkdownTOCResult:
    """Extract table of contents by analyzing raw markdown text.

    Detects hierarchy by counting leading spaces (indentation) at the
    beginning of each line. This is a cost-free alternative to OCR
    annotation-based extraction.

    Args:
        markdown_text: Complete markdown text of the document.
        max_lines: Maximum number of lines to analyze (searches TOC at start).

    Returns:
        Dictionary with hierarchical TOC structure. On success, includes:
            - success: True
            - metadata: Empty dict (for consistency with other extractors)
            - toc: Hierarchical nested TOC structure
            - toc_flat: Flat list of entries with levels
            - cost_ocr_annotated: 0.0 (no additional cost)
            - method: "markdown_indentation"
        On failure, includes:
            - success: False
            - error: Error message string

    Example:
        >>> markdown = '''
        ... # Table des matières
        ... Introduction.....5
        ... Part One........10
        ... Chapter 1.....15
        ... '''
        >>> result = extract_toc_from_markdown(markdown)
        >>> if result["success"]:
        ...     print(len(result["toc_flat"]))
        3
    """
    logger.info("Extraction TOC depuis markdown (analyse indentation)")
    lines: List[str] = markdown_text.split('\n')[:max_lines]
    # Find "Table des matières" section header
    toc_start: Optional[int] = None
    for i, line in enumerate(lines):
        if re.search(r'table\s+des\s+mati[èe]res', line, re.IGNORECASE):
            toc_start = i + 1
            logger.info(f"TOC trouvée à la ligne {i}")
            break
    if toc_start is None:
        logger.warning("Aucune table des matières trouvée dans le markdown")
        return MarkdownTOCResultError(
            success=False,
            error="Table des matières introuvable"
        )
    # Compile the two line patterns once (they run on up to 100 lines):
    # "Title....12" with dotted leaders first, then plain "Title 12" fallback.
    dotted_re: re.Pattern[str] = re.compile(r'^(.+?)\s*\.{2,}\s*(\d+)\s*$')
    plain_re: re.Pattern[str] = re.compile(r'^(.+?)\s+(\d+)\s*$')
    # Extract TOC entries
    entries: List[MarkdownTOCEntryRaw] = []
    for line in lines[toc_start:toc_start + 100]:  # Max 100 lines of TOC
        line_stripped: str = line.strip()
        if not line_stripped or line_stripped.startswith(('#', '---')):
            continue
        # `line` still carries its original leading whitespace, so the
        # indentation is measured on it directly. (The previous code
        # re-located the line via lines.index(line) — an O(n) scan per line
        # that could only ever resolve to an equal string anyway.)
        leading_spaces: int = len(line) - len(line.lstrip())
        match: Optional[re.Match[str]] = dotted_re.match(line_stripped)
        if not match:
            # Try without dotted leaders
            match = plain_re.match(line_stripped)
        if match:
            title: str = match.group(1).strip()
            page: int = int(match.group(2))
            # Ignore lines too short or that don't look like titles
            if len(title) < 3 or title.isdigit():
                continue
            # Determine level from indentation:
            # 0-2 spaces = level 1, 3-6 spaces = level 2, 7+ spaces = level 3
            level: int
            if leading_spaces <= 2:
                level = 1
            elif leading_spaces <= 6:
                level = 2
            else:
                level = 3
            entries.append(MarkdownTOCEntryRaw(
                title=title,
                page_number=page,
                level=level,
                leading_spaces=leading_spaces,
            ))
            logger.debug(f"  '{title}' → {leading_spaces} espaces → level {level} (page {page})")
    if not entries:
        logger.warning("Aucune entrée TOC extraite")
        return MarkdownTOCResultError(
            success=False,
            error="Aucune entrée TOC trouvée"
        )
    logger.info(f"{len(entries)} entrées extraites depuis markdown")
    # Build hierarchy
    toc: List[MarkdownTOCNode] = build_hierarchy(entries)
    return MarkdownTOCResultSuccess(
        success=True,
        metadata={},
        toc=toc,
        toc_flat=[
            MarkdownTOCFlatEntry(
                title=e["title"],
                page_number=e["page_number"],
                level=e["level"],
                entry_type="section",
                parent_title=None,
            )
            for e in entries
        ],
        cost_ocr_annotated=0.0,  # No additional cost, uses existing OCR
        method="markdown_indentation",
    )
def build_hierarchy(entries: List[MarkdownTOCEntryRaw]) -> List[MarkdownTOCNode]:
    """Convert flat, level-annotated TOC entries into a nested tree.

    Walks the entries in order while keeping a stack of the currently open
    ancestors. An entry becomes a child of the nearest preceding entry with a
    strictly smaller level, or a root node when no such ancestor exists.

    Args:
        entries: Flat TOC entries carrying title, page_number and level.

    Returns:
        List of root nodes; each node nests its descendants under "children".

    Example:
        >>> entries = [
        ...     {"title": "Part 1", "page_number": 1, "level": 1, "leading_spaces": 0},
        ...     {"title": "Chapter 1", "page_number": 5, "level": 2, "leading_spaces": 4},
        ... ]
        >>> build_hierarchy(entries)[0]["children"][0]["title"]
        'Chapter 1'
    """
    roots: List[MarkdownTOCNode] = []
    ancestors: List[MarkdownTOCNode] = []  # Chain of currently open nodes
    for raw in entries:
        node: MarkdownTOCNode = MarkdownTOCNode(
            title=raw["title"],
            page=raw["page_number"],
            level=raw["level"],
            type="section",
            children=[],
        )
        # Close every open ancestor that is at the same depth or deeper.
        while ancestors and ancestors[-1]["level"] >= raw["level"]:
            ancestors.pop()
        # Attach under the innermost remaining ancestor, or at the root.
        (ancestors[-1]["children"] if ancestors else roots).append(node)
        ancestors.append(node)
    return roots

View File

@@ -0,0 +1,512 @@
"""Visual TOC extraction using bounding box X-coordinate analysis.
This module provides the **most accurate** TOC extraction strategy for
philosophical texts by analyzing the horizontal position (X-coordinate)
of each TOC entry. This approach is more reliable than text indentation
analysis because it directly measures visual layout.
How It Works:
1. OCR with annotations extracts text + bounding box positions
2. Pydantic schema (TocEntryBbox) captures title, page, and x_position
3. X-coordinates are clustered to identify distinct indentation levels
4. Hierarchy is built based on relative X-positions
X-Position Interpretation:
The x_position is normalized between 0.0 (left edge) and 1.0 (right edge):
- x ≈ 0.05-0.12: Level 1 (no indentation, main parts/chapters)
- x ≈ 0.13-0.22: Level 2 (small indentation, sections)
- x ≈ 0.23-0.35: Level 3 (double indentation, subsections)
Positions within 0.03 tolerance are grouped into the same level.
Advantages over Markdown Analysis:
- Works regardless of OCR whitespace accuracy
- More reliable for complex hierarchies
- Handles both printed and handwritten indentation
Cost:
- Uses OCR with annotations: ~0.003€/page
- Only processes first N pages (default: 8)
Pydantic Schemas:
- TocEntryBbox: Single TOC entry with text, page_number, x_position
- DocumentTocBbox: Container for list of entries
Output Structure:
{
"success": bool,
"metadata": {...},
"toc": [...], # Hierarchical TOC
"toc_flat": [...], # Flat entries with levels
"cost_ocr_annotated": float,
"method": "visual_x_position"
}
Example:
>>> from pathlib import Path
>>> from utils.toc_extractor_visual import extract_toc_with_visual_analysis
>>>
>>> result = extract_toc_with_visual_analysis(
... pdf_path=Path("input/philosophy_book.pdf"),
... max_toc_pages=8
... )
>>> if result["success"]:
... for entry in result["toc"]:
... indent = " " * (entry["level"] - 1)
... print(f"{indent}{entry['title']} (p.{entry['page']})")
Algorithm Details:
1. Collect all x_position values from OCR response
2. Sort and cluster positions (tolerance: 0.03)
3. Compute cluster centroids as level thresholds
4. Assign level to each entry based on nearest centroid
5. Build hierarchy using stack-based approach
Functions:
- extract_toc_with_visual_analysis(): Main extraction function
- build_hierarchy_from_bbox(): Converts entries with X-positions to hierarchy
- flatten_toc(): Flattens hierarchical TOC for storage
See Also:
- utils.toc_extractor: Main entry point (routes here by default)
- utils.toc_extractor_markdown: Alternative cost-free extraction
"""
from __future__ import annotations
import json
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Type, TypedDict, Union
from pydantic import BaseModel, Field
from .mistral_client import create_client
from .ocr_processor import run_ocr_with_annotations
logger: logging.Logger = logging.getLogger(__name__)
class TocEntryBbox(BaseModel):
    """TOC entry with bounding box for visual detection.

    Attributes:
        text: Complete entry text as it appears in the table of contents.
            Example: 'Presentation' or 'What is virtue?' or 'Meno or on virtue'.
            DO NOT include leader dots or page number in this field.
        page_number: Actual page number as printed in the book (the visible number
            on the right in the TOC). Example: if the line says 'Presentation.....3',
            extract the number 3. This is the BOOK page number, not the PDF index.
        x_position: Horizontal position (X coordinate) of the text start, normalized
            between 0 and 1. This is the CRUCIAL COORDINATE for detecting indentation:
            - x ≈ 0.05-0.12 = left-aligned title, NOT indented (hierarchical level 1)
            - x ≈ 0.13-0.22 = title with SMALL indentation (hierarchical level 2)
            - x ≈ 0.23-0.35 = title with DOUBLE indentation (hierarchical level 3)
            Measure precisely where the first character of the title begins.
    """
    # NOTE(review): the French Field descriptions below are serialized into the
    # JSON schema handed to the OCR annotation API (this model is passed as
    # document_annotation_format), so they act as extraction instructions.
    # Presumably they must stay in French and in sync with the docstring above
    # — confirm against the API behavior before rewording them.
    text: str = Field(..., description="""Texte COMPLET de l'entrée tel qu'il apparaît dans la table des matières.
Exemple: 'Présentation' ou 'Qu'est-ce que la vertu ?' ou 'Ménon ou de la vertu'.
NE PAS inclure les points de suite ni le numéro de page dans ce champ.""")
    page_number: int = Field(..., description="""Numéro de page réel tel qu'imprimé dans le livre (le numéro visible à droite dans la TOC).
Exemple: si la ligne dit 'Présentation.....3', extraire le nombre 3.
C'est le numéro de page du LIVRE, pas l'index PDF.""")
    x_position: float = Field(..., description="""Position horizontale (coordonnée X) du début du texte, normalisée entre 0 et 1.
C'est LA COORDONNÉE CRUCIALE pour détecter l'indentation:
- x ≈ 0.05-0.12 = titre aligné à gauche, NON indenté (niveau hiérarchique 1)
- x ≈ 0.13-0.22 = titre avec PETITE indentation (niveau hiérarchique 2)
- x ≈ 0.23-0.35 = titre avec DOUBLE indentation (niveau hiérarchique 3)
Mesurer précisément où commence le premier caractère du titre.""")
class DocumentTocBbox(BaseModel):
    """Schema for extracting all TOC entries with their positions.

    Attributes:
        entries: Complete list of ALL entries found in the table of contents.
            For EACH line in the TOC, extract:
            1. The title text (without leader dots)
            2. The page number (the number on the right)
            3. The exact horizontal X position of the title start (to detect indentation)
            Include ALL entries, even those that appear to be at the same visual level.
    """
    # NOTE(review): the Field description is part of the JSON schema sent to the
    # OCR annotation API and guides extraction — keep it intact.
    entries: List[TocEntryBbox] = Field(
        ...,
        description="""Complete list of ALL entries found in the table of contents.
For EACH line in the TOC, extract:
1. The title text (without leader dots)
2. The page number (the number on the right)
3. The exact horizontal X position of the title start (to detect indentation)
Include ALL entries, even those that appear to be at the same visual level.""",
    )
# TypedDict classes for structured return types
class VisualTOCMetadata(TypedDict):
    """Metadata extracted from the document.

    Attributes:
        title: Document title.
        author: Document author.
        languages: List of languages present in the document.
        summary: Brief document summary.
    """
    title: str  # Falls back to the PDF filename stem in visual mode
    author: str  # "Unknown author" in visual mode (no metadata pass is run)
    languages: List[str]  # Empty in visual mode
    summary: str  # Empty in visual mode
class VisualTOCNode(TypedDict):
    """Hierarchical TOC node.

    Attributes:
        title: Entry title text.
        page: Page number in the book.
        level: Hierarchical level (1 = top level, 2 = subsection, etc.).
        type: Entry type (e.g., "section", "chapter").
        children: List of child nodes.
    """
    title: str
    page: int  # Printed book page number
    level: int  # 1 = top level
    type: str  # Always "section" when built by build_hierarchy_from_bbox
    children: List[VisualTOCNode]  # Recursive: nested VisualTOCNode dicts (empty for leaves)
class VisualTOCFlatEntry(TypedDict):
    """Flattened TOC entry for storage.

    Attributes:
        title: Entry title text.
        page_number: Page number in the book.
        level: Hierarchical level.
        entry_type: Entry type (e.g., "section", "chapter").
        parent_title: Title of the parent entry, if any.
    """
    title: str
    page_number: int  # Printed book page number
    level: int  # 1 = top level
    entry_type: str  # Mirrors VisualTOCNode["type"]
    parent_title: Optional[str]  # None for top-level entries (see flatten_toc)
class VisualTOCResultSuccess(TypedDict):
    """Successful TOC extraction result.

    Attributes:
        success: Always True for success case.
        metadata: Document metadata.
        toc: Hierarchical TOC structure.
        toc_flat: Flattened TOC entries.
        cost_ocr_annotated: OCR processing cost in euros.
        method: Extraction method identifier.
    """
    success: bool  # Always True in this variant
    metadata: VisualTOCMetadata
    toc: List[VisualTOCNode]
    toc_flat: List[VisualTOCFlatEntry]
    cost_ocr_annotated: float  # Computed as max_toc_pages * 0.003 (euros)
    method: str  # Always "visual_x_position"
class VisualTOCResultError(TypedDict):
    """Failed TOC extraction result.

    Attributes:
        success: Always False for error case.
        error: Error message describing the failure.
    """
    success: bool  # Always False in this variant
    error: str  # Human-readable failure description
# Union type for the function return; discriminate on the `success` key.
VisualTOCResult = Union[VisualTOCResultSuccess, VisualTOCResultError]
class VisualTOCEntryInternal(TypedDict):
    """Internal representation of TOC entry during processing.

    Attributes:
        text: Entry title text.
        page_number: Page number in the book.
        x_position: Normalized X position (0.0 to 1.0).
        x_start: Same as x_position (for processing).
        page: Same as page_number (for processing).
        level: Computed hierarchical level.
    """
    text: str
    page_number: int
    x_position: float  # Normalized: 0.0 = left edge, 1.0 = right edge
    x_start: float  # Duplicate of x_position used during clustering
    page: int  # Duplicate of page_number used during tree building
    level: int  # Filled in by build_hierarchy_from_bbox after clustering
def extract_toc_with_visual_analysis(
    pdf_path: Path,
    api_key: Optional[str] = None,
    max_toc_pages: int = 8,
) -> VisualTOCResult:
    """Extract the TOC by analyzing the bounding boxes of its entries.

    Runs OCR with structured annotations over the first ``max_toc_pages``
    pages, asking the model for each TOC line's text, printed page number and
    normalized X coordinate, then derives the hierarchy from those X
    positions via build_hierarchy_from_bbox.

    Args:
        pdf_path: Path to the PDF file.
        api_key: Mistral API key; falls back to the environment when omitted.
        max_toc_pages: Number of leading pages to analyze (default: 8).

    Returns:
        On success: metadata, hierarchical TOC, flat TOC, cost and method.
        On failure: ``{"success": False, "error": ...}``. Never raises;
        all errors are reported through the result dictionary.
    """
    try:
        client = create_client(api_key)
        pdf_bytes: bytes = pdf_path.read_bytes()
    except Exception as e:
        logger.error(f"Initialization error: {e}")
        return {"success": False, "error": str(e)}
    logger.info(f"Visual TOC extraction on {max_toc_pages} pages")
    # Request document-level annotations so the model returns TocEntryBbox data
    try:
        ocr_response = run_ocr_with_annotations(
            client=client,
            file_bytes=pdf_bytes,
            filename=pdf_path.name,
            include_images=False,
            document_annotation_format=DocumentTocBbox,
            pages=list(range(max_toc_pages)),
        )
    except Exception as e:
        logger.error(f"OCR with annotations error: {e}")
        return {"success": False, "error": f"OCR failed: {str(e)}"}
    raw_annotation: Any = getattr(ocr_response, "document_annotation", None)
    if not raw_annotation:
        return {"success": False, "error": "No annotation returned"}
    try:
        # The SDK may hand back either a JSON string or an already-parsed object
        parsed: Any = (
            json.loads(raw_annotation)
            if isinstance(raw_annotation, str)
            else raw_annotation
        )
        entry_dicts: List[Dict[str, Any]] = (
            parsed.get("entries", []) if isinstance(parsed, dict) else parsed
        )
        # Derive the hierarchy from the X coordinates
        hierarchy: List[VisualTOCNode] = build_hierarchy_from_bbox(entry_dicts)
        logger.info(f"TOC extracted visually: {len(hierarchy)} entries")
        # Visual mode carries no enriched metadata; fill in neutral defaults
        metadata: VisualTOCMetadata = {
            "title": pdf_path.stem,
            "author": "Unknown author",
            "languages": [],
            "summary": "",
        }
        return {
            "success": True,
            "metadata": metadata,
            "toc": hierarchy,
            "toc_flat": flatten_toc(hierarchy),
            "cost_ocr_annotated": max_toc_pages * 0.003,
            "method": "visual_x_position",
        }
    except Exception as e:
        logger.error(f"Bbox parsing error: {e}")
        return {"success": False, "error": f"Parsing failed: {str(e)}"}
def build_hierarchy_from_bbox(entries: List[Dict[str, Any]]) -> List[VisualTOCNode]:
    """Build TOC hierarchy from X positions (indentation).

    Clusters the normalized X coordinates of the entries (tolerance 0.03) to
    identify distinct indentation levels, assigns each entry the level of its
    nearest cluster centroid, then builds the tree with a stack.

    Unlike earlier versions, the caller's dicts are NOT mutated: entries are
    copied into internal records before any level assignment.

    Args:
        entries: List of entries where each dict should have:
            - text: Entry title
            - page_number: Page number
            - x_position: Normalized X coordinate (0.0 to 1.0)

    Returns:
        Hierarchical TOC structure. Each node contains title, page, level,
        type (always "section") and children.

    Example:
        >>> entries = [
        ...     {"text": "Chapter 1", "page_number": 1, "x_position": 0.1},
        ...     {"text": "Section 1.1", "page_number": 2, "x_position": 0.2},
        ... ]
        >>> build_hierarchy_from_bbox(entries)[0]["children"][0]["title"]
        'Section 1.1'
    """
    log: logging.Logger = logging.getLogger(__name__)
    if not entries:
        return []
    # Copy into internal records instead of writing x_start/page/level back
    # into the caller's dicts — the previous in-place mutation was a
    # surprising side effect for callers reusing their input.
    records: List[VisualTOCEntryInternal] = []
    for raw in entries:
        x_start: float = raw.get("x_position", 0.1)
        records.append({
            "text": raw.get("text", ""),
            "page_number": raw.get("page_number", 0),
            "x_position": x_start,
            "x_start": x_start,
            "page": raw.get("page_number", 0),
            "level": 0,  # Assigned below once the clusters are known
        })
    # Unique X positions, sorted, to detect the indentation thresholds
    x_positions: List[float] = sorted({r["x_start"] for r in records})
    if not x_positions:
        log.warning("No X position detected")
        return []
    # Group nearby positions (tolerance 0.03 to normalize small variations)
    # and keep each group's centroid as a level threshold.
    x_levels: List[float] = []
    group: List[float] = [x_positions[0]]
    for x in x_positions[1:]:
        if x - group[-1] < 0.03:
            group.append(x)
        else:
            x_levels.append(sum(group) / len(group))
            group = [x]
    if group:
        x_levels.append(sum(group) / len(group))
    log.info(
        f"Indentation levels detected (X positions): {[f'{x:.3f}' for x in x_levels]}"
    )
    # Assign each record the 1-based index of its nearest centroid
    for rec in records:
        x_val: float = rec["x_start"]
        level: int = min(range(len(x_levels)), key=lambda i: abs(x_levels[i] - x_val)) + 1
        rec["level"] = level
        log.debug(f"  '{rec['text']}' -> X={x_val:.3f} -> level {level}")
    # Stack-based tree construction: pop until the top is strictly shallower
    toc: List[VisualTOCNode] = []
    stack: List[VisualTOCNode] = []
    for rec in records:
        node: VisualTOCNode = {
            "title": rec["text"].strip(),
            "page": rec["page"],
            "level": rec["level"],
            "type": "section",
            "children": [],
        }
        while stack and stack[-1]["level"] >= node["level"]:
            stack.pop()
        if stack:
            stack[-1]["children"].append(node)
        else:
            toc.append(node)
        stack.append(node)
    return toc
def flatten_toc(toc: List[VisualTOCNode]) -> List[VisualTOCFlatEntry]:
    """Flatten a hierarchical TOC into a list in pre-order.

    Each entry keeps a reference to its parent through ``parent_title``
    (None for top-level entries). Uses an explicit stack instead of
    recursion; the traversal order matches a depth-first, parent-first walk.

    Args:
        toc: Hierarchical TOC structure (list of VisualTOCNode).

    Returns:
        Flat list of TOC entries with parent references.

    Example:
        >>> toc = [{
        ...     "title": "Chapter 1", "page": 1, "level": 1, "type": "section",
        ...     "children": [{
        ...         "title": "Section 1.1", "page": 2, "level": 2,
        ...         "type": "section", "children": []
        ...     }]
        ... }]
        >>> flat = flatten_toc(toc)
        >>> len(flat)
        2
        >>> flat[1]["parent_title"]
        'Chapter 1'
    """
    flat: List[VisualTOCFlatEntry] = []
    # Seed the stack with root nodes (reversed so pop() yields original order)
    pending: List[Any] = [(node, None) for node in reversed(toc)]
    while pending:
        node, parent_title = pending.pop()
        flat.append({
            "title": node["title"],
            "page_number": node["page"],
            "level": node["level"],
            "entry_type": node["type"],
            "parent_title": parent_title,
        })
        # Push children reversed so they are emitted left-to-right
        for child in reversed(node.get("children") or []):
            pending.append((child, node["title"]))
    return flat

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,815 @@
"""Weaviate document ingestion module for the Library RAG pipeline.
This module handles the ingestion of processed documents (chunks, metadata,
summaries) into the Weaviate vector database. It supports the V3.0 schema
with nested objects for efficient semantic search.
Architecture:
The module uses four Weaviate collections:
- **Work**: Represents a literary/philosophical work (title, author, year)
- **Document**: A specific edition/version of a work (sourceId, pages, TOC)
- **Chunk**: Text chunks with vectorized content for semantic search
- **Summary**: Section summaries with vectorized concepts
Chunks and Summaries use nested objects to reference their parent
Work and Document, avoiding data duplication while enabling
efficient filtering.
Batch Operations:
The module uses Weaviate insert_many() for efficient batch insertion.
Chunks are prepared as a list and inserted in a single operation,
which is significantly faster than individual insertions.
Nested Objects:
Each Chunk contains nested work and document objects::
{
"text": "La justice est une vertu...",
"work": {"title": "La Republique", "author": "Platon"},
"document": {"sourceId": "platon_republique", "edition": "GF"}
}
This enables filtering like: document.sourceId == "platon_republique"
Typical Usage:
>>> from utils.weaviate_ingest import ingest_document, delete_document_chunks
>>>
>>> # Ingest a processed document
>>> result = ingest_document(
... doc_name="platon_republique",
... chunks=[{"text": "La justice est...", "section": "Livre I"}],
... metadata={"title": "La Republique", "author": "Platon"},
... language="fr",
... )
>>> print(f"Ingested {result['count']} chunks")
Connection:
The module connects to a local Weaviate instance using:
- HTTP port: 8080
- gRPC port: 50051
Ensure Weaviate is running via: docker-compose up -d
See Also:
- schema.py: Weaviate schema definitions
- pdf_pipeline.py: Document processing pipeline
- flask_app.py: Web interface for search
"""
from __future__ import annotations
import json
import logging
from contextlib import contextmanager
from datetime import datetime, timezone
from typing import Any, Dict, Generator, List, Optional, TypedDict
import weaviate
from weaviate import WeaviateClient
from weaviate.collections import Collection
import weaviate.classes.query as wvq
# Import type definitions from central types module
from utils.types import WeaviateIngestResult as IngestResult
# Import TOC enrichment functions
from .toc_enricher import enrich_chunks_with_toc
# =============================================================================
# Type Definitions (module-specific, not exported to utils.types)
# =============================================================================
class SummaryObject(TypedDict):
    """Weaviate Summary object structure for section summaries.

    This TypedDict defines the structure of Summary objects stored in Weaviate.
    Summaries are vectorized and can be searched semantically.

    Attributes:
        sectionPath: Full hierarchical path (e.g., "Livre I > Chapitre 2").
        title: Section title.
        level: Hierarchy level (1 = top level, 2 = subsection, etc.).
        text: Summary text content (vectorized for search).
        concepts: List of key concepts extracted from the section.
        chunksCount: Number of chunks in this section.
        document: Nested object with document reference (sourceId).
    """
    sectionPath: str  # e.g. "Livre I > Chapitre 2"
    title: str
    level: int  # 1 = top level
    text: str  # Vectorized for semantic search
    concepts: List[str]
    chunksCount: int
    document: Dict[str, str]  # Nested document reference, e.g. {"sourceId": ...}
class ChunkObject(TypedDict, total=False):
    """Weaviate Chunk object structure for text chunks.

    This TypedDict defines the structure of Chunk objects stored in Weaviate.
    The text and keywords fields are vectorized for semantic search.

    Attributes:
        text: Chunk text content (vectorized for search).
        sectionPath: Full hierarchical path (e.g., "Livre I > Chapitre 2").
        sectionLevel: Hierarchy level (1 = top level).
        chapterTitle: Title of the containing chapter.
        canonicalReference: Canonical academic reference (e.g., "CP 1.628", "Ménon 80a").
        unitType: Type of argumentative unit (main_content, exposition, etc.).
        keywords: List of keywords/concepts (vectorized for search).
        language: Language code (e.g., "fr", "en").
        orderIndex: Position in document for ordering.
        work: Nested object with work metadata (title, author).
        document: Nested object with document reference (sourceId, edition).

    Note:
        Uses total=False because some fields are optional during creation.
    """
    text: str  # Vectorized for semantic search
    sectionPath: str  # e.g. "Livre I > Chapitre 2"
    sectionLevel: int  # 1 = top level
    chapterTitle: str
    canonicalReference: str  # e.g. "CP 1.628", "Ménon 80a"
    unitType: str  # e.g. main_content, exposition, ...
    keywords: List[str]  # Vectorized for semantic search
    language: str  # e.g. "fr", "en"
    orderIndex: int  # Position within the document, used for ordering
    work: Dict[str, str]  # Nested work metadata (title, author)
    document: Dict[str, str]  # Nested document reference (sourceId, edition)
class InsertedChunkSummary(TypedDict):
    """Summary of an inserted chunk for display purposes.

    This TypedDict provides a preview of inserted chunks, useful for
    displaying ingestion results to users.

    Attributes:
        chunk_id: Generated chunk identifier.
        sectionPath: Hierarchical path of the chunk.
        work: Title of the work.
        author: Author name.
        text_preview: First 150 characters of chunk text.
        unitType: Type of argumentative unit.
    """
    chunk_id: str  # Generated chunk identifier
    sectionPath: str
    work: str  # Work title (not the nested work object)
    author: str
    text_preview: str  # First 150 characters of the chunk text
    unitType: str
# Note: IngestResult is imported from utils.types as WeaviateIngestResult
class DeleteResult(TypedDict, total=False):
    """Result from document deletion operation.

    This TypedDict contains the result of a deletion operation,
    including counts of deleted objects from each collection.

    Attributes:
        success: Whether deletion succeeded.
        error: Error message if deletion failed.
        deleted_chunks: Number of chunks deleted from Chunk collection.
        deleted_summaries: Number of summaries deleted from Summary collection.
        deleted_document: Whether the Document object was deleted.

    Example:
        >>> result = delete_document_chunks("platon_republique")
        >>> print(f"Deleted {result['deleted_chunks']} chunks")
    """
    success: bool
    error: str  # Present only on failure (total=False allows omission)
    deleted_chunks: int
    deleted_summaries: int
    deleted_document: bool  # True if the Document object itself was removed
class DocumentStats(TypedDict, total=False):
    """Document statistics from Weaviate.

    This TypedDict contains statistics about a document stored in Weaviate,
    retrieved by querying the Chunk collection.

    Attributes:
        success: Whether stats retrieval succeeded.
        error: Error message if retrieval failed.
        sourceId: Document identifier.
        chunks_count: Total number of chunks for this document.
        work: Title of the work (from first chunk).
        author: Author name (from first chunk).

    Example:
        >>> stats = get_document_stats("platon_republique")
        >>> print(f"Document has {stats['chunks_count']} chunks")
    """
    success: bool
    error: str  # Present only on failure (total=False allows omission)
    sourceId: str  # Document identifier
    chunks_count: int
    work: Optional[str]  # Taken from the first chunk, if any
    author: Optional[str]  # Taken from the first chunk, if any
# Logger
logger: logging.Logger = logging.getLogger(__name__)
@contextmanager
def get_weaviate_client() -> Generator[Optional[WeaviateClient], None, None]:
    """Context manager for a Weaviate connection with automatic cleanup.

    Connects to the local Weaviate instance and guarantees the client is
    closed when the context exits. Connection errors are handled
    gracefully by yielding None instead of raising. Exceptions raised by
    the caller's ``with`` body, however, propagate unchanged — they are
    not swallowed — and the connection is still closed.

    Yields:
        Connected WeaviateClient instance, or None if connection failed.

    Example:
        >>> with get_weaviate_client() as client:
        ...     if client is not None:
        ...         chunks = client.collections.get("Chunk")
        ...         # Perform operations...
        ...     else:
        ...         print("Connection failed")

    Note:
        Connects to localhost:8080 (HTTP) and localhost:50051 (gRPC).
        Ensure Weaviate is running via docker-compose up -d.
    """
    from weaviate.classes.init import AdditionalConfig, Timeout
    client: Optional[WeaviateClient] = None
    try:
        # Increased timeout for long text vectorization (e.g., Peirce CP 3.403,
        # CP 8.388, Menon chunk 10). Default is 60s; raised to 600s (10 min)
        # for exceptionally large texts.
        client = weaviate.connect_to_local(
            host="localhost",
            port=8080,
            grpc_port=50051,
            additional_config=AdditionalConfig(
                timeout=Timeout(init=30, query=600, insert=600)  # 10 min for insert/query
            )
        )
    except Exception as e:
        # Only connection setup is guarded. The previous version also wrapped
        # the yield in this except, so an exception raised inside the caller's
        # `with` body was thrown back into the generator, caught here, and
        # answered with a second `yield None` — contextlib then masked the
        # real error with RuntimeError("generator didn't stop after throw()").
        logger.error(f"Erreur connexion Weaviate: {e}")
        yield None
        return
    try:
        yield client
    finally:
        client.close()
def ingest_document_metadata(
    client: WeaviateClient,
    doc_name: str,
    metadata: Dict[str, Any],
    toc: List[Dict[str, Any]],
    hierarchy: Dict[str, Any],
    chunks_count: int,
    pages: int,
) -> Optional[str]:
    """Store document-level metadata in the Document collection.

    Builds a single Document object — title, author, JSON-serialized TOC
    and hierarchy, page/chunk counters, language and a UTC creation
    timestamp — and inserts it.

    Args:
        client: Active Weaviate client connection.
        doc_name: Unique document identifier (sourceId).
        metadata: Extracted metadata; the "title", "author" and
            "language" keys are read, with fallbacks when absent.
        toc: Hierarchical table of contents (stored as a JSON string).
        hierarchy: Complete document hierarchy (stored as a JSON string).
        chunks_count: Total number of chunks in the document.
        pages: Number of pages in the source PDF.

    Returns:
        UUID of the created Document object as a string, or None when
        the collection is missing or the insertion fails.
    """
    try:
        collection: Collection[Any, Any] = client.collections.get("Document")
    except Exception as e:
        logger.warning(f"Collection Document non trouvée: {e}")
        return None
    try:
        # Payload construction stays inside the guard: json.dumps can raise
        # on non-serializable TOC/hierarchy content.
        payload: Dict[str, Any] = {
            "sourceId": doc_name,
            "title": metadata.get("title") or doc_name,
            "author": metadata.get("author") or "Inconnu",
            "toc": json.dumps(toc, ensure_ascii=False) if toc else "[]",
            "hierarchy": json.dumps(hierarchy, ensure_ascii=False) if hierarchy else "{}",
            "pages": pages,
            "chunksCount": chunks_count,
            "language": metadata.get("language", "fr"),
            "createdAt": datetime.now(timezone.utc).isoformat(),
        }
        inserted_uuid = collection.data.insert(payload)
        logger.info(f"Document metadata ingéré: {doc_name}")
        return str(inserted_uuid)
    except Exception as e:
        logger.warning(f"Erreur ingestion document metadata: {e}")
        return None
def ingest_summaries(
    client: WeaviateClient,
    doc_name: str,
    toc: List[Dict[str, Any]],
    summaries_content: Dict[str, str],
) -> int:
    """Store one Summary object per table-of-contents entry.

    Walks the (possibly nested) TOC depth-first — each parent is emitted
    before its children — building a Summary object per entry. The
    summary text is looked up in ``summaries_content`` by section title;
    entries with no provided text fall back to the title itself.
    Objects are inserted in batches to avoid request timeouts.

    Args:
        client: Active Weaviate client connection.
        doc_name: Document identifier stored on each summary.
        toc: Hierarchical table of contents.
        summaries_content: Mapping of section title to summary text.

    Returns:
        Number of summaries actually inserted (a failed batch is logged
        and skipped, not retried).
    """
    try:
        summary_collection: Collection[Any, Any] = client.collections.get("Summary")
    except Exception as e:
        logger.warning(f"Collection Summary non trouvée: {e}")
        return 0
    pending: List[SummaryObject] = []

    def walk(entries: List[Dict[str, Any]], prefix: str = "") -> None:
        # Depth-first pre-order traversal of the TOC tree.
        for entry in entries:
            entry_title: str = entry.get("title", "")
            full_path: str = f"{prefix} > {entry_title}" if prefix else entry_title
            pending.append({
                "sectionPath": full_path,
                "title": entry_title,
                "level": entry.get("level", 1),
                "text": summaries_content.get(entry_title, entry_title),
                "concepts": entry.get("concepts", []),
                "chunksCount": 0,
                "document": {
                    "sourceId": doc_name,
                },
            })
            if "children" in entry:
                walk(entry["children"], full_path)

    walk(toc)
    if not pending:
        return 0
    # Insert in small batches to avoid vectorization timeouts.
    BATCH_SIZE = 50
    total_inserted = 0
    try:
        logger.info(f"Ingesting {len(pending)} summaries in batches of {BATCH_SIZE}...")
        for batch_number, start in enumerate(range(0, len(pending), BATCH_SIZE), start=1):
            batch = pending[start:start + BATCH_SIZE]
            try:
                summary_collection.data.insert_many(batch)
            except Exception as batch_error:
                logger.warning(f" Batch {batch_number} failed: {batch_error}")
                continue
            total_inserted += len(batch)
            logger.info(f" Batch {batch_number}: Inserted {len(batch)} summaries ({total_inserted}/{len(pending)})")
        logger.info(f"{total_inserted} résumés ingérés pour {doc_name}")
        return total_inserted
    except Exception as e:
        logger.warning(f"Erreur ingestion résumés: {e}")
        return 0
def ingest_document(
    doc_name: str,
    chunks: List[Dict[str, Any]],
    metadata: Dict[str, Any],
    language: str = "fr",
    toc: Optional[List[Dict[str, Any]]] = None,
    hierarchy: Optional[Dict[str, Any]] = None,
    pages: int = 0,
    ingest_document_collection: bool = True,
    ingest_summary_collection: bool = False,
) -> IngestResult:
    """Ingest document chunks into Weaviate with nested objects.

    Main ingestion function that inserts chunks into the Chunk collection
    with nested Work and Document references. Optionally also creates
    entries in the Document and Summary collections.

    This function uses batch insertion for optimal performance and
    constructs proper nested objects for filtering capabilities.

    Args:
        doc_name: Unique document identifier (used as sourceId).
        chunks: List of chunk dicts, each containing at minimum:
            - text: The chunk text content
            - section (optional): Section path string
            - hierarchy (optional): Dict with part/chapter/section
            - type (optional): Argumentative unit type
            - concepts/keywords (optional): List of keywords
        metadata: Document metadata dict with keys:
            - title: Work title
            - author: Author name
            - edition (optional): Edition identifier
        language: ISO language code. Defaults to "fr".
        toc: Optional table of contents for Document/Summary collections.
        hierarchy: Optional complete document hierarchy structure.
        pages: Number of pages in source document. Defaults to 0.
        ingest_document_collection: If True, also insert into Document
            collection. Defaults to True.
        ingest_summary_collection: If True, also insert into Summary
            collection (requires toc). Defaults to False.

    Returns:
        IngestResult dict containing:
            - success: True if ingestion succeeded
            - count: Number of chunks inserted
            - inserted: Preview of first 10 inserted chunks
            - work: Work title
            - author: Author name
            - document_uuid: UUID of Document object (if created)
            - all_objects: Complete list of inserted ChunkObjects
            - error: Error message (if failed)

    Raises:
        No exceptions are raised; errors are returned in the result dict.

    Example:
        >>> result = ingest_document(
        ...     doc_name="platon_republique",
        ...     chunks=[{"text": "La justice est...", "section": "Livre I"}],
        ...     metadata={"title": "La Republique", "author": "Platon"},
        ...     language="fr",
        ...     pages=450,
        ... )
        >>> if result["success"]:
        ...     print(f"Ingested {result['count']} chunks")

    Note:
        Empty chunks (no text or whitespace-only) are automatically skipped.
        The function logs progress and errors using the module logger.
    """
    try:
        with get_weaviate_client() as client:
            if client is None:
                return IngestResult(
                    success=False,
                    error="Connexion Weaviate impossible",
                    inserted=[],
                )
            # Fail fast if the Chunk collection is absent from the schema.
            try:
                chunk_collection: Collection[Any, Any] = client.collections.get("Chunk")
            except Exception as e:
                return IngestResult(
                    success=False,
                    error=f"Collection Chunk non trouvée: {e}",
                    inserted=[],
                )
            # Optionally store document-level metadata (Document collection).
            doc_uuid: Optional[str] = None
            if ingest_document_collection:
                doc_uuid = ingest_document_metadata(
                    client, doc_name, metadata, toc or [], hierarchy or {},
                    len(chunks), pages
                )
            # Optionally store section summaries (Summary collection).
            if ingest_summary_collection and toc:
                ingest_summaries(client, doc_name, toc, {})
            # Enrich chunks with TOC metadata when both TOC and hierarchy exist.
            if toc and hierarchy:
                logger.info(f"Enriching {len(chunks)} chunks with TOC metadata...")
                chunks = enrich_chunks_with_toc(chunks, toc, hierarchy)
            else:
                logger.info("No TOC/hierarchy available, using basic metadata")
            # Build the Chunk objects to insert, with nested Work/Document refs.
            objects_to_insert: List[ChunkObject] = []
            title: str = metadata.get("title") or metadata.get("work") or doc_name
            author: str = metadata.get("author") or "Inconnu"
            edition: str = metadata.get("edition", "")
            for idx, chunk in enumerate(chunks):
                # Skip chunks with empty or whitespace-only text.
                text: str = chunk.get("text", "")
                if not text or not text.strip():
                    continue
                # Prefer the enriched sectionPath; otherwise fall back to the
                # legacy "section" key, then rebuild from the hierarchy dict.
                section_path: str = chunk.get("sectionPath", "")
                if not section_path:
                    section_path = chunk.get("section", "")
                if not section_path:
                    chunk_hierarchy: Dict[str, Any] = chunk.get("hierarchy", {})
                    section_parts: List[str] = []
                    if chunk_hierarchy.get("part"):
                        section_parts.append(chunk_hierarchy["part"])
                    if chunk_hierarchy.get("chapter"):
                        section_parts.append(chunk_hierarchy["chapter"])
                    if chunk_hierarchy.get("section"):
                        section_parts.append(chunk_hierarchy["section"])
                    section_path = " > ".join(section_parts) if section_parts else chunk.get("title", f"Section {idx}")
                # Prefer the enriched chapter title (either key spelling).
                chapter_title: str = chunk.get("chapterTitle", chunk.get("chapter_title", ""))
                # Prefer the enriched canonical reference when available.
                canonical_ref: str = chunk.get("canonicalReference", "")
                # Assemble the Chunk object with nested work/document objects.
                chunk_obj: ChunkObject = {
                    "text": text,
                    "sectionPath": section_path,
                    "sectionLevel": chunk.get("section_level", chunk.get("level", 1)),
                    "chapterTitle": chapter_title,
                    "canonicalReference": canonical_ref,
                    "unitType": chunk.get("type", "main_content"),
                    "keywords": chunk.get("concepts", chunk.get("keywords", [])),
                    "language": language,
                    "orderIndex": idx,
                    "work": {
                        "title": title,
                        "author": author,
                    },
                    "document": {
                        "sourceId": doc_name,
                        "edition": edition,
                    },
                }
                objects_to_insert.append(chunk_obj)
            if not objects_to_insert:
                return IngestResult(
                    success=True,
                    message="Aucun chunk à insérer",
                    inserted=[],
                    count=0,
                )
            # Insert in small batches to avoid vectorization timeouts.
            BATCH_SIZE = 50  # Process 50 chunks at a time
            total_inserted = 0
            logger.info(f"Ingesting {len(objects_to_insert)} chunks in batches of {BATCH_SIZE}...")
            for batch_start in range(0, len(objects_to_insert), BATCH_SIZE):
                batch_end = min(batch_start + BATCH_SIZE, len(objects_to_insert))
                batch = objects_to_insert[batch_start:batch_end]
                try:
                    _response = chunk_collection.data.insert_many(objects=batch)
                    total_inserted += len(batch)
                    logger.info(f" Batch {batch_start//BATCH_SIZE + 1}: Inserted {len(batch)} chunks ({total_inserted}/{len(objects_to_insert)})")
                except Exception as batch_error:
                    logger.error(f" Batch {batch_start//BATCH_SIZE + 1} failed: {batch_error}")
                    # Continue with next batch instead of failing completely
                    continue
            # Build a short preview (first 10 objects) for display to the caller.
            inserted_summary: List[InsertedChunkSummary] = []
            for i, obj in enumerate(objects_to_insert[:10]):
                text_content: str = obj.get("text", "")
                work_obj: Dict[str, str] = obj.get("work", {})
                inserted_summary.append(InsertedChunkSummary(
                    chunk_id=f"chunk_{i:05d}",
                    sectionPath=obj.get("sectionPath", ""),
                    work=work_obj.get("title", ""),
                    author=work_obj.get("author", ""),
                    text_preview=text_content[:150] + "..." if len(text_content) > 150 else text_content,
                    unitType=obj.get("unitType", ""),
                ))
            logger.info(f"Ingestion réussie: {total_inserted} chunks insérés pour {doc_name}")
            return IngestResult(
                success=True,
                count=total_inserted,
                inserted=inserted_summary,
                work=title,
                author=author,
                document_uuid=doc_uuid,
                all_objects=objects_to_insert,
            )
    except Exception as e:
        logger.error(f"Erreur ingestion: {e}")
        return IngestResult(
            success=False,
            error=str(e),
            inserted=[],
        )
def delete_document_chunks(doc_name: str) -> DeleteResult:
    """Remove every trace of a document from Weaviate.

    Deletes, in order: the document's chunks (Chunk collection), its
    section summaries (Summary collection), and finally the Document
    metadata object itself. Each collection is handled independently — a
    failure on one is logged as a warning and does not stop the others.
    Useful for re-processing a document after pipeline changes or for
    cleaning up test data.

    Args:
        doc_name: Document identifier (sourceId) whose data is removed.

    Returns:
        DeleteResult with per-collection deletion counters. ``success``
        is True even when nothing matched; it is False only when the
        connection failed or an unexpected error occurred.

    Example:
        >>> result = delete_document_chunks("platon_republique")
        >>> if result["success"]:
        ...     print(f"Deleted {result['deleted_chunks']} chunks")
        ...     # Now safe to re-ingest
        ...     ingest_document("platon_republique", new_chunks, metadata)
    """
    try:
        with get_weaviate_client() as client:
            if client is None:
                return DeleteResult(success=False, error="Connexion Weaviate impossible")

            def purge_by_source(collection_name: str, label: str) -> int:
                # Chunk and Summary both reference the parent document via the
                # nested document.sourceId property.
                try:
                    coll: Collection[Any, Any] = client.collections.get(collection_name)
                    outcome = coll.data.delete_many(
                        where=wvq.Filter.by_property("document.sourceId").equal(doc_name)
                    )
                    return outcome.successful
                except Exception as e:
                    logger.warning(f"Erreur suppression {label}: {e}")
                    return 0

            chunk_count: int = purge_by_source("Chunk", "chunks")
            summary_count: int = purge_by_source("Summary", "summaries")
            # The Document object carries sourceId as a top-level property.
            document_removed: bool = False
            try:
                doc_collection: Collection[Any, Any] = client.collections.get("Document")
                doc_outcome = doc_collection.data.delete_many(
                    where=wvq.Filter.by_property("sourceId").equal(doc_name)
                )
                document_removed = doc_outcome.successful > 0
            except Exception as e:
                logger.warning(f"Erreur suppression document: {e}")
            logger.info(f"Suppression: {chunk_count} chunks, {summary_count} summaries pour {doc_name}")
            return DeleteResult(
                success=True,
                deleted_chunks=chunk_count,
                deleted_summaries=summary_count,
                deleted_document=document_removed,
            )
    except Exception as e:
        logger.error(f"Erreur suppression: {e}")
        return DeleteResult(success=False, error=str(e))
def get_document_stats(doc_name: str) -> DocumentStats:
    """Retrieve statistics for a document from Weaviate.

    Counts the document's chunks with a server-side aggregation (accurate
    for documents of any size) and reads work/author metadata from a
    single sample chunk.

    Args:
        doc_name: Document identifier (sourceId) to query.

    Returns:
        DocumentStats dict containing:
            - success: True if query succeeded
            - sourceId: The queried document identifier
            - chunks_count: Number of chunks found
            - work: Work title (from first chunk, if any)
            - author: Author name (from first chunk, if any)
            - error: Error message (if failed)

    Example:
        >>> stats = get_document_stats("platon_republique")
        >>> if stats["success"]:
        ...     print(f"Document: {stats['work']} by {stats['author']}")
        ...     print(f"Chunks: {stats['chunks_count']}")

    Note:
        If the aggregate query is unavailable, falls back to fetching
        objects (capped at 1000, as in the previous implementation).
    """
    try:
        with get_weaviate_client() as client:
            if client is None:
                return DocumentStats(success=False, error="Connexion Weaviate impossible")
            chunk_collection: Collection[Any, Any] = client.collections.get("Chunk")
            source_filter = wvq.Filter.by_property("document.sourceId").equal(doc_name)
            # Prefer a server-side count: the previous fetch-based approach
            # silently capped chunks_count at its limit of 1000.
            chunks_count: int
            try:
                agg = chunk_collection.aggregate.over_all(
                    filters=source_filter,
                    total_count=True,
                )
                chunks_count = int(agg.total_count or 0)
            except Exception as agg_error:
                logger.warning(f"Aggregate count failed, falling back to fetch: {agg_error}")
                fetched = chunk_collection.query.fetch_objects(
                    filters=source_filter,
                    limit=1000,
                )
                chunks_count = len(fetched.objects)
            # A single chunk is enough to read the nested work metadata.
            work: Optional[str] = None
            author: Optional[str] = None
            sample = chunk_collection.query.fetch_objects(
                filters=source_filter,
                limit=1,
            )
            if sample.objects:
                first: Dict[str, Any] = sample.objects[0].properties
                work_obj: Any = first.get("work", {})
                work = work_obj.get("title") if isinstance(work_obj, dict) else None
                author = work_obj.get("author") if isinstance(work_obj, dict) else None
            return DocumentStats(
                success=True,
                sourceId=doc_name,
                chunks_count=chunks_count,
                work=work,
                author=author,
            )
    except Exception as e:
        logger.error(f"Erreur stats document: {e}")
        return DocumentStats(success=False, error=str(e))