- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
421 lines
14 KiB
Python
421 lines
14 KiB
Python
"""LLM-based Table of Contents (TOC) extraction module.
|
|
|
|
This module provides functionality to extract hierarchical table of contents
|
|
from markdown documents using Large Language Models. It intelligently parses
|
|
document structure and creates both hierarchical and flat representations
|
|
of the TOC.
|
|
|
|
Key Features:
|
|
- Hierarchical TOC extraction with chapters, sections, and subsections
|
|
- Flat TOC generation with full paths for navigation
|
|
- Content-to-TOC matching for associating sections with TOC entries
|
|
- Support for multiple LLM providers (Ollama local, Mistral API)
|
|
|
|
TOC Structure Levels:
|
|
- Level 1: Introduction, main chapters, Conclusion, Bibliography
|
|
- Level 2: Sections listed under a chapter (same visual level)
|
|
- Level 3: Only if explicit indentation or subsection visible
|
|
|
|
Typical Usage:
|
|
>>> from utils.llm_toc import extract_toc
|
|
>>> result = extract_toc(
|
|
... markdown=document_text,
|
|
... document_title="The Republic",
|
|
... provider="ollama"
|
|
... )
|
|
>>> print(result["toc"]) # Hierarchical structure
|
|
[
|
|
{
|
|
"title": "Introduction",
|
|
"level": 1,
|
|
"children": []
|
|
},
|
|
{
|
|
"title": "Book I: Justice",
|
|
"level": 1,
|
|
"chapter_number": 1,
|
|
"children": [
|
|
{"title": "The Nature of Justice", "level": 2, "children": []}
|
|
]
|
|
}
|
|
]
|
|
>>> print(result["flat_toc"]) # Flat list with paths
|
|
[
|
|
{"title": "Introduction", "level": 1, "path": "Introduction"},
|
|
{"title": "Book I: Justice", "level": 1, "path": "Book I: Justice"},
|
|
{
|
|
"title": "The Nature of Justice",
|
|
"level": 2,
|
|
"path": "Book I: Justice > The Nature of Justice"
|
|
}
|
|
]
|
|
|
|
LLM Provider Options:
|
|
- "ollama": Local processing, free but slower
|
|
- "mistral": Cloud API, faster but incurs costs
|
|
|
|
Note:
|
|
For documents without a clear TOC (short articles, book reviews),
|
|
the module returns an empty TOC list rather than inventing structure.
|
|
|
|
See Also:
|
|
- llm_metadata: Document metadata extraction
|
|
- llm_classifier: Section classification
|
|
- toc_extractor: Non-LLM TOC extraction alternatives
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
from typing import cast, Any, Dict, List, Optional
|
|
|
|
from .llm_structurer import (
|
|
_clean_json_string,
|
|
_get_default_mistral_model,
|
|
_get_default_model,
|
|
call_llm,
|
|
)
|
|
from .types import FlatTOCEntry, LLMProvider, TOCEntry, TOCResult
|
|
|
|
# Module-level logger, named after this module's import path (__name__).
logger: logging.Logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _extract_json_from_response(text: str) -> Dict[str, Any]:
    """Extract a JSON object from an LLM response.

    Two strategies are tried, in order:

    1. Decode the text found between ``<JSON>`` and ``</JSON>`` tags.
    2. Decode the span running from the first ``{`` to the last ``}``
       of the whole response.

    Each candidate goes through ``_clean_json_string`` before decoding.
    A candidate that decodes to anything other than a JSON object (a list,
    a string, a number, ...) is rejected, so callers can always call
    ``.get()`` on the returned value.

    Args:
        text: Raw LLM response text that may contain JSON.

    Returns:
        The decoded JSON object, or ``{"toc": []}`` when no strategy
        yields a valid JSON object.
    """
    parsed: Any = None

    # Strategy 1: explicitly tagged payload.
    json_match: Optional[re.Match[str]] = re.search(r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL)
    if json_match:
        try:
            parsed = json.loads(_clean_json_string(json_match.group(1)))
        except json.JSONDecodeError:
            # Malformed tagged payload: deliberately fall through to the
            # brace-based strategy below instead of giving up.
            parsed = None
        if isinstance(parsed, dict):
            return parsed

    # Strategy 2: widest brace-delimited span in the raw response.
    start: int = text.find("{")
    end: int = text.rfind("}")
    if start != -1 and end > start:
        try:
            parsed = json.loads(_clean_json_string(text[start:end + 1]))
        except json.JSONDecodeError as e:
            logger.warning(f"JSON invalide: {e}")
            parsed = None
        if isinstance(parsed, dict):
            return parsed

    return {"toc": []}
|
|
|
|
|
|
def extract_toc(
    markdown: str,
    document_title: Optional[str] = None,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.1,
) -> Dict[str, Any]:
    r"""Extract a structured table of contents from a document using an LLM.

    The document (truncated to its first 12,000 characters) is embedded in a
    prompt, the LLM answer is decoded with ``_extract_json_from_response``
    and the resulting hierarchy is flattened with ``_flatten_toc``.

    Args:
        markdown: Complete markdown text of the document to analyze. Empty
            or whitespace-only text yields an empty TOC without any LLM call.
        document_title: Optional document title, added to the prompt as
            context when provided.
        model: LLM model name. If None, the default model for ``provider``
            is used.
        provider: LLM provider; ``"mistral"`` selects the Mistral default
            model, any other value the generic default model.
        temperature: Sampling temperature forwarded to the LLM call.

    Returns:
        A dictionary containing:
            - toc: Hierarchical list of TOC entries as returned by the LLM
              (an empty list if the LLM did not return a list).
            - flat_toc: Flattened list produced by ``_flatten_toc``.
            - error: Error message string (only if extraction failed).

    Raises:
        No exception is raised for failures of the LLM interaction; they
        are logged and reported through the ``error`` key.

    Example:
        >>> result = extract_toc(
        ...     markdown="# Introduction\n...\n# Chapter 1\n## Section 1.1",
        ...     document_title="My Book",
        ...     provider="ollama",
        ... )
        >>> sorted(result)[:2]
        ['flat_toc', 'toc']
    """
    # Nothing to analyze: an LLM call could only echo the prompt examples,
    # which is exactly what the prompt tries to prevent.
    if not markdown or not markdown.strip():
        return {"toc": [], "flat_toc": []}

    # Truncate overly long documents: only the head of the text is kept and
    # a marker tells the LLM that the document continues.
    max_chars: int = 12000
    content: str = markdown[:max_chars]
    if len(markdown) > max_chars:
        content += "\n\n[... suite du document ...]"

    title_context: str = f"Titre du document: {document_title}\n" if document_title else ""

    prompt: str = f"""Tu es un expert en structuration de documents académiques.

TÂCHE: Extraire la table des matières FIDÈLE au document fourni.

{title_context}
⚠️ RÈGLES CRITIQUES:

1. **ANALYSER LE DOCUMENT RÉEL** - Ne JAMAIS copier les exemples ci-dessous!
2. **DOCUMENTS SANS TOC** - Si le document est un article court, une revue de livre, ou n'a pas de table des matières explicite, retourner {{"toc": []}}
3. **RESPECTER LA STRUCTURE PLATE** - Ne pas inventer de hiérarchie entre des lignes au même niveau
4. **IGNORER** - Métadonnées éditoriales (DOI, ISBN, éditeur, copyright, numéros de page)

NIVEAUX DE STRUCTURE:
- level 1: Introduction, Chapitres principaux, Conclusion, Bibliographie
- level 2: Sections listées sous un chapitre (même niveau visuel)
- level 3: UNIQUEMENT si indentation ou sous-titre explicite visible

FORMAT DE RÉPONSE (JSON entre balises <JSON></JSON>):

Pour un livre avec TOC:
<JSON>
{{
"toc": [
{{
"title": "Titre Chapitre 1",
"level": 1,
"chapter_number": 1,
"children": [
{{"title": "Section 1.1", "level": 2, "children": []}},
{{"title": "Section 1.2", "level": 2, "children": []}}
]
}}
]
}}
</JSON>

Pour un article SANS TOC (revue de livre, article court, etc.):
<JSON>
{{
"toc": []
}}
</JSON>

⚠️ NE PAS COPIER CES EXEMPLES ! Analyser uniquement le DOCUMENT RÉEL ci-dessous.

DOCUMENT À ANALYSER:
{content}

Réponds UNIQUEMENT avec le JSON correspondant à CE document (pas aux exemples)."""

    try:
        # Resolved inside the try so that a failure here is reported through
        # the "error" key like any other failure.
        if model is None:
            model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()

        logger.info(f"Extraction TOC via {provider.upper()} ({model})")

        response: str = call_llm(prompt, model=model, provider=provider, temperature=temperature, timeout=360)
        result: Dict[str, Any] = _extract_json_from_response(response)

        toc: Any = result.get("toc", [])
        if not isinstance(toc, list):
            # e.g. {"toc": null}: treat as "no table of contents" rather than
            # failing later with an opaque TypeError.
            logger.warning(f"TOC invalide: liste attendue, {type(toc).__name__} reçu")
            toc = []

        # Build the flat version of the TOC.
        flat_toc: List[Dict[str, Any]] = _flatten_toc(toc)

        logger.info(f"TOC extraite: {len(toc)} entrées niveau 1, {len(flat_toc)} entrées totales")

        return {
            "toc": toc,
            "flat_toc": flat_toc,
        }

    except Exception as e:
        logger.error(f"Erreur extraction TOC: {e}")
        return {
            "toc": [],
            "flat_toc": [],
            "error": str(e),
        }
|
|
|
|
|
|
def _flatten_toc(
    toc: List[Dict[str, Any]],
    parent_path: str = "",
    result: Optional[List[Dict[str, Any]]] = None
) -> List[Dict[str, Any]]:
    """Flatten a hierarchical TOC into a list with navigation paths.

    Entries are visited depth-first in document order: each entry is
    emitted before its children, and children before the next sibling.

    The hierarchy usually comes straight from LLM-generated JSON, so
    malformed nodes are tolerated instead of aborting the whole traversal:
    anything that is not a dict is skipped, a ``children`` value that is
    not a list/tuple is treated as "no children", and a ``None`` title is
    replaced by an empty string.

    Args:
        toc: Hierarchical TOC list with nested ``children``. A value that
            is not a list or tuple is treated as empty.
        parent_path: Path accumulated from parent entries. Used
            internally during recursion.
        result: Accumulator list the entries are appended to. Used
            internally during recursion; a new list is created when None.

    Returns:
        The accumulator list. Each appended entry contains:
            - title: The section title ("" when missing).
            - level: Hierarchy level (1 when missing).
            - path: Titles from the root joined with " > ".
            - chapter_number: The entry's chapter number, or None.

    Example:
        >>> flat = _flatten_toc([
        ...     {
        ...         "title": "Chapter 1",
        ...         "level": 1,
        ...         "children": [
        ...             {"title": "Section 1.1", "level": 2, "children": []}
        ...         ]
        ...     }
        ... ])
        >>> flat[0]["path"]
        'Chapter 1'
        >>> flat[1]["path"]
        'Chapter 1 > Section 1.1'
    """
    if result is None:
        result = []

    # null, a bare string, a dict...: nothing that can be walked as a list
    # of entries.
    if not isinstance(toc, (list, tuple)):
        return result

    for item in toc:
        if not isinstance(item, dict):
            # e.g. a child given as a plain string or null by the LLM.
            continue

        raw_title: Any = item.get("title", "")
        title: str = "" if raw_title is None else raw_title
        level: int = item.get("level", 1)

        # Build the path from the root down to this entry.
        path: str = f"{parent_path} > {title}" if parent_path else title

        result.append({
            "title": title,
            "level": level,
            "path": path,
            "chapter_number": item.get("chapter_number"),
        })

        # Recurse into the children; invalid or empty values are handled by
        # the type guard at the top of the call.
        _flatten_toc(item.get("children"), path, result)

    return result
|
|
|
|
|
|
def match_content_to_toc(
    content_sections: List[Dict[str, Any]],
    flat_toc: List[Dict[str, Any]],
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
) -> List[Dict[str, Any]]:
    """Match content sections to TOC entries using an LLM.

    The titles of the TOC entries and of the first 30 content sections are
    sent to the LLM, which answers with one TOC index per section (-1 for
    "no match"). The answer is applied in place to ``content_sections``.

    Args:
        content_sections: Content sections extracted from the document.
            Each section is a dict, normally with a "title" key (a missing
            title is sent as an empty string).
        flat_toc: Flat TOC list as returned by extract_toc()["flat_toc"].
            Entries without a "title" key are sent as empty strings.
        model: LLM model name. If None, the default model for
            ``provider`` is used.
        provider: LLM provider; ``"mistral"`` selects the Mistral default
            model, any other value the generic default model.

    Returns:
        The same ``content_sections`` list, each section carrying a
        "toc_match" key whose value is either the matched ``flat_toc``
        entry or None.

    Note:
        - If either input is empty no LLM call is made and every section
          gets ``toc_match = None``.
        - Only the first 30 sections are submitted to the LLM; later
          sections always get ``toc_match = None``.
        - Match values that are not integers within the bounds of
          ``flat_toc`` are treated as "no match".
        - If the LLM call fails, the error is logged and sections that do
          not yet have a "toc_match" key receive None.
    """
    max_sections: int = 30  # cap on the sections submitted, to limit cost

    # Nothing to match against (or nothing to match): same outcome as a
    # successful run, without paying for an LLM call.
    if not content_sections or not flat_toc:
        for section in content_sections or []:
            section["toc_match"] = None
        return content_sections

    try:
        if model is None:
            model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()

        # Prepare the data for the prompt.
        toc_titles: List[str] = [entry.get("title", "") for entry in flat_toc]
        section_titles: List[str] = [s.get("title", "") for s in content_sections[:max_sections]]

        prompt: str = f"""Tu dois associer les sections de contenu aux entrées de la table des matières.

TABLE DES MATIÈRES:
{json.dumps(toc_titles, ensure_ascii=False, indent=2)}

SECTIONS DE CONTENU:
{json.dumps(section_titles, ensure_ascii=False, indent=2)}

Pour chaque section de contenu, indique l'index (0-based) de l'entrée TOC correspondante.
Si pas de correspondance, indique -1.

RÉPONDS avec un JSON:
<JSON>
{{
"matches": [0, 1, 2, -1, 3, ...]
}}
</JSON>
"""

        response: str = call_llm(prompt, model=model, provider=provider, temperature=0.1)
        result: Dict[str, Any] = _extract_json_from_response(response)

        matches: Any = result.get("matches", [])
        if not isinstance(matches, list):
            matches = []
        # Indices beyond the submitted sections cannot be genuine answers.
        usable: int = min(len(matches), max_sections)

        # Apply the matches.
        for i, section in enumerate(content_sections):
            index: Any = matches[i] if i < usable else -1
            is_valid: bool = (
                isinstance(index, int)
                and not isinstance(index, bool)
                and 0 <= index < len(flat_toc)
            )
            section["toc_match"] = flat_toc[index] if is_valid else None

        return content_sections

    except Exception as e:
        logger.warning(f"Erreur correspondance TOC: {e}")
        # Honour the contract even on failure: every section exposes a
        # "toc_match" key. Existing values are left untouched.
        for section in content_sections:
            if isinstance(section, dict):
                section.setdefault("toc_match", None)
        return content_sections
|