"""LLM-based Table of Contents (TOC) extraction module.

This module provides functionality to extract hierarchical table of contents
from markdown documents using Large Language Models. It intelligently parses
document structure and creates both hierarchical and flat representations of
the TOC.

Key Features:
    - Hierarchical TOC extraction with chapters, sections, and subsections
    - Flat TOC generation with full paths for navigation
    - Content-to-TOC matching for associating sections with TOC entries
    - Support for multiple LLM providers (Ollama local, Mistral API)

TOC Structure Levels:
    - Level 1: Introduction, main chapters, Conclusion, Bibliography
    - Level 2: Sections listed under a chapter (same visual level)
    - Level 3: Only if explicit indentation or subsection visible

Typical Usage:
    >>> from utils.llm_toc import extract_toc
    >>> result = extract_toc(
    ...     markdown=document_text,
    ...     document_title="The Republic",
    ...     provider="ollama"
    ... )
    >>> print(result["toc"])  # Hierarchical structure
    [
        {
            "title": "Introduction",
            "level": 1,
            "children": []
        },
        {
            "title": "Book I: Justice",
            "level": 1,
            "chapter_number": 1,
            "children": [
                {"title": "The Nature of Justice", "level": 2, "children": []}
            ]
        }
    ]
    >>> print(result["flat_toc"])  # Flat list with paths
    [
        {"title": "Introduction", "level": 1, "path": "Introduction"},
        {"title": "Book I: Justice", "level": 1, "path": "Book I: Justice"},
        {
            "title": "The Nature of Justice",
            "level": 2,
            "path": "Book I: Justice > The Nature of Justice"
        }
    ]

LLM Provider Options:
    - "ollama": Local processing, free but slower
    - "mistral": Cloud API, faster but incurs costs

Note:
    For documents without a clear TOC (short articles, book reviews),
    the module returns an empty TOC list rather than inventing structure.

See Also:
    - llm_metadata: Document metadata extraction
    - llm_classifier: Section classification
    - toc_extractor: Non-LLM TOC extraction alternatives
"""

import json
import logging
import re
from typing import cast, Any, Dict, List, Optional

from .llm_structurer import (
    _clean_json_string,
    _get_default_mistral_model,
    _get_default_model,
    call_llm,
)
from .types import FlatTOCEntry, LLMProvider, TOCEntry, TOCResult

logger: logging.Logger = logging.getLogger(__name__)

# Payload delimiters requested from the model in the extract_toc() prompt.
# Matched case-insensitively because models do not reliably preserve tag case.
# NOTE(review): the tag name is assumed to be <JSON>; confirm it matches the
# convention used by the other LLM helper modules of this package.
_TAGGED_JSON_RE = re.compile(r"<JSON>\s*(.*?)\s*</JSON>", re.DOTALL | re.IGNORECASE)

# Maximum number of document characters embedded in the TOC extraction prompt.
_MAX_DOCUMENT_CHARS: int = 12000

# Maximum number of content sections submitted to the LLM for TOC matching.
_MAX_MATCHED_SECTIONS: int = 30


def _extract_json_from_response(text: str) -> Dict[str, Any]:
    """Extract a JSON object from an LLM response.

    Two strategies are tried in order:

    1. Parse the payload found between ``<JSON>`` and ``</JSON>`` tags.
    2. Parse the span running from the first ``{`` to the last ``}``.

    Each candidate is passed through ``_clean_json_string`` before parsing.

    Args:
        text: The raw LLM response text that may contain JSON.

    Returns:
        The parsed JSON object. Returns ``{"toc": []}`` when no candidate
        parses to a JSON object.
    """
    tagged: Optional[re.Match[str]] = _TAGGED_JSON_RE.search(text)
    if tagged:
        try:
            parsed: Any = json.loads(_clean_json_string(tagged.group(1)))
        except json.JSONDecodeError:
            # Malformed tagged payload: fall through to the brace scan below.
            parsed = None
        if isinstance(parsed, dict):
            return parsed

    start: int = text.find("{")
    end: int = text.rfind("}")
    if start != -1 and end > start:
        try:
            parsed = json.loads(_clean_json_string(text[start:end + 1]))
        except json.JSONDecodeError as e:
            logger.warning(f"JSON invalide: {e}")
        else:
            # Callers rely on .get(); never hand back a list or a scalar.
            if isinstance(parsed, dict):
                return parsed

    return {"toc": []}


def extract_toc(
    markdown: str,
    document_title: Optional[str] = None,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.1,
) -> Dict[str, Any]:
    r"""Extract a structured table of contents from a document using LLM.

    Analyzes markdown content to identify the document's hierarchical
    structure and generates both a nested TOC (with children) and a flat TOC
    (with navigation paths).

    Args:
        markdown: Complete markdown text of the document to analyze.
        document_title: Optional title of the document for context. It is
            added to the prompt when provided.
        model: LLM model name to use. If None, uses the default model for
            the specified provider.
        provider: LLM provider to use. Either "ollama" for local processing
            or "mistral" for cloud API.
        temperature: Model temperature for response generation. Lower values
            (0.1) produce more consistent results.

    Returns:
        A dictionary containing:
            - toc: Hierarchical list of TOC entries, each with:
                - title: Section title
                - level: Hierarchy level (1, 2, or 3)
                - chapter_number: Optional chapter number
                - children: List of nested TOC entries
            - flat_toc: Flat list of all TOC entries with paths:
                - title: Section title
                - level: Hierarchy level
                - path: Full navigation path (e.g., "Chapter 1 > Section 1")
            - error: Error message string (only if extraction failed)

    Raises:
        No exceptions are raised; errors are captured in the return dict.

    Example:
        >>> result = extract_toc(
        ...     markdown="# Introduction\n...\n# Chapter 1\n## Section 1.1",
        ...     document_title="My Book",
        ...     provider="ollama"
        ... )
        >>> len(result["toc"])
        2
        >>> result["toc"][0]["title"]
        'Introduction'

    Note:
        - Documents longer than 12,000 characters are truncated
        - Empty or whitespace-only input returns empty lists without
          calling the LLM
        - Short articles without clear TOC return empty lists
        - The LLM is instructed to never invent structure
    """
    # Nothing to analyze: avoid a slow (or billable) LLM round-trip.
    if not markdown or not markdown.strip():
        return {"toc": [], "flat_toc": []}

    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()

    # Truncate overly long documents; the marker tells the model that the
    # text continues beyond what it can see.
    content: str = markdown[:_MAX_DOCUMENT_CHARS]
    if len(markdown) > _MAX_DOCUMENT_CHARS:
        content += "\n\n[... suite du document ...]"

    title_context: str = f"Titre du document: {document_title}\n" if document_title else ""

    prompt: str = f"""Tu es un expert en structuration de documents académiques.

TÂCHE: Extraire la table des matières FIDÈLE au document fourni.

{title_context}
⚠️ RÈGLES CRITIQUES:

1. **ANALYSER LE DOCUMENT RÉEL** - Ne JAMAIS copier les exemples ci-dessous!
2. **DOCUMENTS SANS TOC** - Si le document est un article court, une revue de livre, ou n'a pas de table des matières explicite, retourner {{"toc": []}}
3. **RESPECTER LA STRUCTURE PLATE** - Ne pas inventer de hiérarchie entre des lignes au même niveau
4. **IGNORER** - Métadonnées éditoriales (DOI, ISBN, éditeur, copyright, numéros de page)

NIVEAUX DE STRUCTURE:
- level 1: Introduction, Chapitres principaux, Conclusion, Bibliographie
- level 2: Sections listées sous un chapitre (même niveau visuel)
- level 3: UNIQUEMENT si indentation ou sous-titre explicite visible

FORMAT DE RÉPONSE (JSON entre balises <JSON></JSON>):

Pour un livre avec TOC:
<JSON>
{{
  "toc": [
    {{
      "title": "Titre Chapitre 1",
      "level": 1,
      "chapter_number": 1,
      "children": [
        {{"title": "Section 1.1", "level": 2, "children": []}},
        {{"title": "Section 1.2", "level": 2, "children": []}}
      ]
    }}
  ]
}}
</JSON>

Pour un article SANS TOC (revue de livre, article court, etc.):
<JSON>
{{
  "toc": []
}}
</JSON>

⚠️ NE PAS COPIER CES EXEMPLES ! Analyser uniquement le DOCUMENT RÉEL ci-dessous.

DOCUMENT À ANALYSER:
{content}

Réponds UNIQUEMENT avec le JSON correspondant à CE document (pas aux exemples)."""

    logger.info(f"Extraction TOC via {provider.upper()} ({model})")

    try:
        response: str = call_llm(
            prompt,
            model=model,
            provider=provider,
            temperature=temperature,
            timeout=360,
        )
        result: Dict[str, Any] = _extract_json_from_response(response)
        toc: Any = result.get("toc", [])

        # A non-list "toc" is a malformed answer; report it through the
        # error path instead of leaking the wrong type to callers.
        if not isinstance(toc, list):
            raise ValueError(
                f"Champ 'toc' invalide: liste attendue, reçu {type(toc).__name__}"
            )

        # Derive the flat representation from the hierarchical one.
        flat_toc: List[Dict[str, Any]] = _flatten_toc(toc)

        logger.info(f"TOC extraite: {len(toc)} entrées niveau 1, {len(flat_toc)} entrées totales")

        return {
            "toc": toc,
            "flat_toc": flat_toc,
        }

    except Exception as e:
        # Boundary handler: this function promises never to raise.
        logger.error(f"Erreur extraction TOC: {e}")
        return {
            "toc": [],
            "flat_toc": [],
            "error": str(e),
        }


def _flatten_toc(
    toc: List[Dict[str, Any]],
    parent_path: str = "",
    result: Optional[List[Dict[str, Any]]] = None
) -> List[Dict[str, Any]]:
    """Flatten a hierarchical TOC into a list with navigation paths.

    Recursively traverses a nested TOC structure (depth-first, parents before
    their children) and produces a flat list where each entry includes its
    full path from the root.

    Args:
        toc: Hierarchical TOC list with nested children.
        parent_path: Path accumulated from parent entries. Used internally
            during recursion.
        result: Accumulator list for results. Used internally during
            recursion; a fresh list is created when omitted.

    Returns:
        A flat list of TOC entries, each containing:
            - title: The section title ("" when missing)
            - level: Hierarchy level (defaults to 1 when missing)
            - path: Full navigation path (e.g., "Chapter > Section")
            - chapter_number: Chapter number, or None when absent

    Example:
        >>> hierarchical_toc = [
        ...     {
        ...         "title": "Chapter 1",
        ...         "level": 1,
        ...         "children": [
        ...             {"title": "Section 1.1", "level": 2, "children": []}
        ...         ]
        ...     }
        ... ]
        >>> flat = _flatten_toc(hierarchical_toc)
        >>> flat[0]["path"]
        'Chapter 1'
        >>> flat[1]["path"]
        'Chapter 1 > Section 1.1'
    """
    if result is None:
        result = []

    for item in toc:
        title: str = item.get("title", "")
        level: int = item.get("level", 1)

        # Root entries use their bare title; nested ones extend the parent path.
        path: str = f"{parent_path} > {title}" if parent_path else title

        result.append({
            "title": title,
            "level": level,
            "path": path,
            "chapter_number": item.get("chapter_number"),
        })

        # Descend into children, sharing the same accumulator.
        children: List[Dict[str, Any]] = item.get("children", [])
        if children:
            _flatten_toc(children, path, result)

    return result


def match_content_to_toc(
    content_sections: List[Dict[str, Any]],
    flat_toc: List[Dict[str, Any]],
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
) -> List[Dict[str, Any]]:
    """Match content sections to TOC entries using LLM.

    Asks an LLM to associate each content section title with the index of the
    corresponding entry in the flat table of contents, then annotates the
    sections in place.

    Args:
        content_sections: List of content sections extracted from the
            document. Each section should have a "title" key.
        flat_toc: Flat TOC list as returned by extract_toc()["flat_toc"].
            Each entry should have a "title" key.
        model: LLM model name to use. If None, uses the default model for
            the specified provider.
        provider: LLM provider to use. Either "ollama" for local processing
            or "mistral" for cloud API.

    Returns:
        The input content_sections list (mutated in place) with a
        "toc_match" key added to every section. The value is either:
            - The matched TOC entry dict (if a match was found)
            - None (if no match was found)

    Example:
        >>> sections = [{"title": "Introduction"}, {"title": "Methods"}]
        >>> toc = [{"title": "Introduction", "level": 1, "path": "Introduction"}]
        >>> matched = match_content_to_toc(sections, toc)
        >>> matched[0]["toc_match"]["title"]
        'Introduction'
        >>> matched[1]["toc_match"] is None
        True

    Note:
        - Only the first 30 content sections are processed to limit costs;
          later sections always get toc_match=None
        - If the LLM call or response parsing fails, a warning is logged and
          every section gets toc_match=None
        - Indices that are not integers or fall outside the TOC are treated
          as "no match"
        - The LLM is not called when either input list is empty
    """
    matches: List[Any] = []

    # With no sections or no TOC entries nothing can match; skip the LLM call.
    if content_sections and flat_toc:
        if model is None:
            model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()

        # Only titles are sent to the model, and only for a bounded number
        # of sections.
        toc_titles: List[str] = [item.get("title", "") for item in flat_toc]
        section_titles: List[str] = [
            s.get("title", "") for s in content_sections[:_MAX_MATCHED_SECTIONS]
        ]

        prompt: str = f"""Tu dois associer les sections de contenu aux entrées de la table des matières.

TABLE DES MATIÈRES:
{json.dumps(toc_titles, ensure_ascii=False, indent=2)}

SECTIONS DE CONTENU:
{json.dumps(section_titles, ensure_ascii=False, indent=2)}

Pour chaque section de contenu, indique l'index (0-based) de l'entrée TOC correspondante.
Si pas de correspondance, indique -1.

RÉPONDS avec un JSON:
{{
  "matches": [0, 1, 2, -1, 3, ...]
}}
"""

        try:
            response: str = call_llm(prompt, model=model, provider=provider, temperature=0.1)
            raw_matches: Any = _extract_json_from_response(response).get("matches", [])
            if isinstance(raw_matches, list):
                # Ignore surplus answers for sections that were never submitted.
                matches = raw_matches[:len(section_titles)]
        except Exception as e:
            # Best effort: a failed call leaves every section unmatched.
            logger.warning(f"Erreur correspondance TOC: {e}")

    # Annotate every section so callers can rely on the key being present.
    toc_size: int = len(flat_toc)
    for i, section in enumerate(content_sections):
        index: Any = matches[i] if i < len(matches) else -1
        # bool is a subclass of int; exclude it so True/False never index the TOC.
        is_valid_index: bool = (
            isinstance(index, int)
            and not isinstance(index, bool)
            and 0 <= index < toc_size
        )
        section["toc_match"] = flat_toc[index] if is_valid_index else None

    return content_sections