"""LLM-based section classification module for document structure analysis. This module provides functionality to classify document sections by type (front_matter, chapter, appendix, etc.) using Large Language Models and determine which sections should be indexed for semantic search. Key Features: - Section classification via LLM (classify_sections) - Automatic TOC/metadata section exclusion (is_excluded_section) - Post-classification validation (validate_classified_sections) - Filtering for indexable content (filter_indexable_sections) Section Types: The following section types are recognized: **Indexable Content (should_index=True):** - chapter: Main document content, essays, articles, book reviews - introduction: Document introductions - conclusion: Document conclusions - preface: Prefaces, forewords, warnings (intellectual content) - abstract: Summaries, abstracts **Non-Indexable Content (should_index=False):** - front_matter: Title pages, copyright, credits, colophon - toc_display: Table of contents display (not content) - appendix: Document appendices - bibliography: References, bibliography - index: Document index - notes: End notes - ignore: Ads, empty pages, technical metadata Classification Strategy: 1. LLM analyzes section titles and content previews 2. Automatic exclusion rules catch common TOC/metadata patterns 3. Post-classification validation detects false positives 4. Filtering extracts only indexable content Typical Usage: >>> from utils.llm_classifier import classify_sections, filter_indexable_sections >>> sections = [ ... {"title": "Table of Contents", "content": "...", "level": 1}, ... {"title": "Introduction", "content": "...", "level": 1}, ... {"title": "Chapter 1", "content": "...", "level": 1} ... ] >>> classified = classify_sections(sections, provider="ollama") >>> indexable = filter_indexable_sections(classified) >>> print([s["title"] for s in indexable]) ['Introduction', 'Chapter 1'] LLM Provider Options: - "ollama": Local processing, free but slower - "mistral": Cloud API, faster but incurs costs Note: The classifier is designed to handle edge cases like: - Book reviews with analytical content (classified as chapter) - Editor's notes without analysis (classified as front_matter) - TOC fragments embedded in content (detected and excluded) See Also: - llm_toc: Table of contents extraction - llm_chunker: Semantic chunking of classified sections - llm_metadata: Document metadata extraction """ from __future__ import annotations import json import logging import re from typing import cast, Any, Dict, Final from .llm_structurer import ( _clean_json_string, _get_default_mistral_model, _get_default_model, call_llm, ) from .types import LLMProvider logger: logging.Logger = logging.getLogger(__name__) # Types de sections possibles SECTION_TYPES: Final[dict[str, str]] = { "front_matter": "Métadonnées, page de titre, copyright, crédits, NOTE DE L'ÉDITEUR, colophon", "toc_display": "Table des matières affichée (pas le contenu)", "preface": "Préface, avant-propos, avertissement (contenu intellectuel à indexer)", "abstract": "Résumé, abstract", "introduction": "Introduction de l'œuvre", "chapter": "Chapitre principal du document", "conclusion": "Conclusion de l'œuvre", "appendix": "Annexes", "bibliography": "Bibliographie, références", "index": "Index", "notes": "Notes de fin", "ignore": "À ignorer (publicités, pages vides, métadonnées techniques)", } def _extract_json_from_response(text: str) -> dict[str, Any]: """Extract JSON from LLM response text. Handles two formats: 1. JSON wrapped in tags 2. Raw JSON object in the response Args: text: Raw LLM response text. Returns: Parsed JSON as dictionary. Returns {"classifications": []} on failure. """ json_match: re.Match[str] | None = re.search( r'\s*(.*?)\s*', text, re.DOTALL ) if json_match: json_str: str = _clean_json_string(json_match.group(1)) try: result: Dict[str, Any] = json.loads(json_str) return result except json.JSONDecodeError: pass start: int = text.find("{") end: int = text.rfind("}") if start != -1 and end > start: json_str = _clean_json_string(text[start:end + 1]) try: result = json.loads(json_str) return result except json.JSONDecodeError as e: logger.warning(f"JSON invalide: {e}") return {"classifications": []} def classify_sections( sections: list[dict[str, Any]], document_title: str | None = None, model: str | None = None, provider: LLMProvider = "ollama", temperature: float = 0.1, ) -> list[dict[str, Any]]: """Classify document sections by type using LLM. Uses an LLM to analyze section titles and content previews to determine the type of each section (chapter, front_matter, toc_display, etc.) and whether it should be indexed for semantic search. Args: sections: List of section dictionaries with keys: - title: Section title - content: Section content (preview used) - level: Hierarchy level (1=chapter, 2=section, etc.) document_title: Optional document title for context. model: LLM model name. If None, uses provider default. provider: LLM provider ("ollama" or "mistral"). temperature: Model temperature (0.0-1.0). Lower = more deterministic. Returns: Same sections list with added classification fields: - type: Section type (SectionType literal) - should_index: Whether to include in vector index - chapter_number: Chapter number if applicable - classification_reason: Explanation for the classification Example: >>> sections = [{"title": "Introduction", "content": "...", "level": 1}] >>> classified = classify_sections(sections, provider="ollama") >>> classified[0]["type"] 'introduction' >>> classified[0]["should_index"] True """ if model is None: model = _get_default_mistral_model() if provider == "mistral" else _get_default_model() # Préparer les sections pour le prompt sections_for_prompt: list[dict[str, Any]] = [] for i, section in enumerate(sections[:50]): # Limiter à 50 sections sections_for_prompt.append({ "index": i, "title": section.get("title", ""), "preview": section.get("content", "")[:200] if section.get("content") else "", "level": section.get("level", 1), }) types_description: str = "\n".join([f"- {k}: {v}" for k, v in SECTION_TYPES.items()]) title_context: str = f"Titre du document: {document_title}\n" if document_title else "" prompt: str = f"""Tu es un expert en analyse de structure documentaire. TÂCHE: Classifier chaque section selon son type. {title_context} TYPES DISPONIBLES: {types_description} RÈGLES: 1. "front_matter": UNIQUEMENT pages de titre SANS contenu, copyright, colophon (métadonnées pures) 2. "toc_display": la TABLE DES MATIÈRES elle-même (pas son contenu) 3. "preface": préface, avant-propos, avertissement (À INDEXER car contenu intellectuel) 4. "chapter": TOUT contenu principal - chapitres, sections, articles, revues de livre, essais 5. "ignore": publicités, pages vides, métadonnées techniques sans valeur IMPORTANT - REVUES DE LIVRE ET ARTICLES: - Une REVUE DE LIVRE ("Book Review") avec analyse critique → chapter, should_index = true - Un ARTICLE académique avec contenu substantiel → chapter, should_index = true - Les métadonnées éditoriales (auteur, affiliation, journal) au début d'un article NE sont PAS un motif pour classer comme "front_matter" - Si le document contient un TEXTE ANALYTIQUE développé → chapter CAS PARTICULIERS: - "NOTE DE L'ÉDITEUR" (infos édition, réimpression, SANS analyse) → front_matter, should_index = false - "PRÉFACE" ou "AVANT-PROPOS" (texte intellectuel) → preface, should_index = true - "Book Review" ou "Article" avec paragraphes d'analyse → chapter, should_index = true INDEXATION: - should_index = true pour: preface, introduction, chapter, conclusion, abstract - should_index = false pour: front_matter, toc_display, ignore ⚠️ ATTENTION AUX FAUX POSITIFS - LISTE DE TITRES VS CONTENU RÉEL: LISTE DE TITRES (toc_display, should_index=false): - Suite de titres courts sans texte explicatif - Lignes commençant par "Comment...", "Où...", "Les dispositions à..." - Énumération de sections sans phrase complète - Exemple: "Comment fixer la croyance?\\nOù la croyance s'oppose au savoir\\nL'idéal de rationalité" CONTENU RÉEL (chapter, should_index=true): - Texte avec phrases complètes et verbes conjugués - Paragraphes développés avec arguments - Explications, définitions, raisonnements - Exemple: "Comment fixer la croyance? Cette question se pose dès lors que..." SECTIONS À CLASSIFIER: {json.dumps(sections_for_prompt, ensure_ascii=False, indent=2)} RÉPONDS avec un JSON entre : {{ "classifications": [ {{ "index": 0, "type": "front_matter", "should_index": false, "chapter_number": null, "reason": "Page de titre avec métadonnées éditeur" }}, {{ "index": 1, "type": "chapter", "should_index": true, "chapter_number": 1, "reason": "Premier chapitre du document" }} ] }} """ logger.info(f"Classification de {len(sections_for_prompt)} sections via {provider.upper()} ({model})") try: response: str = call_llm(prompt, model=model, provider=provider, temperature=temperature, timeout=300) result: dict[str, Any] = _extract_json_from_response(response) classifications: list[dict[str, Any]] = result.get("classifications", []) # Créer un mapping index -> classification class_map: dict[int, dict[str, Any]] = { c["index"]: c for c in classifications if "index" in c } # Appliquer les classifications for i, section in enumerate(sections): if i in class_map: c: dict[str, Any] = class_map[i] section["type"] = c.get("type", "chapter") section["should_index"] = c.get("should_index", True) section["chapter_number"] = c.get("chapter_number") section["classification_reason"] = c.get("reason", "") else: # Défaut: traiter comme contenu section["type"] = "chapter" section["should_index"] = True section["chapter_number"] = None # Stats types_count: dict[str, int] = {} for s in sections: t: str = s.get("type", "unknown") types_count[t] = types_count.get(t, 0) + 1 logger.info(f"Classification terminée: {types_count}") return sections except Exception as e: logger.error(f"Erreur classification sections: {e}") # En cas d'erreur, marquer tout comme indexable for section in sections: section["type"] = "chapter" section["should_index"] = True return sections # Titres à exclure automatiquement (insensible à la casse) EXCLUDED_SECTION_TITLES: Final[list[str]] = [ "table des matières", "table des matieres", "sommaire", "table of contents", "contents", "toc", "index", "liste des figures", "liste des tableaux", "list of figures", "list of tables", "note de l'éditeur", "note de l'editeur", "note de la rédaction", "copyright", "mentions légales", "crédits", "colophon", "achevé d'imprimer", ] def is_excluded_section(section: dict[str, Any]) -> bool: """Check if a section should be automatically excluded from indexing. Excludes sections based on: 1. Title matching known TOC/metadata patterns 2. Content analysis detecting TOC-like structure (short lines, title patterns) Args: section: Section dictionary with optional keys: - title: Section title - chapterTitle: Parent chapter title - content: Section content Returns: True if section should be excluded from indexing. Example: >>> is_excluded_section({"title": "Table des matières"}) True >>> is_excluded_section({"title": "Introduction", "content": "..."}) False """ title: str = (section.get("title") or "").lower().strip() chapter_title: str = (section.get("chapterTitle") or "").lower().strip() # Vérifier le titre de la section for excluded in EXCLUDED_SECTION_TITLES: if excluded in title or title == excluded: return True if excluded in chapter_title or chapter_title == excluded: return True # Vérifier si le contenu ressemble à une liste de titres (TOC) content: str = section.get("content", "") if content: lines: list[str] = [l.strip() for l in content.split("\n") if l.strip()] # Si pas assez de lignes, pas de détection if len(lines) < 3: return False # Critère 1: Lignes courtes (moyenne < 50 chars) avg_len: float = sum(len(l) for l in lines) / len(lines) # Critère 2: Toutes les lignes sont courtes (< 100 chars) all_short: bool = all(len(l) < 100 for l in lines[:10]) # Critère 3: Patterns typiques de titres de sections title_patterns: list[str] = [ r'^Comment\s+.+\?', # "Comment fixer la croyance?" r'^Où\s+.+', # "Où la croyance s'oppose" r'^Les?\s+\w+\s+à\s+', # "Les dispositions à penser" r'^Que\s+.+\?', # "Que peut-on savoir?" r'^L[ae]\s+\w+\s+(de|du)\s+', # "La critique de l'intuition" r'^Entre\s+.+\s+et\s+', # "Entre nature et norme" ] # Compter combien de lignes matchent les patterns de titres title_like_count: int = 0 for line in lines[:10]: for pattern in title_patterns: if re.match(pattern, line, re.IGNORECASE): title_like_count += 1 break # Critère 4: Pas de verbes conjugués typiques du contenu narratif narrative_verbs: list[str] = [ r'\best\b', r'\bsont\b', r'\bétait\b', r'\bsera\b', r'\ba\b', r'\bont\b', r'\bavait\b', r'\bavaient\b', r'\bfait\b', r'\bdit\b', r'\bpense\b', r'\bexplique\b' ] has_narrative: bool = False for line in lines[:5]: for verb_pattern in narrative_verbs: if re.search(verb_pattern, line, re.IGNORECASE): has_narrative = True break if has_narrative: break # Décision: C'est une liste de titres (TOC) si: # - Lignes courtes ET toutes < 100 chars ET (beaucoup de patterns de titres OU pas de verbes narratifs) if len(lines) >= 5 and avg_len < 50 and all_short: if title_like_count >= len(lines) * 0.4 or not has_narrative: logger.debug(f"Section '{title}' exclue: ressemble à une TOC (lignes courtes, {title_like_count}/{len(lines)} titres)") return True return False def filter_indexable_sections(sections: list[dict[str, Any]]) -> list[dict[str, Any]]: """Filter sections to keep only those that should be indexed. Applies multiple exclusion criteria: 1. Automatic exclusion by title pattern (TOC, index, etc.) 2. Parent chapter exclusion (if parent is TOC) 3. LLM classification (should_index flag) Args: sections: List of classified section dictionaries. Returns: Filtered list containing only indexable sections. Example: >>> sections = [ ... {"title": "TOC", "should_index": False}, ... {"title": "Chapter 1", "should_index": True} ... ] >>> filtered = filter_indexable_sections(sections) >>> len(filtered) 1 """ filtered: list[dict[str, Any]] = [] excluded_count: int = 0 for s in sections: # Vérifier l'exclusion automatique if is_excluded_section(s): logger.info(f"Section exclue automatiquement: '{s.get('title', 'Sans titre')}'") excluded_count += 1 continue # Vérifier si le chapitre parent est une TOC chapter_title: str = (s.get("chapterTitle") or "").lower().strip() if any(excluded in chapter_title for excluded in EXCLUDED_SECTION_TITLES): logger.info(f"Section exclue (chapitre TOC): '{s.get('title', 'Sans titre')}' dans '{chapter_title}'") excluded_count += 1 continue # Vérifier la classification LLM if s.get("should_index", True): filtered.append(s) else: excluded_count += 1 if excluded_count > 0: logger.info(f"Sections exclues: {excluded_count}, indexables: {len(filtered)}") return filtered def validate_classified_sections(sections: list[dict[str, Any]]) -> list[dict[str, Any]]: """Post-classification validation to detect false positives. Performs additional checks on sections marked should_index=True to catch TOC fragments that escaped initial classification: 1. Parent chapter is TOC -> exclude 2. Content is mostly short title-like lines -> reclassify as toc_display Args: sections: List of already-classified section dictionaries. Returns: Validated sections with corrections applied. Corrections are logged and stored in 'validation_correction' field. Example: >>> sections = [{"title": "Part 1", "should_index": True, "content": "..."}] >>> validated = validate_classified_sections(sections) >>> # May reclassify sections with TOC-like content """ validated: list[dict[str, Any]] = [] fixed_count: int = 0 for section in sections: # Vérifier d'abord si le titre du chapitre parent est une TOC chapter_title: str = (section.get("chapter_title") or "").lower().strip() section_title: str = (section.get("title") or "").lower().strip() # Exclure si le chapitre parent est une TOC is_toc_chapter: bool = False for excluded in EXCLUDED_SECTION_TITLES: if excluded in chapter_title: logger.warning(f"Section '{section.get('title', 'Sans titre')}' exclue: chapitre parent est '{chapter_title}'") section["should_index"] = False section["type"] = "toc_display" section["validation_correction"] = f"Exclue car chapitre parent = {chapter_title}" fixed_count += 1 is_toc_chapter = True break if is_toc_chapter: validated.append(section) continue # Si déjà marquée comme non-indexable, garder tel quel if not section.get("should_index", True): validated.append(section) continue content: str = section.get("content", "") # Validation supplémentaire sur le contenu if content: lines: list[str] = [l.strip() for l in content.split("\n") if l.strip()] # Si très peu de lignes, probablement pas un problème if len(lines) < 3: validated.append(section) continue # Calculer le ratio de lignes qui ressemblent à des titres title_question_pattern: str = r'^(Comment|Où|Que|Quelle|Quel|Les?\s+\w+\s+(de|du|à)|Entre\s+.+\s+et)\s+' title_like: int = sum(1 for l in lines if re.match(title_question_pattern, l, re.IGNORECASE)) # Si > 50% des lignes ressemblent à des titres ET lignes courtes avg_len: float = sum(len(l) for l in lines) / len(lines) if len(lines) >= 4 and title_like >= len(lines) * 0.5 and avg_len < 55: # C'est probablement une liste de titres extraite de la TOC logger.warning(f"Section '{section.get('title', 'Sans titre')}' reclassée: détectée comme liste de titres TOC") section["should_index"] = False section["type"] = "toc_display" section["validation_correction"] = "Reclassée comme toc_display (liste de titres)" fixed_count += 1 validated.append(section) continue validated.append(section) if fixed_count > 0: logger.info(f"Validation post-classification: {fixed_count} section(s) reclassée(s)") return validated def get_chapter_sections(sections: list[dict[str, Any]]) -> list[dict[str, Any]]: """Filter sections to return only chapter-type content. Returns sections with types that contain main document content: chapter, introduction, conclusion, abstract, preface. Args: sections: List of classified section dictionaries. Returns: Filtered list containing only chapter-type sections. Example: >>> sections = [ ... {"title": "TOC", "type": "toc_display"}, ... {"title": "Chapter 1", "type": "chapter"} ... ] >>> chapters = get_chapter_sections(sections) >>> len(chapters) 1 """ chapter_types: set[str] = {"chapter", "introduction", "conclusion", "abstract", "preface"} return [s for s in sections if s.get("type") in chapter_types]