Add Library RAG project and cleanup root directory

- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-30 11:57:12 +01:00
parent 48470236da
commit d2f7165120
84 changed files with 26517 additions and 2 deletions
--- a/generations/library_rag/utils/llm_classifier.py
+++ b/generations/library_rag/utils/llm_classifier.py
@@ -0,0 +1,582 @@
+"""LLM-based section classification module for document structure analysis.
+
+This module provides functionality to classify document sections by type
+(front_matter, chapter, appendix, etc.) using Large Language Models and
+determine which sections should be indexed for semantic search.
+
+Key Features:
+    - Section classification via LLM (classify_sections)
+    - Automatic TOC/metadata section exclusion (is_excluded_section)
+    - Post-classification validation (validate_classified_sections)
+    - Filtering for indexable content (filter_indexable_sections)
+
+Section Types:
+    The following section types are recognized:
+
+    **Indexable Content (should_index=True):**
+        - chapter: Main document content, essays, articles, book reviews
+        - introduction: Document introductions
+        - conclusion: Document conclusions
+        - preface: Prefaces, forewords, warnings (intellectual content)
+        - abstract: Summaries, abstracts
+
+    **Non-Indexable Content (should_index=False):**
+        - front_matter: Title pages, copyright, credits, colophon
+        - toc_display: Table of contents display (not content)
+        - appendix: Document appendices
+        - bibliography: References, bibliography
+        - index: Document index
+        - notes: End notes
+        - ignore: Ads, empty pages, technical metadata
+
+Classification Strategy:
+    1. LLM analyzes section titles and content previews
+    2. Automatic exclusion rules catch common TOC/metadata patterns
+    3. Post-classification validation detects false positives
+    4. Filtering extracts only indexable content
+
+Typical Usage:
+    >>> from utils.llm_classifier import classify_sections, filter_indexable_sections
+    >>> sections = [
+    ...     {"title": "Table of Contents", "content": "...", "level": 1},
+    ...     {"title": "Introduction", "content": "...", "level": 1},
+    ...     {"title": "Chapter 1", "content": "...", "level": 1}
+    ... ]
+    >>> classified = classify_sections(sections, provider="ollama")
+    >>> indexable = filter_indexable_sections(classified)
+    >>> print([s["title"] for s in indexable])
+    ['Introduction', 'Chapter 1']
+
+LLM Provider Options:
+    - "ollama": Local processing, free but slower
+    - "mistral": Cloud API, faster but incurs costs
+
+Note:
+    The classifier is designed to handle edge cases like:
+    - Book reviews with analytical content (classified as chapter)
+    - Editor's notes without analysis (classified as front_matter)
+    - TOC fragments embedded in content (detected and excluded)
+
+See Also:
+    - llm_toc: Table of contents extraction
+    - llm_chunker: Semantic chunking of classified sections
+    - llm_metadata: Document metadata extraction
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import re
+from typing import cast, Any, Dict, Final
+
+from .llm_structurer import (
+    _clean_json_string,
+    _get_default_mistral_model,
+    _get_default_model,
+    call_llm,
+)
+from .types import LLMProvider
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+# Recognized section types, mapped to the French description of each type.
+# These descriptions are rendered verbatim into the LLM classification prompt
+# (one "- key: description" line per entry), so they are runtime text, not
+# documentation: editing them changes what the model is asked.
+SECTION_TYPES: Final[dict[str, str]] = {
+    "front_matter": "Métadonnées, page de titre, copyright, crédits, NOTE DE L'ÉDITEUR, colophon",
+    "toc_display": "Table des matières affichée (pas le contenu)",
+    "preface": "Préface, avant-propos, avertissement (contenu intellectuel à indexer)",
+    "abstract": "Résumé, abstract",
+    "introduction": "Introduction de l'œuvre",
+    "chapter": "Chapitre principal du document",
+    "conclusion": "Conclusion de l'œuvre",
+    "appendix": "Annexes",
+    "bibliography": "Bibliographie, références",
+    "index": "Index",
+    "notes": "Notes de fin",
+    "ignore": "À ignorer (publicités, pages vides, métadonnées techniques)",
+}
+
+
+def _extract_json_from_response(text: str) -> dict[str, Any]:
+    """Extract the classification JSON object from an LLM response.
+
+    Two candidate payloads are tried, in this order:
+    1. The text wrapped in <JSON></JSON> tags
+    2. The span running from the first "{" to the last "}" of the response
+
+    Each candidate is passed through ``_clean_json_string`` and then decoded
+    with ``json.loads``. The first candidate that yields a usable value wins.
+
+    Args:
+        text: Raw LLM response text.
+
+    Returns:
+        Parsed JSON as dictionary. A payload that decodes to a bare list is
+        wrapped as {"classifications": [...]}. Returns {"classifications": []}
+        when no candidate decodes to a dictionary or a list.
+    """
+    candidates: list[str] = []
+
+    tagged: re.Match[str] | None = re.search(
+        r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL
+    )
+    if tagged:
+        candidates.append(tagged.group(1))
+
+    start: int = text.find("{")
+    end: int = text.rfind("}")
+    if start != -1 and end > start:
+        candidates.append(text[start:end + 1])
+
+    last_position: int = len(candidates) - 1
+    for position, candidate in enumerate(candidates):
+        try:
+            parsed: Any = json.loads(_clean_json_string(candidate))
+        except json.JSONDecodeError as e:
+            # An earlier candidate failing is expected noise as long as a
+            # later one may still succeed; only the final failure is reported.
+            if position == last_position:
+                logger.warning(f"JSON invalide: {e}")
+            continue
+
+        if isinstance(parsed, dict):
+            return parsed
+        if isinstance(parsed, list):
+            # The model answered with the bare array instead of the wrapper
+            # object; callers only ever read the "classifications" key.
+            return {"classifications": parsed}
+
+        # Valid JSON but neither an object nor an array (number, string, ...):
+        # returning it would break the declared return type for callers.
+        logger.warning(f"JSON inattendu ({type(parsed).__name__}), objet attendu")
+
+    return {"classifications": []}
+
+
+def _build_classification_prompt(
+    sections_for_prompt: list[dict[str, Any]],
+    document_title: str | None,
+) -> str:
+    """Render the French classification prompt sent to the LLM.
+
+    Args:
+        sections_for_prompt: Compact per-section records (index, title,
+            preview, level) serialized as JSON into the prompt.
+        document_title: Optional document title added as a context line.
+
+    Returns:
+        The complete prompt text, ending with the expected <JSON> example.
+    """
+    types_description: str = "\n".join([f"- {k}: {v}" for k, v in SECTION_TYPES.items()])
+    title_context: str = f"Titre du document: {document_title}\n" if document_title else ""
+
+    return f"""Tu es un expert en analyse de structure documentaire.
+
+TÂCHE: Classifier chaque section selon son type.
+
+{title_context}
+TYPES DISPONIBLES:
+{types_description}
+
+RÈGLES:
+1. "front_matter": UNIQUEMENT pages de titre SANS contenu, copyright, colophon (métadonnées pures)
+2. "toc_display": la TABLE DES MATIÈRES elle-même (pas son contenu)
+3. "preface": préface, avant-propos, avertissement (À INDEXER car contenu intellectuel)
+4. "chapter": TOUT contenu principal - chapitres, sections, articles, revues de livre, essais
+5. "ignore": publicités, pages vides, métadonnées techniques sans valeur
+
+IMPORTANT - REVUES DE LIVRE ET ARTICLES:
+- Une REVUE DE LIVRE ("Book Review") avec analyse critique → chapter, should_index = true
+- Un ARTICLE académique avec contenu substantiel → chapter, should_index = true
+- Les métadonnées éditoriales (auteur, affiliation, journal) au début d'un article NE sont PAS un motif pour classer comme "front_matter"
+- Si le document contient un TEXTE ANALYTIQUE développé → chapter
+
+CAS PARTICULIERS:
+- "NOTE DE L'ÉDITEUR" (infos édition, réimpression, SANS analyse) → front_matter, should_index = false
+- "PRÉFACE" ou "AVANT-PROPOS" (texte intellectuel) → preface, should_index = true
+- "Book Review" ou "Article" avec paragraphes d'analyse → chapter, should_index = true
+
+INDEXATION:
+- should_index = true pour: preface, introduction, chapter, conclusion, abstract
+- should_index = false pour: front_matter, toc_display, ignore
+
+⚠️ ATTENTION AUX FAUX POSITIFS - LISTE DE TITRES VS CONTENU RÉEL:
+
+LISTE DE TITRES (toc_display, should_index=false):
+- Suite de titres courts sans texte explicatif
+- Lignes commençant par "Comment...", "Où...", "Les dispositions à..."
+- Énumération de sections sans phrase complète
+- Exemple: "Comment fixer la croyance?\\nOù la croyance s'oppose au savoir\\nL'idéal de rationalité"
+
+CONTENU RÉEL (chapter, should_index=true):
+- Texte avec phrases complètes et verbes conjugués
+- Paragraphes développés avec arguments
+- Explications, définitions, raisonnements
+- Exemple: "Comment fixer la croyance? Cette question se pose dès lors que..."
+
+SECTIONS À CLASSIFIER:
+{json.dumps(sections_for_prompt, ensure_ascii=False, indent=2)}
+
+RÉPONDS avec un JSON entre <JSON></JSON>:
+
+<JSON>
+{{
+    "classifications": [
+        {{
+            "index": 0,
+            "type": "front_matter",
+            "should_index": false,
+            "chapter_number": null,
+            "reason": "Page de titre avec métadonnées éditeur"
+        }},
+        {{
+            "index": 1,
+            "type": "chapter",
+            "should_index": true,
+            "chapter_number": 1,
+            "reason": "Premier chapitre du document"
+        }}
+    ]
+}}
+</JSON>
+"""
+
+
+def _index_classifications(classifications: Any) -> dict[int, dict[str, Any]]:
+    """Map section index to its classification entry, skipping malformed ones.
+
+    Entries that are not dictionaries, or whose "index" is missing or cannot be
+    converted to an integer, are ignored instead of aborting the whole batch.
+    When an index appears more than once, the last entry wins.
+    """
+    indexed: dict[int, dict[str, Any]] = {}
+    if not isinstance(classifications, list):
+        return indexed
+
+    for entry in classifications:
+        if not isinstance(entry, dict):
+            continue
+        try:
+            # Models sometimes emit the index as a string ("3") or float (3.0).
+            position = int(entry["index"])
+        except (KeyError, TypeError, ValueError):
+            continue
+        indexed[position] = entry
+    return indexed
+
+
+def _as_bool(value: Any, default: bool = True) -> bool:
+    """Interpret a truth value coming from LLM-produced JSON.
+
+    Real booleans are returned unchanged. Strings are matched against common
+    English/French spellings so that "false" is not treated as truthy.
+    None and unrecognized strings yield *default*.
+    """
+    if isinstance(value, bool):
+        return value
+    if value is None:
+        return default
+    if isinstance(value, str):
+        lowered = value.strip().lower()
+        if lowered in {"true", "vrai", "yes", "oui", "1"}:
+            return True
+        if lowered in {"false", "faux", "no", "non", "0"}:
+            return False
+        return default
+    return bool(value)
+
+
+def classify_sections(
+    sections: list[dict[str, Any]],
+    document_title: str | None = None,
+    model: str | None = None,
+    provider: LLMProvider = "ollama",
+    temperature: float = 0.1,
+) -> list[dict[str, Any]]:
+    """Classify document sections by type using LLM.
+
+    Sends the title, a 200-character content preview and the level of each
+    section to the LLM, then writes the returned classification back onto the
+    section dictionaries. The input dictionaries are modified in place.
+
+    Only the first 50 sections are submitted to the LLM. Sections beyond that
+    limit, and sections the LLM did not answer for, are treated as indexable
+    "chapter" content. If the LLM call or the response parsing fails, every
+    section is marked as an indexable "chapter" (best-effort fallback).
+
+    Args:
+        sections: List of section dictionaries with keys:
+            - title: Section title
+            - content: Section content (preview used)
+            - level: Hierarchy level (1=chapter, 2=section, etc.)
+        document_title: Optional document title for context.
+        model: LLM model name. If None, uses provider default.
+        provider: LLM provider ("ollama" or "mistral").
+        temperature: Model temperature (0.0-1.0). Lower = more deterministic.
+
+    Returns:
+        Same sections list with added classification fields:
+            - type: Section type (a key of SECTION_TYPES; unknown values
+              returned by the LLM are replaced by "chapter")
+            - should_index: Whether to include in vector index (always a bool)
+            - chapter_number: Chapter number if applicable
+            - classification_reason: Explanation for the classification
+              (empty string when the LLM gave none)
+
+    Example:
+        >>> sections = [{"title": "Introduction", "content": "...", "level": 1}]
+        >>> classified = classify_sections(sections, provider="ollama")
+        >>> classified[0]["type"]
+        'introduction'
+        >>> classified[0]["should_index"]
+        True
+    """
+    # Nothing to classify: avoid a pointless (and possibly billed) LLM call.
+    if not sections:
+        return sections
+
+    if model is None:
+        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
+
+    # Cap the prompt size: only the first sections are sent to the model.
+    max_sections: int = 50
+    preview_chars: int = 200
+    if len(sections) > max_sections:
+        logger.warning(
+            f"{len(sections) - max_sections} section(s) au-delà de la limite de "
+            f"{max_sections} ne seront pas classifiées par le LLM (traitées comme 'chapter')"
+        )
+
+    sections_for_prompt: list[dict[str, Any]] = []
+    for i, section in enumerate(sections[:max_sections]):
+        content: Any = section.get("content")
+        sections_for_prompt.append({
+            "index": i,
+            "title": section.get("title", ""),
+            # Non-string content cannot be previewed meaningfully.
+            "preview": content[:preview_chars] if isinstance(content, str) else "",
+            "level": section.get("level", 1),
+        })
+
+    prompt: str = _build_classification_prompt(sections_for_prompt, document_title)
+
+    logger.info(f"Classification de {len(sections_for_prompt)} sections via {provider.upper()} ({model})")
+
+    try:
+        response: str = call_llm(prompt, model=model, provider=provider, temperature=temperature, timeout=300)
+        result: dict[str, Any] = _extract_json_from_response(response)
+        class_map: dict[int, dict[str, Any]] = _index_classifications(
+            result.get("classifications", [])
+        )
+    except Exception as e:
+        # Deliberate best-effort boundary: a failed classification must not
+        # block the pipeline, so everything is kept as indexable content.
+        logger.error(f"Erreur classification sections: {e}", exc_info=True)
+        for section in sections:
+            section["type"] = "chapter"
+            section["should_index"] = True
+            section.setdefault("chapter_number", None)
+            section.setdefault("classification_reason", "")
+        return sections
+
+    for i, section in enumerate(sections):
+        c: dict[str, Any] | None = class_map.get(i)
+        if c is None:
+            # Default: treat unanswered sections as content.
+            section["type"] = "chapter"
+            section["should_index"] = True
+            section["chapter_number"] = None
+            section.setdefault("classification_reason", "")
+            continue
+
+        raw_type: Any = c.get("type", "chapter")
+        section_type: str = raw_type.strip().lower() if isinstance(raw_type, str) else ""
+        if section_type not in SECTION_TYPES:
+            logger.warning(f"Type de section inconnu '{raw_type}' (section {i}), traité comme 'chapter'")
+            section_type = "chapter"
+
+        section["type"] = section_type
+        section["should_index"] = _as_bool(c.get("should_index"), default=True)
+        section["chapter_number"] = c.get("chapter_number")
+        section["classification_reason"] = c.get("reason", "")
+
+    # Per-type counts, for the log only.
+    types_count: dict[str, int] = {}
+    for s in sections:
+        t: str = s.get("type", "unknown")
+        types_count[t] = types_count.get(t, 0) + 1
+
+    logger.info(f"Classification terminée: {types_count}")
+
+    return sections
+
+
+# Titles that are excluded automatically (matching is case-insensitive).
+# All entries are written in lowercase; French entries cover table-of-contents,
+# figure/table lists, editor's notes and legal/printing mentions, with a few
+# unaccented spellings listed explicitly alongside the accented ones.
+EXCLUDED_SECTION_TITLES: Final[list[str]] = [
+    "table des matières",
+    "table des matieres",
+    "sommaire",
+    "table of contents",
+    "contents",
+    "toc",
+    "index",
+    "liste des figures",
+    "liste des tableaux",
+    "list of figures",
+    "list of tables",
+    "note de l'éditeur",
+    "note de l'editeur",
+    "note de la rédaction",
+    "copyright",
+    "mentions légales",
+    "crédits",
+    "colophon",
+    "achevé d'imprimer",
+]
+
+
+def _matches_excluded_title(text: str) -> bool:
+    """Return True if *text* contains an excluded title as whole word(s).
+
+    *text* is expected to be lowercased already. Matching on word boundaries
+    (instead of raw substrings) keeps titles such as "Le protocole" or
+    "Indexicalité" from being rejected because they merely contain "toc" or
+    "index".
+    """
+    return any(
+        re.search(rf"(?<!\w){re.escape(excluded)}(?!\w)", text)
+        for excluded in EXCLUDED_SECTION_TITLES
+    )
+
+
+def is_excluded_section(section: dict[str, Any]) -> bool:
+    """Check if a section should be automatically excluded from indexing.
+
+    Excludes sections based on:
+    1. Title (or parent chapter title) containing a known TOC/metadata title
+       as whole word(s), case-insensitively
+    2. Content analysis detecting TOC-like structure (short lines, title patterns)
+
+    The content heuristic only applies to sections with at least 5 non-empty
+    lines whose average length is under 50 characters and whose first 10 lines
+    are all under 100 characters. Such a section is excluded when enough of
+    its leading lines look like headings, or when none of its first 5 lines
+    contains a common French conjugated verb.
+
+    Args:
+        section: Section dictionary with optional keys:
+            - title: Section title
+            - chapterTitle: Parent chapter title
+            - content: Section content
+          Missing keys and None values are treated as empty strings.
+
+    Returns:
+        True if section should be excluded from indexing.
+
+    Example:
+        >>> is_excluded_section({"title": "Table des matières"})
+        True
+        >>> is_excluded_section({"title": "Introduction", "content": "..."})
+        False
+    """
+    title: str = (section.get("title") or "").lower().strip()
+    chapter_title: str = (section.get("chapterTitle") or "").lower().strip()
+
+    # Title-based exclusion (section title or parent chapter title).
+    if _matches_excluded_title(title) or _matches_excluded_title(chapter_title):
+        return True
+
+    # Content-based exclusion: does the content look like a list of headings?
+    content: str = section.get("content") or ""
+    lines: list[str] = [line.strip() for line in content.split("\n") if line.strip()]
+
+    # Too few lines to tell a heading list from ordinary content.
+    if len(lines) < 5:
+        return False
+
+    # Criteria 1 and 2: short lines on average, and no long line near the top.
+    avg_len: float = sum(len(line) for line in lines) / len(lines)
+    all_short: bool = all(len(line) < 100 for line in lines[:10])
+    if avg_len >= 50 or not all_short:
+        return False
+
+    # Criterion 3: typical shapes of (French) section headings.
+    title_patterns: list[str] = [
+        r'^Comment\s+.+\?',           # "Comment fixer la croyance?"
+        r'^Où\s+.+',                  # "Où la croyance s'oppose"
+        r'^Les?\s+\w+\s+à\s+',        # "Les dispositions à penser"
+        r'^Que\s+.+\?',               # "Que peut-on savoir?"
+        r'^L[ae]\s+\w+\s+(de|du)\s+', # "La critique de l'intuition"
+        r'^Entre\s+.+\s+et\s+',       # "Entre nature et norme"
+    ]
+    title_like_count: int = sum(
+        1
+        for line in lines[:10]
+        if any(re.match(pattern, line, re.IGNORECASE) for pattern in title_patterns)
+    )
+
+    # Criterion 4: conjugated verbs that signal running prose.
+    narrative_verbs: list[str] = [
+        r'\best\b', r'\bsont\b', r'\bétait\b', r'\bsera\b',
+        r'\ba\b', r'\bont\b', r'\bavait\b', r'\bavaient\b',
+        r'\bfait\b', r'\bdit\b', r'\bpense\b', r'\bexplique\b'
+    ]
+    has_narrative: bool = any(
+        re.search(verb_pattern, line, re.IGNORECASE)
+        for line in lines[:5]
+        for verb_pattern in narrative_verbs
+    )
+
+    # NOTE(review): title_like_count is taken over the first 10 lines but is
+    # compared against 40% of ALL lines, so for long sections only the
+    # "no narrative verb" branch can fire. Kept as-is; confirm the intent.
+    if title_like_count >= len(lines) * 0.4 or not has_narrative:
+        logger.debug(f"Section '{title}' exclue: ressemble à une TOC (lignes courtes, {title_like_count}/{len(lines)} titres)")
+        return True
+
+    return False
+
+
+def filter_indexable_sections(sections: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Filter sections to keep only those that should be indexed.
+
+    Applies multiple exclusion criteria, in this order:
+    1. Automatic exclusion via is_excluded_section (TOC, index, etc.)
+    2. Parent chapter exclusion: the "chapterTitle" contains an entry of
+       EXCLUDED_SECTION_TITLES as whole word(s), case-insensitively
+    3. LLM classification (should_index flag; a missing flag means indexable)
+
+    Args:
+        sections: List of classified section dictionaries. A missing or None
+            "chapterTitle" is treated as an empty title.
+
+    Returns:
+        Filtered list containing only indexable sections, in input order.
+
+    Example:
+        >>> sections = [
+        ...     {"title": "TOC", "should_index": False},
+        ...     {"title": "Chapter 1", "should_index": True}
+        ... ]
+        >>> filtered = filter_indexable_sections(sections)
+        >>> len(filtered)
+        1
+    """
+    def parent_is_excluded(parent_title: str) -> bool:
+        # Whole-word match: "protocole" must not be rejected for containing "toc".
+        return any(
+            re.search(rf"(?<!\w){re.escape(excluded)}(?!\w)", parent_title)
+            for excluded in EXCLUDED_SECTION_TITLES
+        )
+
+    filtered: list[dict[str, Any]] = []
+    excluded_count: int = 0
+
+    for s in sections:
+        # Automatic exclusion (title patterns and TOC-like content).
+        if is_excluded_section(s):
+            logger.info(f"Section exclue automatiquement: '{s.get('title', 'Sans titre')}'")
+            excluded_count += 1
+            continue
+
+        # Exclusion because the parent chapter is a TOC/metadata chapter.
+        chapter_title: str = (s.get("chapterTitle") or "").lower().strip()
+        if parent_is_excluded(chapter_title):
+            logger.info(f"Section exclue (chapitre TOC): '{s.get('title', 'Sans titre')}' dans '{chapter_title}'")
+            excluded_count += 1
+            continue
+
+        # LLM classification: unclassified sections are kept.
+        if s.get("should_index", True):
+            filtered.append(s)
+        else:
+            excluded_count += 1
+
+    if excluded_count > 0:
+        logger.info(f"Sections exclues: {excluded_count}, indexables: {len(filtered)}")
+
+    return filtered
+
+
+def validate_classified_sections(sections: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Post-classification validation to detect false positives.
+
+    Performs additional checks to catch TOC fragments that escaped initial
+    classification:
+    1. Parent chapter title contains an entry of EXCLUDED_SECTION_TITLES as
+       whole word(s) -> reclassify as toc_display (applied to every section,
+       whatever its current should_index value)
+    2. Section still marked indexable whose content has at least 4 non-empty
+       lines, at least half of them heading-like, with an average line length
+       under 55 characters -> reclassify as toc_display
+
+    The parent chapter title is read from "chapterTitle" (the key used by the
+    other helpers of this module) and, failing that, from "chapter_title".
+    Section dictionaries are modified in place.
+
+    Args:
+        sections: List of already-classified section dictionaries.
+
+    Returns:
+        Validated sections with corrections applied. Corrections are logged
+        and stored in 'validation_correction' field.
+
+    Example:
+        >>> sections = [{"title": "Part 1", "should_index": True, "content": "..."}]
+        >>> validated = validate_classified_sections(sections)
+        >>> # May reclassify sections with TOC-like content
+    """
+    def contains_excluded_title(parent_title: str) -> bool:
+        # Whole-word match: "protocole" must not be rejected for containing "toc".
+        return any(
+            re.search(rf"(?<!\w){re.escape(excluded)}(?!\w)", parent_title)
+            for excluded in EXCLUDED_SECTION_TITLES
+        )
+
+    # Lines that start like a French heading ("Comment ...", "Les X de ...").
+    title_question_pattern: str = r'^(Comment|Où|Que|Quelle|Quel|Les?\s+\w+\s+(de|du|à)|Entre\s+.+\s+et)\s+'
+
+    validated: list[dict[str, Any]] = []
+    fixed_count: int = 0
+
+    for section in sections:
+        # Accept both spellings of the parent-title key; None counts as empty.
+        chapter_title: str = (
+            section.get("chapterTitle") or section.get("chapter_title") or ""
+        ).lower().strip()
+
+        # Check 1: the parent chapter is a TOC/metadata chapter.
+        if contains_excluded_title(chapter_title):
+            logger.warning(f"Section '{section.get('title', 'Sans titre')}' exclue: chapitre parent est '{chapter_title}'")
+            section["should_index"] = False
+            section["type"] = "toc_display"
+            section["validation_correction"] = f"Exclue car chapitre parent = {chapter_title}"
+            fixed_count += 1
+            validated.append(section)
+            continue
+
+        # Already non-indexable: nothing left to verify.
+        if not section.get("should_index", True):
+            validated.append(section)
+            continue
+
+        # Check 2: content that is mostly a list of short heading-like lines.
+        content: str = section.get("content") or ""
+        lines: list[str] = [line.strip() for line in content.split("\n") if line.strip()]
+
+        if len(lines) >= 4:
+            title_like: int = sum(
+                1 for line in lines if re.match(title_question_pattern, line, re.IGNORECASE)
+            )
+            avg_len: float = sum(len(line) for line in lines) / len(lines)
+
+            if title_like >= len(lines) * 0.5 and avg_len < 55:
+                # Most likely a list of headings extracted from the TOC.
+                logger.warning(f"Section '{section.get('title', 'Sans titre')}' reclassée: détectée comme liste de titres TOC")
+                section["should_index"] = False
+                section["type"] = "toc_display"
+                section["validation_correction"] = "Reclassée comme toc_display (liste de titres)"
+                fixed_count += 1
+
+        validated.append(section)
+
+    if fixed_count > 0:
+        logger.info(f"Validation post-classification: {fixed_count} section(s) reclassée(s)")
+
+    return validated
+
+
+def get_chapter_sections(sections: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Keep only the sections whose type denotes main document content.
+
+    A section is kept when its "type" value is one of: chapter,
+    introduction, conclusion, abstract, preface. Sections without a
+    "type" key are dropped. Input order is preserved and the section
+    dictionaries themselves are returned, not copies.
+
+    Args:
+        sections: List of classified section dictionaries.
+
+    Returns:
+        New list holding only the content-bearing sections.
+
+    Example:
+        >>> sections = [
+        ...     {"title": "TOC", "type": "toc_display"},
+        ...     {"title": "Chapter 1", "type": "chapter"}
+        ... ]
+        >>> chapters = get_chapter_sections(sections)
+        >>> len(chapters)
+        1
+    """
+    content_types: frozenset[str] = frozenset(
+        ("chapter", "introduction", "conclusion", "abstract", "preface")
+    )
+
+    kept: list[dict[str, Any]] = []
+    for entry in sections:
+        if entry.get("type") in content_types:
+            kept.append(entry)
+    return kept