Add Library RAG project and cleanup root directory

- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-30 11:57:12 +01:00
parent 48470236da
commit d2f7165120
84 changed files with 26517 additions and 2 deletions
--- a/generations/library_rag/utils/hierarchy_parser.py
+++ b/generations/library_rag/utils/hierarchy_parser.py
@@ -0,0 +1,267 @@
+"""Hierarchical Markdown document parser for semantic chunking.
+
+This module provides utilities for parsing Markdown documents into
+hierarchical structures based on heading levels (# to ######). It is
+a key component of the RAG pipeline, enabling:
+
+1. **Structure Extraction**: Parse Markdown into a tree of sections
+2. **Context Preservation**: Maintain hierarchical context (part > chapter > section)
+3. **Semantic Chunking**: Flatten hierarchy into chunks with full path context
+
+The parser uses a stack-based algorithm to build nested section trees,
+preserving the document's logical structure for downstream processing.
+
+Architecture:
+    Input: Raw Markdown text with headings
+        ↓
+    build_hierarchy() → DocumentHierarchy (tree structure)
+        ↓
+    flatten_hierarchy() → List[FlatChunk] (with hierarchical context)
+
+TypedDict Definitions:
+    - HierarchyPath: Hierarchical path (part/chapter/section/subsection)
+    - HierarchyNode: Tree node with title, level, content, children
+    - DocumentHierarchy: Complete document structure
+    - FlatChunk: Flattened chunk with context for RAG ingestion
+
+Algorithm:
+    The build_hierarchy() function uses a stack-based approach:
+    1. Initialize a virtual root node at level 0
+    2. For each line in the document:
+       - If heading: pop stack until parent level found, then push new node
+       - If content: append to current node's content
+    3. Finalize nodes by joining content lines
+
+Example:
+    >>> markdown = '''
+    ... # Introduction
+    ... This is the intro.
+    ...
+    ... ## Background
+    ... Some background text.
+    ...
+    ... ## Methodology
+    ... Methods used here.
+    ... '''
+    >>> hierarchy = build_hierarchy(markdown)
+    >>> print(hierarchy["sections"][0]["title"])
+    'Introduction'
+    >>> chunks = flatten_hierarchy(hierarchy)
+    >>> for chunk in chunks:
+    ...     print(f"{chunk['chunk_id']}: {chunk['title']}")
+    chunk_00001: Introduction
+    chunk_00002: Background
+    chunk_00003: Methodology
+
+See Also:
+    - utils.llm_chunker: Semantic chunking using LLM
+    - utils.markdown_builder: Markdown generation from OCR
+    - utils.weaviate_ingest: Ingestion of chunks into Weaviate
+"""
+
+from __future__ import annotations
+
+import re
+from typing import List, Optional, Pattern, TypedDict
+
+# Import type definitions from central types module
+from utils.types import (
+    DocumentHierarchy,
+    FlatChunk,
+    HierarchyNode,
+    HierarchyPath,
+)
+
+
+class _BuildNode(TypedDict):
+    """Noeud interne pour la construction de la hiérarchie."""
+
+    title: Optional[str]
+    level: int
+    content: List[str]
+    children: List[_BuildNode]
+
+
+def build_hierarchy(markdown_text: str) -> DocumentHierarchy:
+    """Construit une structure hiérarchique à partir des titres Markdown.
+
+    Analyse les titres (# à ######) et construit un arbre de sections
+    avec leur contenu textuel.
+
+    Args:
+        markdown_text: Texte Markdown à analyser
+
+    Returns:
+        Dictionnaire avec :
+        - preamble: Texte avant le premier titre
+        - sections: Liste de sections imbriquées
+
+        Chaque section contient :
+        - title: Titre de la section
+        - level: Niveau (1-6)
+        - content: Contenu textuel
+        - children: Sous-sections
+    """
+    # Regex pour les titres Markdown
+    heading_re: Pattern[str] = re.compile(r"^(#{1,6})\s+(.*)$")
+
+    lines: List[str] = markdown_text.splitlines()
+
+    # Noeud racine (niveau 0, virtuel)
+    root: _BuildNode = {
+        "title": None,
+        "level": 0,
+        "content": [],
+        "children": [],
+    }
+
+    # Pile pour suivre la hiérarchie
+    stack: List[_BuildNode] = [root]
+
+    for line in lines:
+        stripped: str = line.rstrip()
+        match: Optional[re.Match[str]] = heading_re.match(stripped)
+
+        if match:
+            # C'est un titre
+            level: int = len(match.group(1))
+            title: str = match.group(2).strip()
+
+            # Remonter dans la pile jusqu'au parent approprié
+            while stack and stack[-1]["level"] >= level:
+                stack.pop()
+
+            # Créer le nouveau noeud
+            node: _BuildNode = {
+                "title": title,
+                "level": level,
+                "content": [],
+                "children": [],
+            }
+
+            # Ajouter au parent
+            parent: _BuildNode = stack[-1]
+            parent["children"].append(node)
+
+            # Empiler le nouveau noeud
+            stack.append(node)
+        else:
+            # C'est du contenu, l'ajouter au noeud courant
+            stack[-1]["content"].append(stripped)
+
+    # Finaliser les noeuds (joindre le contenu)
+    def finalize(node: _BuildNode) -> HierarchyNode:
+        """Convertit un noeud de construction en noeud final."""
+        return HierarchyNode(
+            title=node["title"],
+            level=node["level"],
+            content="\n".join(node["content"]).strip(),
+            children=[finalize(child) for child in node["children"]],
+        )
+
+    # Extraire le préambule et les sections
+    preamble: str = "\n".join(root["content"]).strip()
+    sections: List[HierarchyNode] = [finalize(child) for child in root["children"]]
+
+    return DocumentHierarchy(
+        preamble=preamble,
+        sections=sections,
+    )
+
+
+def flatten_hierarchy(hierarchy: DocumentHierarchy) -> List[FlatChunk]:
+    """Aplatit la hiérarchie en une liste de chunks.
+
+    Args:
+        hierarchy: Structure hiérarchique (sortie de build_hierarchy)
+
+    Returns:
+        Liste de chunks avec leur contexte hiérarchique
+    """
+    chunks: List[FlatChunk] = []
+
+    # Préambule comme premier chunk
+    if hierarchy.get("preamble"):
+        preamble_chunk: FlatChunk = {
+            "chunk_id": "chunk_00000",
+            "text": hierarchy["preamble"],
+            "hierarchy": HierarchyPath(
+                part=None,
+                chapter=None,
+                section=None,
+                subsection=None,
+            ),
+            "type": "preamble",
+            "level": 0,
+            "title": None,
+        }
+        chunks.append(preamble_chunk)
+
+    def process_section(
+        section: HierarchyNode,
+        path: HierarchyPath,
+        index: int,
+    ) -> int:
+        """Traite récursivement une section.
+
+        Args:
+            section: Noeud de section à traiter
+            path: Chemin hiérarchique courant
+            index: Index du prochain chunk
+
+        Returns:
+            Nouvel index après traitement
+        """
+        level: int = section["level"]
+        title: Optional[str] = section["title"]
+
+        # Mettre à jour le chemin hiérarchique
+        current_path: HierarchyPath = path.copy()
+        if level == 1:
+            current_path = HierarchyPath(
+                part=title,
+                chapter=None,
+                section=None,
+                subsection=None,
+            )
+        elif level == 2:
+            current_path["chapter"] = title
+            current_path["section"] = None
+            current_path["subsection"] = None
+        elif level == 3:
+            current_path["section"] = title
+            current_path["subsection"] = None
+        elif level >= 4:
+            current_path["subsection"] = title
+
+        # Créer le chunk si contenu
+        if section["content"]:
+            chunk: FlatChunk = {
+                "chunk_id": f"chunk_{index:05d}",
+                "text": section["content"],
+                "hierarchy": current_path.copy(),
+                "type": "main_content",
+                "level": level,
+                "title": title,
+            }
+            chunks.append(chunk)
+            index += 1
+
+        # Traiter les enfants
+        for child in section["children"]:
+            index = process_section(child, current_path, index)
+
+        return index
+
+    # Traiter toutes les sections
+    idx: int = 1
+    initial_path: HierarchyPath = HierarchyPath(
+        part=None,
+        chapter=None,
+        section=None,
+        subsection=None,
+    )
+    for section in hierarchy.get("sections", []):
+        idx = process_section(section, initial_path, idx)
+
+    return chunks