Files
David Blanc Brioir d2f7165120 Add Library RAG project and cleanup root directory
- Add complete Library RAG application (Flask + MCP server)
  - PDF processing pipeline with OCR and LLM extraction
  - Weaviate vector database integration (BGE-M3 embeddings)
  - Flask web interface with search and document management
  - MCP server for Claude Desktop integration
  - Comprehensive test suite (134 tests)

- Clean up root directory
  - Remove obsolete documentation files
  - Remove backup and temporary files
  - Update autonomous agent configuration

- Update prompts
  - Enhance initializer bis prompt with better instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-30 11:57:12 +01:00

268 lines
7.9 KiB
Python

"""Hierarchical Markdown document parser for semantic chunking.
This module provides utilities for parsing Markdown documents into
hierarchical structures based on heading levels (# to ######). It is
a key component of the RAG pipeline, enabling:
1. **Structure Extraction**: Parse Markdown into a tree of sections
2. **Context Preservation**: Maintain hierarchical context (part > chapter > section)
3. **Semantic Chunking**: Flatten hierarchy into chunks with full path context
The parser uses a stack-based algorithm to build nested section trees,
preserving the document's logical structure for downstream processing.
Architecture:
Input: Raw Markdown text with headings
build_hierarchy() → DocumentHierarchy (tree structure)
flatten_hierarchy() → List[FlatChunk] (with hierarchical context)
TypedDict Definitions:
- HierarchyPath: Hierarchical path (part/chapter/section/subsection)
- HierarchyNode: Tree node with title, level, content, children
- DocumentHierarchy: Complete document structure
- FlatChunk: Flattened chunk with context for RAG ingestion
Algorithm:
The build_hierarchy() function uses a stack-based approach:
1. Initialize a virtual root node at level 0
2. For each line in the document:
- If heading: pop stack until parent level found, then push new node
- If content: append to current node's content
3. Finalize nodes by joining content lines
Example:
>>> markdown = '''
... # Introduction
... This is the intro.
...
... ## Background
... Some background text.
...
... ## Methodology
... Methods used here.
... '''
>>> hierarchy = build_hierarchy(markdown)
>>> print(hierarchy["sections"][0]["title"])
Introduction
>>> chunks = flatten_hierarchy(hierarchy)
>>> for chunk in chunks:
... print(f"{chunk['chunk_id']}: {chunk['title']}")
chunk_00001: Introduction
chunk_00002: Background
chunk_00003: Methodology
See Also:
- utils.llm_chunker: Semantic chunking using LLM
- utils.markdown_builder: Markdown generation from OCR
- utils.weaviate_ingest: Ingestion of chunks into Weaviate
"""
from __future__ import annotations
import re
from typing import List, Optional, Pattern, TypedDict
# Import type definitions from central types module
from utils.types import (
DocumentHierarchy,
FlatChunk,
HierarchyNode,
HierarchyPath,
)
class _BuildNode(TypedDict):
    """Mutable working node used while the section tree is being built.

    Unlike the final node type, ``content`` is kept as a list of raw lines
    so that lines can be appended cheaply; it is joined into a single
    string only once parsing is finished.
    """

    # Heading text; None for the virtual root node.
    title: Optional[str]
    # Heading depth: 0 for the virtual root, 1-6 for real headings.
    level: int
    # Body lines collected for this node, in document order.
    content: List[_BuildNode.__class__] if False else List[str]
    # Nested sub-sections, in document order.
    children: List[_BuildNode]
def build_hierarchy(markdown_text: str) -> DocumentHierarchy:
    """Build a nested section tree from the ATX headings of a Markdown text.

    A line made of one to six ``#`` characters followed by whitespace and a
    title opens a new section. Every other line is appended to the most
    recently opened section, or to the preamble when no heading has been
    seen yet.

    Lines inside a fenced code block (opened by three or more backticks or
    tildes) are always treated as content, so a ``# comment`` inside a code
    sample does not open a spurious section. An unclosed fence extends to
    the end of the text.

    Args:
        markdown_text: Markdown text to parse.

    Returns:
        A dictionary with:
            - preamble: text found before the first heading (stripped).
            - sections: list of sections that have no parent heading.
        Each section is a dictionary with:
            - title: heading text.
            - level: heading depth (1-6).
            - content: stripped text belonging directly to the section.
            - children: nested sections of the same shape.
    """
    # ATX heading: 1-6 hashes, mandatory whitespace, then the title.
    heading_re: Pattern[str] = re.compile(r"^(#{1,6})\s+(.*)$")
    # Code fence: up to 3 leading spaces, 3+ backticks or tildes, then the
    # optional info string (group 2).
    fence_re: Pattern[str] = re.compile(r"^ {0,3}(`{3,}|~{3,})(.*)$")

    # Virtual root (level 0): owns the preamble and the parentless sections.
    root: _BuildNode = {
        "title": None,
        "level": 0,
        "content": [],
        "children": [],
    }
    # Chain of currently open sections, from the root to the innermost one.
    stack: List[_BuildNode] = [root]
    # Marker that opened the current code fence, or None when outside one.
    open_fence: Optional[str] = None

    for line in markdown_text.splitlines():
        stripped: str = line.rstrip()
        fence: Optional[re.Match[str]] = fence_re.match(stripped)

        if open_fence is not None:
            # Inside a code block: nothing is a heading. The block ends on a
            # fence of the same character, at least as long as the opener,
            # with no trailing text.
            if (
                fence is not None
                and fence.group(1)[0] == open_fence[0]
                and len(fence.group(1)) >= len(open_fence)
                and not fence.group(2).strip()
            ):
                open_fence = None
            stack[-1]["content"].append(stripped)
            continue

        # A backtick fence whose info string contains a backtick is inline
        # code, not a fence opener.
        if fence is not None and not (
            fence.group(1)[0] == "`" and "`" in fence.group(2)
        ):
            open_fence = fence.group(1)
            stack[-1]["content"].append(stripped)
            continue

        match: Optional[re.Match[str]] = heading_re.match(stripped)
        if match is None:
            # Plain content line: belongs to the innermost open section.
            stack[-1]["content"].append(stripped)
            continue

        level: int = len(match.group(1))
        title: str = match.group(2).strip()

        # Close every open section that is at the same depth or deeper.
        # The root has level 0 and headings are >= 1, so it is never popped.
        while stack[-1]["level"] >= level:
            stack.pop()

        node: _BuildNode = {
            "title": title,
            "level": level,
            "content": [],
            "children": [],
        }
        stack[-1]["children"].append(node)
        stack.append(node)

    def finalize(node: _BuildNode) -> HierarchyNode:
        """Convert a working node (and its subtree) into its final form."""
        return {
            "title": node["title"],
            "level": node["level"],
            "content": "\n".join(node["content"]).strip(),
            "children": [finalize(child) for child in node["children"]],
        }

    preamble: str = "\n".join(root["content"]).strip()
    sections: List[HierarchyNode] = [finalize(child) for child in root["children"]]
    return {
        "preamble": preamble,
        "sections": sections,
    }
def flatten_hierarchy(hierarchy: DocumentHierarchy) -> List[FlatChunk]:
    """Turn a section tree into an ordered, flat list of chunks.

    The preamble, when non-empty, becomes ``chunk_00000``. Sections are then
    visited depth-first in document order; each section with non-empty
    content yields one chunk numbered from ``chunk_00001`` upwards. Sections
    without content yield no chunk, but their children are still visited.

    Every chunk carries the hierarchical path of its section:
    level 1 sets ``part`` (and clears the rest), level 2 sets ``chapter``,
    level 3 sets ``section``, and level 4 or deeper sets ``subsection``.

    Args:
        hierarchy: Tree structure as produced by build_hierarchy.

    Returns:
        List of chunks, each with its id, text, hierarchical path, type,
        heading level and title.
    """
    flat: List[FlatChunk] = []

    # The preamble has no heading, hence an all-None path and level 0.
    preamble_text = hierarchy.get("preamble")
    if preamble_text:
        flat.append(
            {
                "chunk_id": "chunk_00000",
                "text": preamble_text,
                "hierarchy": HierarchyPath(
                    part=None,
                    chapter=None,
                    section=None,
                    subsection=None,
                ),
                "type": "preamble",
                "level": 0,
                "title": None,
            }
        )

    def derive_path(
        inherited: HierarchyPath,
        depth: int,
        heading: Optional[str],
    ) -> HierarchyPath:
        """Return the path of a section given its parent's path."""
        if depth == 1:
            # A new part starts from a clean path.
            return HierarchyPath(
                part=heading,
                chapter=None,
                section=None,
                subsection=None,
            )
        derived: HierarchyPath = inherited.copy()
        if depth == 2:
            derived["chapter"] = heading
            derived["section"] = None
            derived["subsection"] = None
        elif depth == 3:
            derived["section"] = heading
            derived["subsection"] = None
        elif depth >= 4:
            derived["subsection"] = heading
        return derived

    # Section chunks are numbered from 1; 0 is reserved for the preamble.
    next_number: int = 1
    root_path: HierarchyPath = HierarchyPath(
        part=None,
        chapter=None,
        section=None,
        subsection=None,
    )

    # Explicit stack for a pre-order walk; entries are pushed in reverse so
    # that they are popped in document order.
    pending = [
        (node, root_path) for node in reversed(hierarchy.get("sections", []))
    ]
    while pending:
        node, inherited = pending.pop()
        depth: int = node["level"]
        heading: Optional[str] = node["title"]
        own_path = derive_path(inherited, depth, heading)

        body = node["content"]
        if body:
            flat.append(
                {
                    "chunk_id": f"chunk_{next_number:05d}",
                    "text": body,
                    # Copy so the chunk does not share its path dict.
                    "hierarchy": own_path.copy(),
                    "type": "main_content",
                    "level": depth,
                    "title": heading,
                }
            )
            next_number += 1

        pending.extend(
            (child, own_path) for child in reversed(node["children"])
        )

    return flat