- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
268 lines
7.9 KiB
Python
268 lines
7.9 KiB
Python
"""Hierarchical Markdown document parser for semantic chunking.
|
|
|
|
This module provides utilities for parsing Markdown documents into
|
|
hierarchical structures based on heading levels (# to ######). It is
|
|
a key component of the RAG pipeline, enabling:
|
|
|
|
1. **Structure Extraction**: Parse Markdown into a tree of sections
|
|
2. **Context Preservation**: Maintain hierarchical context (part > chapter > section)
|
|
3. **Semantic Chunking**: Flatten hierarchy into chunks with full path context
|
|
|
|
The parser uses a stack-based algorithm to build nested section trees,
|
|
preserving the document's logical structure for downstream processing.
|
|
|
|
Architecture:
|
|
Input: Raw Markdown text with headings
|
|
↓
|
|
build_hierarchy() → DocumentHierarchy (tree structure)
|
|
↓
|
|
flatten_hierarchy() → List[FlatChunk] (with hierarchical context)
|
|
|
|
TypedDict Definitions:
|
|
- HierarchyPath: Hierarchical path (part/chapter/section/subsection)
|
|
- HierarchyNode: Tree node with title, level, content, children
|
|
- DocumentHierarchy: Complete document structure
|
|
- FlatChunk: Flattened chunk with context for RAG ingestion
|
|
|
|
Algorithm:
|
|
The build_hierarchy() function uses a stack-based approach:
|
|
1. Initialize a virtual root node at level 0
|
|
2. For each line in the document:
|
|
- If heading: pop stack until parent level found, then push new node
|
|
- If content: append to current node's content
|
|
3. Finalize nodes by joining content lines
|
|
|
|
Example:
|
|
>>> markdown = '''
|
|
... # Introduction
|
|
... This is the intro.
|
|
...
|
|
... ## Background
|
|
... Some background text.
|
|
...
|
|
... ## Methodology
|
|
... Methods used here.
|
|
... '''
|
|
>>> hierarchy = build_hierarchy(markdown)
|
|
>>> print(hierarchy["sections"][0]["title"])
|
|
Introduction
|
|
>>> chunks = flatten_hierarchy(hierarchy)
|
|
>>> for chunk in chunks:
|
|
... print(f"{chunk['chunk_id']}: {chunk['title']}")
|
|
chunk_00001: Introduction
|
|
chunk_00002: Background
|
|
chunk_00003: Methodology
|
|
|
|
See Also:
|
|
- utils.llm_chunker: Semantic chunking using LLM
|
|
- utils.markdown_builder: Markdown generation from OCR
|
|
- utils.weaviate_ingest: Ingestion of chunks into Weaviate
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from typing import List, Optional, Pattern, TypedDict
|
|
|
|
# Import type definitions from central types module
|
|
from utils.types import (
|
|
DocumentHierarchy,
|
|
FlatChunk,
|
|
HierarchyNode,
|
|
HierarchyPath,
|
|
)
|
|
|
|
|
|
class _BuildNode(TypedDict):
    """Internal, mutable node used while the hierarchy is being built."""

    # Heading text; None only for the virtual root created by build_hierarchy.
    title: Optional[str]
    # Heading depth: 0 for the virtual root, otherwise the number of leading '#'.
    level: int
    # Raw lines collected under this heading; joined into one string at the end.
    content: List[str]
    # Nested headings, appended in document order.
    children: List[_BuildNode]
|
|
|
|
|
|
def build_hierarchy(markdown_text: str) -> DocumentHierarchy:
    """Build a nested section tree from the ATX headings of a Markdown text.

    Lines are scanned in order. A heading line (one to six ``#`` followed by
    whitespace and a title) opens a new section nested under the closest
    preceding heading of a strictly lower level; every other line is attached
    to the section that is currently open.

    Lines located inside a fenced code block (delimited by three or more
    backticks or tildes) are always treated as content, so ``# comment``
    lines in code samples do not create spurious sections. The fence lines
    themselves are kept in the content. An unclosed fence runs to the end of
    the document.

    Args:
        markdown_text: Markdown text to analyse.

    Returns:
        Dictionary with:
            - preamble: Text found before the first heading.
            - sections: List of nested sections that have no parent heading.

        Each section contains:
            - title: Section title.
            - level: Heading level (1-6).
            - content: Section text, stripped of surrounding whitespace.
            - children: Sub-sections.
    """
    # ATX heading: 1-6 '#', mandatory whitespace, then the title.
    heading_re: Pattern[str] = re.compile(r"^(#{1,6})\s+(.*)$")
    # Code fence: up to 3 spaces of indent, a run of >= 3 backticks or tildes,
    # then an optional info string (group 2).
    fence_re: Pattern[str] = re.compile(r"^ {0,3}(`{3,}|~{3,})(.*)$")

    # Virtual root (level 0): collects the preamble and the top sections.
    root: _BuildNode = {
        "title": None,
        "level": 0,
        "content": [],
        "children": [],
    }

    # Stack of currently open sections, outermost first.
    stack: List[_BuildNode] = [root]

    # Marker of the currently open code fence, or None when outside a fence.
    open_fence: Optional[str] = None

    for line in markdown_text.splitlines():
        stripped: str = line.rstrip()
        fence: Optional[re.Match[str]] = fence_re.match(stripped)

        if open_fence is not None:
            # Inside a fence everything is content. The fence closes on a run
            # of the same character, at least as long as the opener, with
            # nothing after it.
            if (
                fence is not None
                and fence.group(1)[0] == open_fence[0]
                and len(fence.group(1)) >= len(open_fence)
                and not fence.group(2).strip()
            ):
                open_fence = None
            stack[-1]["content"].append(stripped)
            continue

        if fence is not None:
            marker: str = fence.group(1)
            # A backtick fence whose info string contains a backtick is
            # inline code, not a fence opener.
            if not (marker[0] == "`" and "`" in fence.group(2)):
                open_fence = marker
                stack[-1]["content"].append(stripped)
                continue

        match: Optional[re.Match[str]] = heading_re.match(stripped)
        if match is None:
            # Plain content: attach it to the innermost open section.
            stack[-1]["content"].append(stripped)
            continue

        level: int = len(match.group(1))
        title: str = match.group(2).strip()

        # Close every open section that is not an ancestor of this heading.
        # The root has level 0 and headings have level >= 1, so the root is
        # never popped and the stack is never empty.
        while stack[-1]["level"] >= level:
            stack.pop()

        node: _BuildNode = {
            "title": title,
            "level": level,
            "content": [],
            "children": [],
        }
        stack[-1]["children"].append(node)
        stack.append(node)

    def finalize(node: _BuildNode) -> HierarchyNode:
        """Convert a build node into a final node with joined content."""
        return {
            "title": node["title"],
            "level": node["level"],
            "content": "\n".join(node["content"]).strip(),
            "children": [finalize(child) for child in node["children"]],
        }

    return {
        "preamble": "\n".join(root["content"]).strip(),
        "sections": [finalize(child) for child in root["children"]],
    }
|
|
|
|
|
|
def flatten_hierarchy(hierarchy: DocumentHierarchy) -> List[FlatChunk]:
    """Turn a section tree into a flat, document-ordered list of chunks.

    A non-empty preamble becomes ``chunk_00000``. Sections are then visited
    depth-first in document order, and each section with non-empty content
    yields one chunk, numbered from ``chunk_00001`` upwards. A section
    without content yields no chunk, but its title still appears in the
    hierarchy path of its descendants.

    Args:
        hierarchy: Hierarchical structure (output of build_hierarchy).

    Returns:
        List of chunks, each carrying its hierarchical context.
    """
    flat: List[FlatChunk] = []

    # The preamble, when present, is always the first chunk.
    preamble_text = hierarchy.get("preamble")
    if preamble_text:
        flat.append(
            {
                "chunk_id": "chunk_00000",
                "text": preamble_text,
                "hierarchy": {
                    "part": None,
                    "chapter": None,
                    "section": None,
                    "subsection": None,
                },
                "type": "preamble",
                "level": 0,
                "title": None,
            }
        )

    # Number given to the next section chunk; 0 is reserved for the preamble.
    next_number: int = 1

    def path_for(
        parent_path: HierarchyPath,
        depth: int,
        heading: Optional[str],
    ) -> HierarchyPath:
        """Return a new path for a heading of ``depth`` under ``parent_path``."""
        if depth == 1:
            # A level-1 heading starts a fresh path.
            return {
                "part": heading,
                "chapter": None,
                "section": None,
                "subsection": None,
            }

        own: HierarchyPath = parent_path.copy()
        if depth == 2:
            own["chapter"] = heading
            own["section"] = None
            own["subsection"] = None
        elif depth == 3:
            own["section"] = heading
            own["subsection"] = None
        elif depth >= 4:
            # Levels 4 to 6 all share the single "subsection" slot.
            own["subsection"] = heading
        return own

    def visit(node: HierarchyNode, parent_path: HierarchyPath) -> None:
        """Emit the chunk of ``node`` (if any), then recurse into its children."""
        nonlocal next_number

        depth: int = node["level"]
        heading: Optional[str] = node["title"]
        own_path: HierarchyPath = path_for(parent_path, depth, heading)

        body = node["content"]
        if body:
            flat.append(
                {
                    "chunk_id": f"chunk_{next_number:05d}",
                    "text": body,
                    # Each chunk gets its own copy of the path.
                    "hierarchy": own_path.copy(),
                    "type": "main_content",
                    "level": depth,
                    "title": heading,
                }
            )
            next_number += 1

        for child in node["children"]:
            visit(child, own_path)

    blank_path: HierarchyPath = {
        "part": None,
        "chapter": None,
        "section": None,
        "subsection": None,
    }
    for top_section in hierarchy.get("sections", []):
        visit(top_section, blank_path)

    return flat
|