Add Library RAG project and cleanup root directory
- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
267
generations/library_rag/utils/hierarchy_parser.py
Normal file
267
generations/library_rag/utils/hierarchy_parser.py
Normal file
@@ -0,0 +1,267 @@
|
||||
"""Hierarchical Markdown document parser for semantic chunking.
|
||||
|
||||
This module provides utilities for parsing Markdown documents into
|
||||
hierarchical structures based on heading levels (# to ######). It is
|
||||
a key component of the RAG pipeline, enabling:
|
||||
|
||||
1. **Structure Extraction**: Parse Markdown into a tree of sections
|
||||
2. **Context Preservation**: Maintain hierarchical context (part > chapter > section)
|
||||
3. **Semantic Chunking**: Flatten hierarchy into chunks with full path context
|
||||
|
||||
The parser uses a stack-based algorithm to build nested section trees,
|
||||
preserving the document's logical structure for downstream processing.
|
||||
|
||||
Architecture:
|
||||
Input: Raw Markdown text with headings
|
||||
↓
|
||||
build_hierarchy() → DocumentHierarchy (tree structure)
|
||||
↓
|
||||
flatten_hierarchy() → List[FlatChunk] (with hierarchical context)
|
||||
|
||||
TypedDict Definitions:
|
||||
- HierarchyPath: Hierarchical path (part/chapter/section/subsection)
|
||||
- HierarchyNode: Tree node with title, level, content, children
|
||||
- DocumentHierarchy: Complete document structure
|
||||
- FlatChunk: Flattened chunk with context for RAG ingestion
|
||||
|
||||
Algorithm:
|
||||
The build_hierarchy() function uses a stack-based approach:
|
||||
1. Initialize a virtual root node at level 0
|
||||
2. For each line in the document:
|
||||
- If heading: pop stack until parent level found, then push new node
|
||||
- If content: append to current node's content
|
||||
3. Finalize nodes by joining content lines
|
||||
|
||||
Example:
|
||||
>>> markdown = '''
|
||||
... # Introduction
|
||||
... This is the intro.
|
||||
...
|
||||
... ## Background
|
||||
... Some background text.
|
||||
...
|
||||
... ## Methodology
|
||||
... Methods used here.
|
||||
... '''
|
||||
>>> hierarchy = build_hierarchy(markdown)
|
||||
>>> print(hierarchy["sections"][0]["title"])
|
||||
'Introduction'
|
||||
>>> chunks = flatten_hierarchy(hierarchy)
|
||||
>>> for chunk in chunks:
|
||||
... print(f"{chunk['chunk_id']}: {chunk['title']}")
|
||||
chunk_00001: Introduction
|
||||
chunk_00002: Background
|
||||
chunk_00003: Methodology
|
||||
|
||||
See Also:
|
||||
- utils.llm_chunker: Semantic chunking using LLM
|
||||
- utils.markdown_builder: Markdown generation from OCR
|
||||
- utils.weaviate_ingest: Ingestion of chunks into Weaviate
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import List, Optional, Pattern, TypedDict
|
||||
|
||||
# Import type definitions from central types module
|
||||
from utils.types import (
|
||||
DocumentHierarchy,
|
||||
FlatChunk,
|
||||
HierarchyNode,
|
||||
HierarchyPath,
|
||||
)
|
||||
|
||||
|
||||
class _BuildNode(TypedDict):
|
||||
"""Noeud interne pour la construction de la hiérarchie."""
|
||||
|
||||
title: Optional[str]
|
||||
level: int
|
||||
content: List[str]
|
||||
children: List[_BuildNode]
|
||||
|
||||
|
||||
def build_hierarchy(markdown_text: str) -> DocumentHierarchy:
|
||||
"""Construit une structure hiérarchique à partir des titres Markdown.
|
||||
|
||||
Analyse les titres (# à ######) et construit un arbre de sections
|
||||
avec leur contenu textuel.
|
||||
|
||||
Args:
|
||||
markdown_text: Texte Markdown à analyser
|
||||
|
||||
Returns:
|
||||
Dictionnaire avec :
|
||||
- preamble: Texte avant le premier titre
|
||||
- sections: Liste de sections imbriquées
|
||||
|
||||
Chaque section contient :
|
||||
- title: Titre de la section
|
||||
- level: Niveau (1-6)
|
||||
- content: Contenu textuel
|
||||
- children: Sous-sections
|
||||
"""
|
||||
# Regex pour les titres Markdown
|
||||
heading_re: Pattern[str] = re.compile(r"^(#{1,6})\s+(.*)$")
|
||||
|
||||
lines: List[str] = markdown_text.splitlines()
|
||||
|
||||
# Noeud racine (niveau 0, virtuel)
|
||||
root: _BuildNode = {
|
||||
"title": None,
|
||||
"level": 0,
|
||||
"content": [],
|
||||
"children": [],
|
||||
}
|
||||
|
||||
# Pile pour suivre la hiérarchie
|
||||
stack: List[_BuildNode] = [root]
|
||||
|
||||
for line in lines:
|
||||
stripped: str = line.rstrip()
|
||||
match: Optional[re.Match[str]] = heading_re.match(stripped)
|
||||
|
||||
if match:
|
||||
# C'est un titre
|
||||
level: int = len(match.group(1))
|
||||
title: str = match.group(2).strip()
|
||||
|
||||
# Remonter dans la pile jusqu'au parent approprié
|
||||
while stack and stack[-1]["level"] >= level:
|
||||
stack.pop()
|
||||
|
||||
# Créer le nouveau noeud
|
||||
node: _BuildNode = {
|
||||
"title": title,
|
||||
"level": level,
|
||||
"content": [],
|
||||
"children": [],
|
||||
}
|
||||
|
||||
# Ajouter au parent
|
||||
parent: _BuildNode = stack[-1]
|
||||
parent["children"].append(node)
|
||||
|
||||
# Empiler le nouveau noeud
|
||||
stack.append(node)
|
||||
else:
|
||||
# C'est du contenu, l'ajouter au noeud courant
|
||||
stack[-1]["content"].append(stripped)
|
||||
|
||||
# Finaliser les noeuds (joindre le contenu)
|
||||
def finalize(node: _BuildNode) -> HierarchyNode:
|
||||
"""Convertit un noeud de construction en noeud final."""
|
||||
return HierarchyNode(
|
||||
title=node["title"],
|
||||
level=node["level"],
|
||||
content="\n".join(node["content"]).strip(),
|
||||
children=[finalize(child) for child in node["children"]],
|
||||
)
|
||||
|
||||
# Extraire le préambule et les sections
|
||||
preamble: str = "\n".join(root["content"]).strip()
|
||||
sections: List[HierarchyNode] = [finalize(child) for child in root["children"]]
|
||||
|
||||
return DocumentHierarchy(
|
||||
preamble=preamble,
|
||||
sections=sections,
|
||||
)
|
||||
|
||||
|
||||
def flatten_hierarchy(hierarchy: DocumentHierarchy) -> List[FlatChunk]:
|
||||
"""Aplatit la hiérarchie en une liste de chunks.
|
||||
|
||||
Args:
|
||||
hierarchy: Structure hiérarchique (sortie de build_hierarchy)
|
||||
|
||||
Returns:
|
||||
Liste de chunks avec leur contexte hiérarchique
|
||||
"""
|
||||
chunks: List[FlatChunk] = []
|
||||
|
||||
# Préambule comme premier chunk
|
||||
if hierarchy.get("preamble"):
|
||||
preamble_chunk: FlatChunk = {
|
||||
"chunk_id": "chunk_00000",
|
||||
"text": hierarchy["preamble"],
|
||||
"hierarchy": HierarchyPath(
|
||||
part=None,
|
||||
chapter=None,
|
||||
section=None,
|
||||
subsection=None,
|
||||
),
|
||||
"type": "preamble",
|
||||
"level": 0,
|
||||
"title": None,
|
||||
}
|
||||
chunks.append(preamble_chunk)
|
||||
|
||||
def process_section(
|
||||
section: HierarchyNode,
|
||||
path: HierarchyPath,
|
||||
index: int,
|
||||
) -> int:
|
||||
"""Traite récursivement une section.
|
||||
|
||||
Args:
|
||||
section: Noeud de section à traiter
|
||||
path: Chemin hiérarchique courant
|
||||
index: Index du prochain chunk
|
||||
|
||||
Returns:
|
||||
Nouvel index après traitement
|
||||
"""
|
||||
level: int = section["level"]
|
||||
title: Optional[str] = section["title"]
|
||||
|
||||
# Mettre à jour le chemin hiérarchique
|
||||
current_path: HierarchyPath = path.copy()
|
||||
if level == 1:
|
||||
current_path = HierarchyPath(
|
||||
part=title,
|
||||
chapter=None,
|
||||
section=None,
|
||||
subsection=None,
|
||||
)
|
||||
elif level == 2:
|
||||
current_path["chapter"] = title
|
||||
current_path["section"] = None
|
||||
current_path["subsection"] = None
|
||||
elif level == 3:
|
||||
current_path["section"] = title
|
||||
current_path["subsection"] = None
|
||||
elif level >= 4:
|
||||
current_path["subsection"] = title
|
||||
|
||||
# Créer le chunk si contenu
|
||||
if section["content"]:
|
||||
chunk: FlatChunk = {
|
||||
"chunk_id": f"chunk_{index:05d}",
|
||||
"text": section["content"],
|
||||
"hierarchy": current_path.copy(),
|
||||
"type": "main_content",
|
||||
"level": level,
|
||||
"title": title,
|
||||
}
|
||||
chunks.append(chunk)
|
||||
index += 1
|
||||
|
||||
# Traiter les enfants
|
||||
for child in section["children"]:
|
||||
index = process_section(child, current_path, index)
|
||||
|
||||
return index
|
||||
|
||||
# Traiter toutes les sections
|
||||
idx: int = 1
|
||||
initial_path: HierarchyPath = HierarchyPath(
|
||||
part=None,
|
||||
chapter=None,
|
||||
section=None,
|
||||
subsection=None,
|
||||
)
|
||||
for section in hierarchy.get("sections", []):
|
||||
idx = process_section(section, initial_path, idx)
|
||||
|
||||
return chunks
|
||||
Reference in New Issue
Block a user