Add Library RAG project and cleanup root directory
- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
303
generations/library_rag/utils/toc_extractor_markdown.py
Normal file
303
generations/library_rag/utils/toc_extractor_markdown.py
Normal file
@@ -0,0 +1,303 @@
|
||||
"""TOC extraction via Markdown indentation analysis.
|
||||
|
||||
This module provides a **cost-free** TOC extraction strategy that works on
|
||||
already-generated Markdown text. Unlike the OCR annotation approach, this
|
||||
method doesn't require additional API calls.
|
||||
|
||||
Strategy:
|
||||
1. Search for "Table des matières" heading in the first N lines
|
||||
2. Parse lines matching pattern: "Title.....Page" or "Title Page"
|
||||
3. Detect hierarchy from leading whitespace (indentation)
|
||||
4. Build nested TOC structure using stack-based algorithm
|
||||
|
||||
When to Use:
|
||||
- When OCR has already been performed (markdown available)
|
||||
- When cost optimization is critical (no additional API calls)
|
||||
- For documents with clear indentation in the TOC
|
||||
|
||||
Limitations:
|
||||
- Requires French "Table des matières" header (can be extended)
|
||||
- Indentation detection may be less accurate than visual/bbox analysis
|
||||
- Only works if OCR preserved whitespace accurately
|
||||
|
||||
Indentation Levels:
|
||||
- 0-2 spaces: Level 1 (main chapters/parts)
|
||||
- 3-6 spaces: Level 2 (sections)
|
||||
- 7+ spaces: Level 3 (subsections)
|
||||
|
||||
Output Structure:
|
||||
{
|
||||
"success": bool,
|
||||
"toc": [...], # Hierarchical TOC
|
||||
"toc_flat": [...], # Flat entries with levels
|
||||
"cost_ocr_annotated": 0.0, # No additional cost
|
||||
"method": "markdown_indentation"
|
||||
}
|
||||
|
||||
Example:
|
||||
>>> from utils.toc_extractor_markdown import extract_toc_from_markdown
|
||||
>>>
|
||||
>>> markdown = '''
|
||||
... # Table des matières
|
||||
... Introduction.............................5
|
||||
... Première partie..........................10
|
||||
... Chapitre 1............................15
|
||||
... Chapitre 2............................25
|
||||
... Deuxième partie..........................50
|
||||
... '''
|
||||
>>> result = extract_toc_from_markdown(markdown)
|
||||
>>> if result["success"]:
|
||||
... print(f"Found {len(result['toc_flat'])} entries")
|
||||
Found 5 entries
|
||||
|
||||
Functions:
|
||||
- extract_toc_from_markdown(): Main extraction from markdown text
|
||||
- build_hierarchy(): Converts flat entries to nested structure
|
||||
|
||||
See Also:
|
||||
- utils.toc_extractor: Main entry point (routes to visual by default)
|
||||
- utils.toc_extractor_visual: More accurate X-position based extraction
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional, TypedDict, Union
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Type definitions for internal data structures
|
||||
class MarkdownTOCEntryRaw(TypedDict):
|
||||
"""Raw TOC entry extracted from markdown with indentation info."""
|
||||
title: str
|
||||
page_number: int
|
||||
level: int
|
||||
leading_spaces: int
|
||||
|
||||
|
||||
class MarkdownTOCNode(TypedDict):
|
||||
"""Hierarchical TOC node with children."""
|
||||
title: str
|
||||
page: int
|
||||
level: int
|
||||
type: str
|
||||
children: List[MarkdownTOCNode]
|
||||
|
||||
|
||||
class MarkdownTOCFlatEntry(TypedDict):
|
||||
"""Flat TOC entry with parent information."""
|
||||
title: str
|
||||
page_number: int
|
||||
level: int
|
||||
entry_type: str
|
||||
parent_title: Optional[str]
|
||||
|
||||
|
||||
class MarkdownTOCResultSuccess(TypedDict):
|
||||
"""Successful TOC extraction result."""
|
||||
success: bool # Always True
|
||||
metadata: Dict[str, Any]
|
||||
toc: List[MarkdownTOCNode]
|
||||
toc_flat: List[MarkdownTOCFlatEntry]
|
||||
cost_ocr_annotated: float
|
||||
method: str
|
||||
|
||||
|
||||
class MarkdownTOCResultError(TypedDict):
|
||||
"""Failed TOC extraction result."""
|
||||
success: bool # Always False
|
||||
error: str
|
||||
|
||||
|
||||
# Union type for function return
|
||||
MarkdownTOCResult = Union[MarkdownTOCResultSuccess, MarkdownTOCResultError]
|
||||
|
||||
|
||||
def extract_toc_from_markdown(
|
||||
markdown_text: str,
|
||||
max_lines: int = 200,
|
||||
) -> MarkdownTOCResult:
|
||||
"""Extract table of contents by analyzing raw markdown text.
|
||||
|
||||
Detects hierarchy by counting leading spaces (indentation) at the
|
||||
beginning of each line. This is a cost-free alternative to OCR
|
||||
annotation-based extraction.
|
||||
|
||||
Args:
|
||||
markdown_text: Complete markdown text of the document.
|
||||
max_lines: Maximum number of lines to analyze (searches TOC at start).
|
||||
|
||||
Returns:
|
||||
Dictionary with hierarchical TOC structure. On success, includes:
|
||||
- success: True
|
||||
- metadata: Empty dict (for consistency with other extractors)
|
||||
- toc: Hierarchical nested TOC structure
|
||||
- toc_flat: Flat list of entries with levels
|
||||
- cost_ocr_annotated: 0.0 (no additional cost)
|
||||
- method: "markdown_indentation"
|
||||
On failure, includes:
|
||||
- success: False
|
||||
- error: Error message string
|
||||
|
||||
Example:
|
||||
>>> markdown = '''
|
||||
... # Table des matières
|
||||
... Introduction.....5
|
||||
... Part One........10
|
||||
... Chapter 1.....15
|
||||
... '''
|
||||
>>> result = extract_toc_from_markdown(markdown)
|
||||
>>> if result["success"]:
|
||||
... print(len(result["toc_flat"]))
|
||||
3
|
||||
"""
|
||||
logger.info("Extraction TOC depuis markdown (analyse indentation)")
|
||||
|
||||
lines: List[str] = markdown_text.split('\n')[:max_lines]
|
||||
|
||||
# Find "Table des matières" section
|
||||
toc_start: Optional[int] = None
|
||||
for i, line in enumerate(lines):
|
||||
if re.search(r'table\s+des\s+mati[èe]res', line, re.IGNORECASE):
|
||||
toc_start = i + 1
|
||||
logger.info(f"TOC trouvée à la ligne {i}")
|
||||
break
|
||||
|
||||
if toc_start is None:
|
||||
logger.warning("Aucune table des matières trouvée dans le markdown")
|
||||
return MarkdownTOCResultError(
|
||||
success=False,
|
||||
error="Table des matières introuvable"
|
||||
)
|
||||
|
||||
# Extract TOC entries
|
||||
entries: List[MarkdownTOCEntryRaw] = []
|
||||
toc_pattern: re.Pattern[str] = re.compile(r'^(\s*)(.+?)\s*\.+\s*(\d+)\s*$')
|
||||
|
||||
for line in lines[toc_start:toc_start + 100]: # Max 100 lines of TOC
|
||||
line_stripped: str = line.strip()
|
||||
if not line_stripped or line_stripped.startswith('#') or line_stripped.startswith('---'):
|
||||
continue
|
||||
|
||||
# Search for pattern "Title.....Page"
|
||||
# Must analyze line BEFORE strip() to count leading spaces
|
||||
original_line: str = lines[lines.index(line) if line in lines else 0]
|
||||
leading_spaces: int = len(original_line) - len(original_line.lstrip())
|
||||
|
||||
# Alternative pattern: search for title + number at end
|
||||
match: Optional[re.Match[str]] = re.match(r'^(.+?)\s*\.{2,}\s*(\d+)\s*$', line_stripped)
|
||||
if not match:
|
||||
# Try without dotted leaders
|
||||
match = re.match(r'^(.+?)\s+(\d+)\s*$', line_stripped)
|
||||
|
||||
if match:
|
||||
title: str = match.group(1).strip()
|
||||
page: int = int(match.group(2))
|
||||
|
||||
# Ignore lines too short or that don't look like titles
|
||||
if len(title) < 3 or title.isdigit():
|
||||
continue
|
||||
|
||||
# Determine level based on indentation
|
||||
# 0-2 spaces = level 1
|
||||
# 3-6 spaces = level 2
|
||||
# 7+ spaces = level 3
|
||||
level: int
|
||||
if leading_spaces <= 2:
|
||||
level = 1
|
||||
elif leading_spaces <= 6:
|
||||
level = 2
|
||||
else:
|
||||
level = 3
|
||||
|
||||
entries.append(MarkdownTOCEntryRaw(
|
||||
title=title,
|
||||
page_number=page,
|
||||
level=level,
|
||||
leading_spaces=leading_spaces,
|
||||
))
|
||||
|
||||
logger.debug(f" '{title}' → {leading_spaces} espaces → level {level} (page {page})")
|
||||
|
||||
if not entries:
|
||||
logger.warning("Aucune entrée TOC extraite")
|
||||
return MarkdownTOCResultError(
|
||||
success=False,
|
||||
error="Aucune entrée TOC trouvée"
|
||||
)
|
||||
|
||||
logger.info(f"✅ {len(entries)} entrées extraites depuis markdown")
|
||||
|
||||
# Build hierarchy
|
||||
toc: List[MarkdownTOCNode] = build_hierarchy(entries)
|
||||
|
||||
return MarkdownTOCResultSuccess(
|
||||
success=True,
|
||||
metadata={},
|
||||
toc=toc,
|
||||
toc_flat=[
|
||||
MarkdownTOCFlatEntry(
|
||||
title=e["title"],
|
||||
page_number=e["page_number"],
|
||||
level=e["level"],
|
||||
entry_type="section",
|
||||
parent_title=None,
|
||||
)
|
||||
for e in entries
|
||||
],
|
||||
cost_ocr_annotated=0.0, # No additional cost, uses existing OCR
|
||||
method="markdown_indentation",
|
||||
)
|
||||
|
||||
|
||||
def build_hierarchy(entries: List[MarkdownTOCEntryRaw]) -> List[MarkdownTOCNode]:
|
||||
"""Build hierarchical structure from flat entries based on levels.
|
||||
|
||||
Uses a stack-based algorithm to construct nested TOC structure where
|
||||
entries with higher indentation become children of the previous
|
||||
less-indented entry.
|
||||
|
||||
Args:
|
||||
entries: List of raw TOC entries with title, page, and level.
|
||||
|
||||
Returns:
|
||||
Nested list of TOC nodes where each node contains children.
|
||||
|
||||
Example:
|
||||
>>> entries = [
|
||||
... {"title": "Part 1", "page_number": 1, "level": 1, "leading_spaces": 0},
|
||||
... {"title": "Chapter 1", "page_number": 5, "level": 2, "leading_spaces": 4},
|
||||
... ]
|
||||
>>> hierarchy = build_hierarchy(entries)
|
||||
>>> len(hierarchy[0]["children"])
|
||||
1
|
||||
"""
|
||||
toc: List[MarkdownTOCNode] = []
|
||||
stack: List[MarkdownTOCNode] = []
|
||||
|
||||
for entry in entries:
|
||||
node: MarkdownTOCNode = MarkdownTOCNode(
|
||||
title=entry["title"],
|
||||
page=entry["page_number"],
|
||||
level=entry["level"],
|
||||
type="section",
|
||||
children=[],
|
||||
)
|
||||
|
||||
# Pop from stack until we find a parent at lower level
|
||||
while stack and stack[-1]["level"] >= node["level"]:
|
||||
stack.pop()
|
||||
|
||||
if stack:
|
||||
# Add as child to top of stack
|
||||
stack[-1]["children"].append(node)
|
||||
else:
|
||||
# Add as root-level entry
|
||||
toc.append(node)
|
||||
|
||||
stack.append(node)
|
||||
|
||||
return toc
|
||||
Reference in New Issue
Block a user