- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
304 lines
9.6 KiB
Python
304 lines
9.6 KiB
Python
"""TOC extraction via Markdown indentation analysis.

This module provides a **cost-free** TOC extraction strategy that works on
already-generated Markdown text. Unlike the OCR annotation approach, this
method doesn't require additional API calls.

Strategy:
    1. Search for a "Table des matières" heading in the first N lines
    2. Parse lines matching the pattern "Title.....Page" or "Title Page"
    3. Detect hierarchy from leading whitespace (indentation)
    4. Build the nested TOC structure using a stack-based algorithm

When to Use:
    - When OCR has already been performed (markdown available)
    - When cost optimization is critical (no additional API calls)
    - For documents with clear indentation in the TOC

Limitations:
    - Requires a French "Table des matières" header (can be extended)
    - Indentation detection may be less accurate than visual/bbox analysis
    - Only works if OCR preserved whitespace accurately

Indentation Levels:
    - 0-2 spaces: Level 1 (main chapters/parts)
    - 3-6 spaces: Level 2 (sections)
    - 7+ spaces: Level 3 (subsections)

Output Structure:
    {
        "success": bool,
        "metadata": {},                # Always empty for this extractor
        "toc": [...],                  # Hierarchical TOC
        "toc_flat": [...],             # Flat entries with levels
        "cost_ocr_annotated": 0.0,     # No additional cost
        "method": "markdown_indentation"
    }

Example:
    >>> from utils.toc_extractor_markdown import extract_toc_from_markdown
    >>>
    >>> markdown = '''
    ... # Table des matières
    ... Introduction.............................5
    ... Première partie..........................10
    ...     Chapitre 1............................15
    ...     Chapitre 2............................25
    ... Deuxième partie..........................50
    ... '''
    >>> result = extract_toc_from_markdown(markdown)
    >>> if result["success"]:
    ...     print(f"Found {len(result['toc_flat'])} entries")
    Found 5 entries

Functions:
    - extract_toc_from_markdown(): Main extraction from markdown text
    - build_hierarchy(): Converts flat entries to a nested structure

See Also:
    - utils.toc_extractor: Main entry point (routes to visual by default)
    - utils.toc_extractor_visual: More accurate X-position based extraction
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List, Optional, TypedDict, Union
|
|
from pathlib import Path
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# Type definitions for internal data structures
|
|
class MarkdownTOCEntryRaw(TypedDict):
    """A single TOC line as parsed from markdown, before nesting is applied.

    Instances are produced by ``extract_toc_from_markdown`` and consumed by
    ``build_hierarchy``.
    """

    # Entry text, with the trailing page number (and dotted leaders) removed.
    title: str
    # Page number parsed from the end of the line.
    page_number: int
    # Hierarchy level derived from the indentation (1, 2 or 3).
    level: int
    # Number of leading whitespace characters on the unstripped line.
    leading_spaces: int
|
|
|
|
|
|
class MarkdownTOCNode(TypedDict):
    """A node of the nested TOC tree; sub-entries live in ``children``."""

    # Entry text.
    title: str
    # Page number of the entry (copied from the raw entry's ``page_number``).
    page: int
    # Hierarchy level of the entry; lower numbers are closer to the root.
    level: int
    # Kind of entry; ``build_hierarchy`` always sets this to "section".
    type: str
    # Nested entries, in document order; empty for a leaf.
    children: List[MarkdownTOCNode]
|
|
|
|
|
|
class MarkdownTOCFlatEntry(TypedDict):
    """An item of the flat (non-nested) TOC listing returned to callers."""

    # Entry text.
    title: str
    # Page number parsed from the TOC line.
    page_number: int
    # Hierarchy level derived from the indentation.
    level: int
    # Kind of entry; ``extract_toc_from_markdown`` always emits "section".
    entry_type: str
    # Title of the enclosing entry, or None when there is none / it is not
    # resolved (``extract_toc_from_markdown`` currently always stores None).
    parent_title: Optional[str]
|
|
|
|
|
|
class MarkdownTOCResultSuccess(TypedDict):
    """Shape of the result returned when TOC entries were extracted."""

    # Outcome flag; always True for this shape.
    success: bool
    # Free-form metadata; ``extract_toc_from_markdown`` returns an empty dict.
    metadata: Dict[str, Any]
    # Nested TOC tree (root-level nodes).
    toc: List[MarkdownTOCNode]
    # The same entries as a flat list, in document order.
    toc_flat: List[MarkdownTOCFlatEntry]
    # Extra OCR-annotation cost; this extractor reports 0.0.
    cost_ocr_annotated: float
    # Name of the extraction strategy, e.g. "markdown_indentation".
    method: str
|
|
|
|
|
|
class MarkdownTOCResultError(TypedDict):
    """Shape of the result returned when no TOC could be extracted."""

    # Outcome flag; always False for this shape.
    success: bool
    # Human-readable reason for the failure.
    error: str
|
|
|
|
|
|
# Union type for function return
|
|
# A result is either the success shape or the error shape; both declare a
# "success" key that callers can inspect to tell the two apart.
MarkdownTOCResult = Union[MarkdownTOCResultSuccess, MarkdownTOCResultError]
|
|
|
|
|
|
def extract_toc_from_markdown(
    markdown_text: str,
    max_lines: int = 200,
) -> MarkdownTOCResult:
    """Extract a table of contents by analysing raw markdown text.

    The first ``max_lines`` lines are searched for a "Table des matières"
    heading (case-insensitive, "matieres" without the accent is accepted).
    Up to 100 lines following that heading are then parsed as TOC entries of
    the form ``Title.....Page`` or ``Title Page``. The hierarchy level of each
    entry is derived from the amount of leading whitespace on its line:

    - 0-2 characters: level 1
    - 3-6 characters: level 2
    - 7 or more:      level 3

    Blank lines, markdown headings (``#``) and rules (``---``) are skipped,
    as are candidates whose title is shorter than 3 characters or purely
    numeric. This is a cost-free alternative to OCR annotation-based
    extraction.

    Args:
        markdown_text: Complete markdown text of the document.
        max_lines: Maximum number of lines to analyse, counted from the
            start of the document. Both the heading and the entries must
            fall inside this window.

    Returns:
        On success, a dictionary with:
            - success: True
            - metadata: Empty dict (for consistency with other extractors)
            - toc: Hierarchical nested TOC structure
            - toc_flat: Flat list of entries with levels (``entry_type`` is
              always "section" and ``parent_title`` is always None)
            - cost_ocr_annotated: 0.0 (no additional cost)
            - method: "markdown_indentation"
        On failure (no heading found, or no entry parsed), a dictionary with:
            - success: False
            - error: Error message string

    Example:
        >>> markdown = '''
        ... # Table des matières
        ... Introduction.....5
        ... Part One........10
        ...     Chapter 1.....15
        ... '''
        >>> result = extract_toc_from_markdown(markdown)
        >>> if result["success"]:
        ...     print(len(result["toc_flat"]))
        3
    """
    logger.info("Extraction TOC depuis markdown (analyse indentation)")

    lines: List[str] = markdown_text.split('\n')[:max_lines]

    # Locate the TOC heading; entries are read starting on the line after it.
    heading_pattern: re.Pattern[str] = re.compile(
        r'table\s+des\s+mati[èe]res', re.IGNORECASE
    )
    toc_start: Optional[int] = None
    for i, line in enumerate(lines):
        if heading_pattern.search(line):
            toc_start = i + 1
            logger.info("TOC trouvée à la ligne %s", i)
            break

    if toc_start is None:
        logger.warning("Aucune table des matières trouvée dans le markdown")
        return MarkdownTOCResultError(
            success=False,
            error="Table des matières introuvable"
        )

    # Entry patterns, compiled once outside the loop. Both are applied to the
    # stripped line: "Title.....Page" first, then the looser "Title Page".
    dotted_pattern: re.Pattern[str] = re.compile(r'^(.+?)\s*\.{2,}\s*(\d+)\s*$')
    spaced_pattern: re.Pattern[str] = re.compile(r'^(.+?)\s+(\d+)\s*$')

    entries: List[MarkdownTOCEntryRaw] = []

    for line in lines[toc_start:toc_start + 100]:  # Max 100 lines of TOC
        line_stripped: str = line.strip()
        if not line_stripped or line_stripped.startswith(('#', '---')):
            continue

        # Indentation must be measured on the unstripped line. ``line`` is
        # already that line, so no lookup back into ``lines`` is needed.
        leading_spaces: int = len(line) - len(line.lstrip())

        match: Optional[re.Match[str]] = (
            dotted_pattern.match(line_stripped)
            or spaced_pattern.match(line_stripped)
        )
        if not match:
            continue

        title: str = match.group(1).strip()
        page: int = int(match.group(2))

        # Ignore lines too short or that don't look like titles
        if len(title) < 3 or title.isdigit():
            continue

        # Determine level based on indentation:
        #   0-2 spaces = level 1, 3-6 spaces = level 2, 7+ spaces = level 3
        level: int
        if leading_spaces <= 2:
            level = 1
        elif leading_spaces <= 6:
            level = 2
        else:
            level = 3

        entries.append(MarkdownTOCEntryRaw(
            title=title,
            page_number=page,
            level=level,
            leading_spaces=leading_spaces,
        ))

        logger.debug(
            " '%s' → %s espaces → level %s (page %s)",
            title, leading_spaces, level, page,
        )

    if not entries:
        logger.warning("Aucune entrée TOC extraite")
        return MarkdownTOCResultError(
            success=False,
            error="Aucune entrée TOC trouvée"
        )

    logger.info("✅ %s entrées extraites depuis markdown", len(entries))

    # Build hierarchy
    toc: List[MarkdownTOCNode] = build_hierarchy(entries)

    return MarkdownTOCResultSuccess(
        success=True,
        metadata={},
        toc=toc,
        toc_flat=[
            MarkdownTOCFlatEntry(
                title=e["title"],
                page_number=e["page_number"],
                level=e["level"],
                entry_type="section",
                # Parent titles are not resolved by this extractor.
                parent_title=None,
            )
            for e in entries
        ],
        cost_ocr_annotated=0.0,  # No additional cost, uses existing OCR
        method="markdown_indentation",
    )
|
|
|
|
|
|
def build_hierarchy(entries: List[MarkdownTOCEntryRaw]) -> List[MarkdownTOCNode]:
    """Nest a flat, ordered list of TOC entries according to their levels.

    Entries are processed in order. Each one becomes a child of the closest
    preceding entry whose level is strictly lower; an entry with no such
    predecessor becomes a root node.

    Args:
        entries: Raw TOC entries in document order. Only the ``title``,
            ``page_number`` and ``level`` keys are read.

    Returns:
        The root-level nodes; nested entries are reachable through each
        node's ``children`` list.

    Example:
        >>> entries = [
        ...     {"title": "Part 1", "page_number": 1, "level": 1, "leading_spaces": 0},
        ...     {"title": "Chapter 1", "page_number": 5, "level": 2, "leading_spaces": 4},
        ... ]
        >>> hierarchy = build_hierarchy(entries)
        >>> len(hierarchy[0]["children"])
        1
    """
    roots: List[MarkdownTOCNode] = []
    # Chain of currently "open" ancestors, shallowest first. The last item is
    # the most recently added node that may still receive children.
    ancestors: List[MarkdownTOCNode] = []

    for raw in entries:
        current: MarkdownTOCNode = MarkdownTOCNode(
            title=raw["title"],
            page=raw["page_number"],
            level=raw["level"],
            type="section",
            children=[],
        )
        depth: int = current["level"]

        # Close every open ancestor that is at the same depth or deeper:
        # none of them can be the parent of the current node.
        while ancestors:
            if ancestors[-1]["level"] < depth:
                break
            ancestors.pop()

        # Attach under the nearest remaining ancestor, or at the root.
        siblings: List[MarkdownTOCNode] = (
            ancestors[-1]["children"] if ancestors else roots
        )
        siblings.append(current)

        ancestors.append(current)

    return roots
|