Add Library RAG project and cleanup root directory

- Add complete Library RAG application (Flask + MCP server)
  - PDF processing pipeline with OCR and LLM extraction
  - Weaviate vector database integration (BGE-M3 embeddings)
  - Flask web interface with search and document management
  - MCP server for Claude Desktop integration
  - Comprehensive test suite (134 tests)

- Clean up root directory
  - Remove obsolete documentation files
  - Remove backup and temporary files
  - Update autonomous agent configuration

- Update prompts
  - Enhance initializer bis prompt with better instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-30 11:57:12 +01:00
parent 48470236da
commit d2f7165120
84 changed files with 26517 additions and 2 deletions

View File

@@ -0,0 +1,303 @@
"""TOC extraction via Markdown indentation analysis.
This module provides a **cost-free** TOC extraction strategy that works on
already-generated Markdown text. Unlike the OCR annotation approach, this
method doesn't require additional API calls.
Strategy:
1. Search for "Table des matières" heading in the first N lines
2. Parse lines matching pattern: "Title.....Page" or "Title Page"
3. Detect hierarchy from leading whitespace (indentation)
4. Build nested TOC structure using stack-based algorithm
When to Use:
- When OCR has already been performed (markdown available)
- When cost optimization is critical (no additional API calls)
- For documents with clear indentation in the TOC
Limitations:
- Requires French "Table des matières" header (can be extended)
- Indentation detection may be less accurate than visual/bbox analysis
- Only works if OCR preserved whitespace accurately
Indentation Levels:
- 0-2 spaces: Level 1 (main chapters/parts)
- 3-6 spaces: Level 2 (sections)
- 7+ spaces: Level 3 (subsections)
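Output Structure:
    On success:
        {
            "success": True,
            "metadata": {},              # Always empty (parity with other extractors)
            "toc": [...],                # Hierarchical TOC
            "toc_flat": [...],           # Flat entries with levels
            "cost_ocr_annotated": 0.0,   # No additional cost
            "method": "markdown_indentation"
        }
    On failure:
        {
            "success": False,
            "error": "..."               # Human-readable error message
        }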
Example:
    >>> from utils.toc_extractor_markdown import extract_toc_from_markdown
    >>>
    >>> markdown = '''
    ... # Table des matières
    ... Introduction.............................5
    ... Première partie..........................10
    ...     Chapitre 1............................15
    ...     Chapitre 2............................25
    ... Deuxième partie..........................50
    ... '''
    >>> result = extract_toc_from_markdown(markdown)
    >>> if result["success"]:
    ...     print(f"Found {len(result['toc_flat'])} entries")
    Found 5 entries
Functions:
- extract_toc_from_markdown(): Main extraction from markdown text
- build_hierarchy(): Converts flat entries to nested structure
See Also:
- utils.toc_extractor: Main entry point (routes to visual by default)
- utils.toc_extractor_visual: More accurate X-position based extraction
"""
from __future__ import annotations
import logging
import re
from typing import Any, Dict, List, Optional, TypedDict, Union
from pathlib import Path
logger = logging.getLogger(__name__)
# Type definitions for internal data structures
class MarkdownTOCEntryRaw(TypedDict):
    """Raw TOC entry extracted from one markdown line, with indentation info.

    Built by ``extract_toc_from_markdown`` for every line that matches a
    "title + page number" pattern, and consumed by ``build_hierarchy``.
    """

    # Entry title, stripped of surrounding whitespace and dotted leaders.
    title: str
    # Page number parsed from the digits at the end of the line.
    page_number: int
    # Hierarchy level (1, 2 or 3) derived from ``leading_spaces``.
    level: int
    # Number of leading whitespace characters on the original line.
    leading_spaces: int
class MarkdownTOCNode(TypedDict):
    """Hierarchical TOC node with nested children.

    Produced by ``build_hierarchy``; root nodes form the ``toc`` list of a
    successful extraction result.
    """

    # Entry title.
    title: str
    # Page number (copied from the raw entry's ``page_number``).
    page: int
    # Hierarchy level of this node; used to decide parent/child nesting.
    level: int
    # Node kind; ``build_hierarchy`` always sets this to "section".
    type: str
    # Nodes nested under this one, in document order.
    children: List[MarkdownTOCNode]
class MarkdownTOCFlatEntry(TypedDict):
    """Flat TOC entry (one per parsed line) with a slot for the parent title."""

    # Entry title.
    title: str
    # Page number parsed from the TOC line.
    page_number: int
    # Hierarchy level derived from indentation.
    level: int
    # Entry kind; ``extract_toc_from_markdown`` always sets this to "section".
    entry_type: str
    # Title of the parent entry. NOTE: ``extract_toc_from_markdown`` currently
    # always sets this to None; nesting is only expressed in the ``toc`` tree.
    parent_title: Optional[str]
class MarkdownTOCResultSuccess(TypedDict):
    """Result returned by ``extract_toc_from_markdown`` when entries were found."""

    success: bool  # Always True
    # Extra metadata; the markdown extractor always returns an empty dict.
    metadata: Dict[str, Any]
    # Nested TOC tree (root nodes with their children).
    toc: List[MarkdownTOCNode]
    # Same entries as a flat list, in document order.
    toc_flat: List[MarkdownTOCFlatEntry]
    # Additional OCR annotation cost; 0.0 for this extractor.
    cost_ocr_annotated: float
    # Extraction method identifier; "markdown_indentation" for this extractor.
    method: str
class MarkdownTOCResultError(TypedDict):
    """Result returned by ``extract_toc_from_markdown`` when extraction fails.

    Failure means either no "Table des matières" header was found or no TOC
    entry could be parsed after it.
    """

    success: bool  # Always False
    # Human-readable error message describing why extraction failed.
    error: str
# Return type of extract_toc_from_markdown: either the success payload or the
# error payload. Both carry a boolean "success" key that callers should check
# before reading any other key.
MarkdownTOCResult = Union[MarkdownTOCResultSuccess, MarkdownTOCResultError]
def extract_toc_from_markdown(
    markdown_text: str,
    max_lines: int = 200,
    max_toc_lines: int = 100,
) -> MarkdownTOCResult:
    """Extract a table of contents by analyzing raw markdown text.

    The function looks for a "Table des matières" header, then parses the
    lines that follow it. Each line shaped like ``Title.....Page`` (dotted
    leaders) or ``Title Page`` (whitespace separated) becomes an entry whose
    hierarchy level is derived from its leading whitespace:

    - 0-2 leading characters: level 1
    - 3-6 leading characters: level 2
    - 7+ leading characters: level 3

    This is a cost-free alternative to OCR annotation-based extraction.

    Args:
        markdown_text: Complete markdown text of the document.
        max_lines: Only the first ``max_lines`` lines of the document are
            examined, both for the header search and for entry parsing.
        max_toc_lines: Maximum number of lines scanned for entries after the
            header. Defaults to 100. Negative values are treated as 0.

    Returns:
        On success, a dictionary with:
            - success: True
            - metadata: Empty dict (for consistency with other extractors)
            - toc: Hierarchical nested TOC structure
            - toc_flat: Flat list of entries with levels
            - cost_ocr_annotated: 0.0 (no additional cost)
            - method: "markdown_indentation"
        On failure (no header found, or no entry parsed), a dictionary with:
            - success: False
            - error: Error message string

    Example:
        >>> markdown = '''
        ... # Table des matières
        ... Introduction.....5
        ... Part One........10
        ...     Chapter 1.....15
        ... '''
        >>> result = extract_toc_from_markdown(markdown)
        >>> if result["success"]:
        ...     print(len(result["toc_flat"]))
        3
    """
    logger.info("Extraction TOC depuis markdown (analyse indentation)")

    lines: List[str] = markdown_text.split('\n')[:max_lines]

    # Locate the "Table des matières" header; entries start on the next line.
    header_pattern = re.compile(r'table\s+des\s+mati[èe]res', re.IGNORECASE)
    toc_start: Optional[int] = None
    for i, line in enumerate(lines):
        if header_pattern.search(line):
            toc_start = i + 1
            logger.info(f"TOC trouvée à la ligne {i}")
            break

    if toc_start is None:
        logger.warning("Aucune table des matières trouvée dans le markdown")
        return MarkdownTOCResultError(
            success=False,
            error="Table des matières introuvable"
        )

    # Compiled once, outside the loop. The dotted-leader form is tried first,
    # then the looser "title <whitespace> number" form.
    dotted_pattern = re.compile(r'^(.+?)\s*\.{2,}\s*(\d+)\s*$')
    plain_pattern = re.compile(r'^(.+?)\s+(\d+)\s*$')

    # Clamp so that a negative window cannot turn into a from-the-end slice.
    toc_end: int = toc_start + max(max_toc_lines, 0)

    entries: List[MarkdownTOCEntryRaw] = []
    for line in lines[toc_start:toc_end]:
        line_stripped: str = line.strip()

        # Skip blank lines, markdown headings and horizontal rules.
        if not line_stripped or line_stripped.startswith(('#', '---')):
            continue

        # Indentation must be measured on the unstripped line.
        leading_spaces: int = len(line) - len(line.lstrip())

        match: Optional[re.Match[str]] = (
            dotted_pattern.match(line_stripped)
            or plain_pattern.match(line_stripped)
        )
        if not match:
            continue

        title: str = match.group(1).strip()
        page: int = int(match.group(2))

        # Ignore lines too short or that don't look like titles.
        if len(title) < 3 or title.isdigit():
            continue

        # Map indentation width to a hierarchy level.
        level: int
        if leading_spaces <= 2:
            level = 1
        elif leading_spaces <= 6:
            level = 2
        else:
            level = 3

        entries.append(MarkdownTOCEntryRaw(
            title=title,
            page_number=page,
            level=level,
            leading_spaces=leading_spaces,
        ))
        logger.debug(f" '{title}'{leading_spaces} espaces → level {level} (page {page})")

    if not entries:
        logger.warning("Aucune entrée TOC extraite")
        return MarkdownTOCResultError(
            success=False,
            error="Aucune entrée TOC trouvée"
        )

    logger.info(f"{len(entries)} entrées extraites depuis markdown")

    # Build the nested tree from the flat, level-annotated entries.
    toc: List[MarkdownTOCNode] = build_hierarchy(entries)

    return MarkdownTOCResultSuccess(
        success=True,
        metadata={},
        toc=toc,
        # NOTE: parent_title is left as None for every flat entry; the
        # parent/child relationship is only expressed in ``toc``.
        toc_flat=[
            MarkdownTOCFlatEntry(
                title=e["title"],
                page_number=e["page_number"],
                level=e["level"],
                entry_type="section",
                parent_title=None,
            )
            for e in entries
        ],
        cost_ocr_annotated=0.0,  # No additional cost, uses existing OCR
        method="markdown_indentation",
    )
def build_hierarchy(entries: List[MarkdownTOCEntryRaw]) -> List[MarkdownTOCNode]:
    """Nest flat, level-annotated TOC entries into a tree.

    Entries are processed in order. Each entry becomes a child of the most
    recent preceding entry whose level is strictly lower; if there is no such
    entry, it becomes a root node.

    Args:
        entries: Raw TOC entries carrying ``title``, ``page_number`` and
            ``level`` keys, in document order.

    Returns:
        List of root nodes; every node holds its descendants in ``children``.

    Example:
        >>> entries = [
        ...     {"title": "Part 1", "page_number": 1, "level": 1, "leading_spaces": 0},
        ...     {"title": "Chapter 1", "page_number": 5, "level": 2, "leading_spaces": 4},
        ... ]
        >>> hierarchy = build_hierarchy(entries)
        >>> len(hierarchy[0]["children"])
        1
    """
    roots: List[MarkdownTOCNode] = []
    # Chain of currently "open" nodes, from a root down to the latest node.
    ancestors: List[MarkdownTOCNode] = []

    for raw in entries:
        current: MarkdownTOCNode = {
            "title": raw["title"],
            "page": raw["page_number"],
            "level": raw["level"],
            "type": "section",
            "children": [],
        }
        depth = current["level"]

        # Close every open node that is a sibling of, or deeper than, the
        # current one; what remains on top (if anything) is its parent.
        while ancestors and not ancestors[-1]["level"] < depth:
            ancestors.pop()

        siblings = ancestors[-1]["children"] if ancestors else roots
        siblings.append(current)
        ancestors.append(current)

    return roots