Add Library RAG project and cleanup root directory
- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
512
generations/library_rag/utils/toc_extractor_visual.py
Normal file
512
generations/library_rag/utils/toc_extractor_visual.py
Normal file
@@ -0,0 +1,512 @@
|
||||
"""Visual TOC extraction using bounding box X-coordinate analysis.
|
||||
|
||||
This module provides the **most accurate** TOC extraction strategy for
|
||||
philosophical texts by analyzing the horizontal position (X-coordinate)
|
||||
of each TOC entry. This approach is more reliable than text indentation
|
||||
analysis because it directly measures visual layout.
|
||||
|
||||
How It Works:
|
||||
1. OCR with annotations extracts text + bounding box positions
|
||||
2. Pydantic schema (TocEntryBbox) captures title, page, and x_position
|
||||
3. X-coordinates are clustered to identify distinct indentation levels
|
||||
4. Hierarchy is built based on relative X-positions
|
||||
|
||||
X-Position Interpretation:
|
||||
The x_position is normalized between 0.0 (left edge) and 1.0 (right edge):
|
||||
|
||||
- x ≈ 0.05-0.12: Level 1 (no indentation, main parts/chapters)
|
||||
- x ≈ 0.13-0.22: Level 2 (small indentation, sections)
|
||||
- x ≈ 0.23-0.35: Level 3 (double indentation, subsections)
|
||||
|
||||
Positions within 0.03 tolerance are grouped into the same level.
|
||||
|
||||
Advantages over Markdown Analysis:
|
||||
- Works regardless of OCR whitespace accuracy
|
||||
- More reliable for complex hierarchies
|
||||
- Handles both printed and handwritten indentation
|
||||
|
||||
Cost:
|
||||
- Uses OCR with annotations: ~0.003€/page
|
||||
- Only processes first N pages (default: 8)
|
||||
|
||||
Pydantic Schemas:
|
||||
- TocEntryBbox: Single TOC entry with text, page_number, x_position
|
||||
- DocumentTocBbox: Container for list of entries
|
||||
|
||||
Output Structure:
|
||||
{
|
||||
"success": bool,
|
||||
"metadata": {...},
|
||||
"toc": [...], # Hierarchical TOC
|
||||
"toc_flat": [...], # Flat entries with levels
|
||||
"cost_ocr_annotated": float,
|
||||
"method": "visual_x_position"
|
||||
}
|
||||
|
||||
Example:
|
||||
>>> from pathlib import Path
|
||||
>>> from utils.toc_extractor_visual import extract_toc_with_visual_analysis
|
||||
>>>
|
||||
>>> result = extract_toc_with_visual_analysis(
|
||||
... pdf_path=Path("input/philosophy_book.pdf"),
|
||||
... max_toc_pages=8
|
||||
... )
|
||||
>>> if result["success"]:
|
||||
...     for entry in result["toc_flat"]:
...         indent = "  " * (entry["level"] - 1)
...         print(f"{indent}{entry['title']} (p.{entry['page_number']})")
|
||||
|
||||
Algorithm Details:
|
||||
1. Collect all x_position values from OCR response
|
||||
2. Sort and cluster positions (tolerance: 0.03)
|
||||
3. Compute cluster centroids as level thresholds
|
||||
4. Assign level to each entry based on nearest centroid
|
||||
5. Build hierarchy using stack-based approach
|
||||
|
||||
Functions:
|
||||
- extract_toc_with_visual_analysis(): Main extraction function
|
||||
- build_hierarchy_from_bbox(): Converts entries with X-positions to hierarchy
|
||||
- flatten_toc(): Flattens hierarchical TOC for storage
|
||||
|
||||
See Also:
|
||||
- utils.toc_extractor: Main entry point (routes here by default)
|
||||
- utils.toc_extractor_markdown: Alternative cost-free extraction
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Type, TypedDict, Union
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from .mistral_client import create_client
|
||||
from .ocr_processor import run_ocr_with_annotations
|
||||
|
||||
# Module-level logger, named after this module via __name__.
logger: logging.Logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TocEntryBbox(BaseModel):
    """TOC entry with bounding box for visual detection.

    The ``Field`` descriptions are runtime schema text (this model is nested
    in ``DocumentTocBbox``, which is passed as the annotation format to the
    OCR call), so their original French wording is kept verbatim.

    Attributes:
        text: Complete entry text as it appears in the table of contents.
            Example: 'Presentation' or 'What is virtue?' or 'Meno or on virtue'.
            DO NOT include leader dots or page number in this field.
        page_number: Actual page number as printed in the book (the visible
            number on the right in the TOC). Example: if the line says
            'Presentation.....3', extract the number 3. This is the BOOK page
            number, not the PDF index.
        x_position: Horizontal position (X coordinate) of the text start,
            normalized between 0 and 1. This is the CRUCIAL COORDINATE for
            detecting indentation:
            - x ≈ 0.05-0.12 = left-aligned title, NOT indented (level 1)
            - x ≈ 0.13-0.22 = title with SMALL indentation (level 2)
            - x ≈ 0.23-0.35 = title with DOUBLE indentation (level 3)
            Measure precisely where the first character of the title begins.
    """

    # NOTE(review): the descriptions are multi-line strings; any indentation
    # the original continuation lines carried is not recoverable, so the
    # lines are joined with bare newlines.
    text: str = Field(
        ...,
        description=(
            "Texte COMPLET de l'entrée tel qu'il apparaît dans la table des matières.\n"
            "Exemple: 'Présentation' ou 'Qu'est-ce que la vertu ?' ou 'Ménon ou de la vertu'.\n"
            "NE PAS inclure les points de suite ni le numéro de page dans ce champ."
        ),
    )
    page_number: int = Field(
        ...,
        description=(
            "Numéro de page réel tel qu'imprimé dans le livre (le numéro visible à droite dans la TOC).\n"
            "Exemple: si la ligne dit 'Présentation.....3', extraire le nombre 3.\n"
            "C'est le numéro de page du LIVRE, pas l'index PDF."
        ),
    )
    x_position: float = Field(
        ...,
        description=(
            "Position horizontale (coordonnée X) du début du texte, normalisée entre 0 et 1.\n"
            "C'est LA COORDONNÉE CRUCIALE pour détecter l'indentation:\n"
            "- x ≈ 0.05-0.12 = titre aligné à gauche, NON indenté (niveau hiérarchique 1)\n"
            "- x ≈ 0.13-0.22 = titre avec PETITE indentation (niveau hiérarchique 2)\n"
            "- x ≈ 0.23-0.35 = titre avec DOUBLE indentation (niveau hiérarchique 3)\n"
            "Mesurer précisément où commence le premier caractère du titre."
        ),
    )
|
||||
|
||||
|
||||
class DocumentTocBbox(BaseModel):
    """Schema for extracting all TOC entries with their positions.

    Attributes:
        entries: Complete list of ALL entries found in the table of contents.
            For EACH line in the TOC, extract:
            1. The title text (without leader dots)
            2. The page number (the number on the right)
            3. The exact horizontal X position of the title start
               (to detect indentation)

            Include ALL entries, even those that appear to be at the same
            visual level.
    """

    # The description is runtime schema text; it is reproduced line for line,
    # including the blank line before the final sentence.
    entries: List[TocEntryBbox] = Field(
        ...,
        description=(
            "Complete list of ALL entries found in the table of contents.\n"
            "For EACH line in the TOC, extract:\n"
            "1. The title text (without leader dots)\n"
            "2. The page number (the number on the right)\n"
            "3. The exact horizontal X position of the title start (to detect indentation)\n"
            "\n"
            "Include ALL entries, even those that appear to be at the same visual level."
        ),
    )
|
||||
|
||||
|
||||
# TypedDict classes for structured return types
|
||||
class VisualTOCMetadata(TypedDict):
    """Metadata extracted from the document.

    Attributes:
        title: Document title.
        author: Document author.
        languages: List of languages present in the document.
        summary: Brief document summary.
    """

    title: str
    author: str
    languages: List[str]
    summary: str
|
||||
|
||||
|
||||
class VisualTOCNode(TypedDict):
    """Hierarchical TOC node.

    Attributes:
        title: Entry title text.
        page: Page number in the book.
        level: Hierarchical level (1 = top level, 2 = subsection, etc.).
        type: Entry type (e.g., "section", "chapter").
        children: List of child nodes.
    """

    title: str
    page: int
    level: int
    type: str
    # Self-reference; relies on postponed evaluation of annotations
    # (``from __future__ import annotations`` at the top of the module).
    children: List[VisualTOCNode]
|
||||
|
||||
|
||||
class VisualTOCFlatEntry(TypedDict):
    """Flattened TOC entry for storage.

    Attributes:
        title: Entry title text.
        page_number: Page number in the book.
        level: Hierarchical level.
        entry_type: Entry type (e.g., "section", "chapter").
        parent_title: Title of the parent entry, if any.
    """

    title: str
    page_number: int
    level: int
    entry_type: str
    parent_title: Optional[str]
|
||||
|
||||
|
||||
class VisualTOCResultSuccess(TypedDict):
    """Successful TOC extraction result.

    Attributes:
        success: Always True for success case.
        metadata: Document metadata.
        toc: Hierarchical TOC structure.
        toc_flat: Flattened TOC entries.
        cost_ocr_annotated: OCR processing cost in euros.
        method: Extraction method identifier.
    """

    success: bool
    metadata: VisualTOCMetadata
    toc: List[VisualTOCNode]
    toc_flat: List[VisualTOCFlatEntry]
    cost_ocr_annotated: float
    method: str
|
||||
|
||||
|
||||
class VisualTOCResultError(TypedDict):
    """Failed TOC extraction result.

    Attributes:
        success: Always False for error case.
        error: Error message describing the failure.
    """

    success: bool
    error: str
|
||||
|
||||
|
||||
# Return type of extract_toc_with_visual_analysis(): check the "success" key
# to tell which of the two shapes was returned.
VisualTOCResult = Union[VisualTOCResultSuccess, VisualTOCResultError]
|
||||
|
||||
|
||||
class VisualTOCEntryInternal(TypedDict):
    """Internal representation of TOC entry during processing.

    Attributes:
        text: Entry title text.
        page_number: Page number in the book.
        x_position: Normalized X position (0.0 to 1.0).
        x_start: Same as x_position (for processing).
        page: Same as page_number (for processing).
        level: Computed hierarchical level.
    """

    text: str
    page_number: int
    x_position: float
    x_start: float
    page: int
    level: int
|
||||
|
||||
|
||||
def extract_toc_with_visual_analysis(
    pdf_path: Path,
    api_key: Optional[str] = None,
    max_toc_pages: int = 8,
) -> VisualTOCResult:
    """Extract TOC by visually analyzing bounding boxes.

    Detects hierarchy from horizontal alignment (X coordinate). This method
    uses OCR with annotations to extract the X-coordinate of each TOC entry,
    then clusters these positions to identify indentation levels.

    Args:
        pdf_path: Path to the PDF file.
        api_key: Mistral API key (optional; passed as-is to ``create_client``).
        max_toc_pages: Number of pages to analyze (default: 8).

    Returns:
        Dictionary containing either:
        - Success: ``success=True`` plus metadata, hierarchical TOC, flat TOC,
          cost, and method.
        - Error: ``success=False`` and an ``error`` message.

    Note:
        Exceptions raised while creating the client, reading the file,
        calling OCR, or parsing the annotation are caught, logged, and
        reported through the error result instead of propagating.

    Example:
        >>> from pathlib import Path
        >>> result = extract_toc_with_visual_analysis(Path("book.pdf"))
        >>> if result["success"]:
        ...     print(f"Extracted {len(result['toc'])} top-level entries")
        ... else:
        ...     print(f"Error: {result['error']}")
    """
    try:
        client = create_client(api_key)
        pdf_bytes: bytes = pdf_path.read_bytes()
    except Exception as e:
        logger.error(f"Initialization error: {e}")
        return {"success": False, "error": str(e)}

    logger.info(f"Visual TOC extraction on {max_toc_pages} pages")

    # Call OCR with document_annotation_format for global structure.
    try:
        response = run_ocr_with_annotations(
            client=client,
            file_bytes=pdf_bytes,
            filename=pdf_path.name,
            include_images=False,
            document_annotation_format=DocumentTocBbox,
            # Indices 0 .. max_toc_pages-1; presumably zero-based page
            # indices for the OCR helper -- TODO confirm.
            pages=list(range(max_toc_pages)),
        )
    except Exception as e:
        logger.error(f"OCR with annotations error: {e}")
        return {"success": False, "error": f"OCR failed: {str(e)}"}

    # The annotation may be absent on the response object.
    doc_annotation: Any = getattr(response, "document_annotation", None)
    if not doc_annotation:
        return {"success": False, "error": "No annotation returned"}

    try:
        # The annotation arrives either as a JSON string or already decoded.
        toc_data: Any = (
            json.loads(doc_annotation)
            if isinstance(doc_annotation, str)
            else doc_annotation
        )

        # A dict wraps the list under "entries" (a null value is treated as
        # empty); anything else is taken to be the entry list itself.
        entries_data: List[Dict[str, Any]] = (
            (toc_data.get("entries") or []) if isinstance(toc_data, dict) else toc_data
        )

        # Build hierarchy from X coordinates, then flatten it once.
        toc_entries: List[VisualTOCNode] = build_hierarchy_from_bbox(entries_data)
        toc_flat: List[VisualTOCFlatEntry] = flatten_toc(toc_entries)

        # len(toc_entries) only counts top-level nodes, so report the
        # flattened total as the entry count.
        logger.info(
            f"TOC extracted visually: {len(toc_flat)} entries "
            f"({len(toc_entries)} top-level)"
        )

        # Basic metadata (no enriched metadata in visual mode).
        metadata: VisualTOCMetadata = {
            "title": pdf_path.stem,
            "author": "Unknown author",
            "languages": [],
            "summary": "",
        }

        result: VisualTOCResultSuccess = {
            "success": True,
            "metadata": metadata,
            "toc": toc_entries,
            "toc_flat": toc_flat,
            # Estimate from the requested page count at 0.003 per page; it
            # is not read back from the OCR response.
            "cost_ocr_annotated": max_toc_pages * 0.003,
            "method": "visual_x_position",
        }
        return result
    except Exception as e:
        logger.error(f"Bbox parsing error: {e}")
        return {"success": False, "error": f"Parsing failed: {str(e)}"}
|
||||
|
||||
|
||||
def build_hierarchy_from_bbox(
    entries: List[Dict[str, Any]],
    tolerance: float = 0.03,
) -> List[VisualTOCNode]:
    """Build TOC hierarchy from X positions (indentation).

    Distinct X positions are sorted and grouped: a position joins the current
    group when it lies less than ``tolerance`` to the right of the group's
    last member. Each group's mean becomes one indentation level (leftmost =
    level 1). Every entry gets the level whose mean is nearest to its X
    position, and the tree is then assembled with a stack of open ancestors.

    The input dictionaries are not modified.

    Args:
        entries: List of entries. Each entry may provide:
            - text: Entry title (missing or None is treated as "").
            - page_number: Page number (missing or None is treated as 0).
            - x_position: Normalized X coordinate, 0.0 to 1.0 (missing or
              None is treated as 0.1).
        tolerance: Maximum gap between neighbouring X positions that still
            counts as the same indentation level (default: 0.03).

    Returns:
        Hierarchical TOC structure as a list of top-level nodes. Each node
        contains:
            - title: Entry title, stripped of surrounding whitespace
            - page: Page number
            - level: Hierarchical level (1, 2, 3, ...)
            - type: Entry type (always "section")
            - children: List of child nodes

    Example:
        >>> entries = [
        ...     {"text": "Chapter 1", "page_number": 1, "x_position": 0.1},
        ...     {"text": "Section 1.1", "page_number": 2, "x_position": 0.2},
        ... ]
        >>> hierarchy = build_hierarchy_from_bbox(entries)
        >>> hierarchy[0]["children"][0]["title"]
        'Section 1.1'
    """
    if not entries:
        return []

    # Normalize into fresh internal records so the caller's dicts are never
    # written to; a missing key and an explicit None both use the default.
    entry_list: List[VisualTOCEntryInternal] = []
    for entry in entries:
        raw_x = entry.get("x_position")
        raw_page = entry.get("page_number")
        x_start: float = 0.1 if raw_x is None else raw_x
        page_num: int = 0 if raw_page is None else raw_page
        entry_list.append(
            {
                "text": entry.get("text") or "",
                "page_number": page_num,
                "x_position": x_start,
                "x_start": x_start,
                "page": page_num,
                "level": 0,  # filled in once the indentation levels are known
            }
        )

    # Distinct X positions, left to right. Never empty at this point because
    # entry_list holds at least one record.
    x_positions: List[float] = sorted({e["x_start"] for e in entry_list})

    # Group nearby positions to absorb small measurement variations; each
    # finished group contributes its mean as one indentation level.
    x_levels: List[float] = []
    current_group: List[float] = [x_positions[0]]
    for x in x_positions[1:]:
        if x - current_group[-1] < tolerance:
            current_group.append(x)
        else:
            x_levels.append(sum(current_group) / len(current_group))
            current_group = [x]
    x_levels.append(sum(current_group) / len(current_group))

    logger.info(
        f"Indentation levels detected (X positions): {[f'{x:.3f}' for x in x_levels]}"
    )

    toc: List[VisualTOCNode] = []
    # Chain of currently open ancestors, outermost first.
    stack: List[VisualTOCNode] = []

    for entry_item in entry_list:
        x_val: float = entry_item["x_start"]
        # Level = 1-based index of the closest level mean.
        level: int = min(range(len(x_levels)), key=lambda i: abs(x_levels[i] - x_val)) + 1
        entry_item["level"] = level
        logger.debug(f" '{entry_item.get('text', '')}' -> X={x_val:.3f} -> level {level}")

        node: VisualTOCNode = {
            "title": entry_item["text"].strip(),
            "page": entry_item["page"],
            "level": level,
            "type": "section",
            "children": [],
        }

        # Close every open node that is at the same depth or deeper; what
        # remains on top (if anything) is this node's parent.
        while stack and stack[-1]["level"] >= node["level"]:
            stack.pop()

        if stack:
            stack[-1]["children"].append(node)
        else:
            toc.append(node)

        stack.append(node)

    return toc
|
||||
|
||||
|
||||
def flatten_toc(toc: List[VisualTOCNode]) -> List[VisualTOCFlatEntry]:
    """Flatten a hierarchical TOC.

    Converts a nested TOC structure into a flat list of entries, preserving
    parent-child relationships through the parent_title field. Entries are
    emitted depth-first: each node is followed by its descendants before its
    next sibling.

    Args:
        toc: Hierarchical TOC structure (list of VisualTOCNode).

    Returns:
        Flat list of TOC entries with parent references. Top-level entries
        have ``parent_title`` set to None.

    Example:
        >>> toc = [{
        ...     "title": "Chapter 1",
        ...     "page": 1,
        ...     "level": 1,
        ...     "type": "section",
        ...     "children": [{
        ...         "title": "Section 1.1",
        ...         "page": 2,
        ...         "level": 2,
        ...         "type": "section",
        ...         "children": []
        ...     }]
        ... }]
        >>> flat = flatten_toc(toc)
        >>> len(flat)
        2
        >>> flat[1]["parent_title"]
        'Chapter 1'
    """
    flat: List[VisualTOCFlatEntry] = []

    def recurse(items: List[VisualTOCNode], parent_title: Optional[str] = None) -> None:
        """Append ``items`` and their descendants to ``flat``.

        Args:
            items: List of TOC nodes to process.
            parent_title: Title of the parent node (None for top level).
        """
        for item in items:
            flat_entry: VisualTOCFlatEntry = {
                "title": item["title"],
                "page_number": item["page"],
                "level": item["level"],
                "entry_type": item["type"],
                "parent_title": parent_title,
            }
            flat.append(flat_entry)

            # A missing or empty "children" list ends the descent.
            children = item.get("children")
            if children:
                recurse(children, item["title"])

    recurse(toc)
    return flat
|
||||
|
||||
Reference in New Issue
Block a user