# linear-coding-agent/generations/library_rag/utils/toc_extractor_visual.py

"""Visual TOC extraction using bounding box X-coordinate analysis.

This module provides the **most accurate** TOC extraction strategy for
philosophical texts by analyzing the horizontal position (X-coordinate)
of each TOC entry. This approach is more reliable than text indentation
analysis because it directly measures visual layout.

How It Works:
    1. OCR with annotations extracts text + bounding box positions
    2. Pydantic schema (TocEntryBbox) captures title, page, and x_position
    3. X-coordinates are clustered to identify distinct indentation levels
    4. Hierarchy is built based on relative X-positions

X-Position Interpretation:
    The x_position is normalized between 0.0 (left edge) and 1.0 (right edge):

    - x ≈ 0.05-0.12: Level 1 (no indentation, main parts/chapters)
    - x ≈ 0.13-0.22: Level 2 (small indentation, sections)
    - x ≈ 0.23-0.35: Level 3 (double indentation, subsections)

    Sorted positions that lie less than 0.03 from the previous position are
    grouped into the same level.

Advantages over Markdown Analysis:
    - Works regardless of OCR whitespace accuracy
    - More reliable for complex hierarchies
    - Handles both printed and handwritten indentation

Cost:
    - Uses OCR with annotations: ~0.003€/page
    - Only processes first N pages (default: 8)

Pydantic Schemas:
    - TocEntryBbox: Single TOC entry with text, page_number, x_position
    - DocumentTocBbox: Container for list of entries

Output Structure:
    {
        "success": bool,
        "metadata": {...},
        "toc": [...],               # Hierarchical TOC
        "toc_flat": [...],          # Flat entries with levels
        "cost_ocr_annotated": float,
        "method": "visual_x_position"
    }

Example:
    >>> from pathlib import Path
    >>> from utils.toc_extractor_visual import extract_toc_with_visual_analysis
    >>>
    >>> result = extract_toc_with_visual_analysis(
    ...     pdf_path=Path("input/philosophy_book.pdf"),
    ...     max_toc_pages=8
    ... )
    >>> if result["success"]:
    ...     for entry in result["toc"]:
    ...         indent = "  " * (entry["level"] - 1)
    ...         print(f"{indent}{entry['title']} (p.{entry['page']})")

Algorithm Details:
    1. Collect all x_position values from OCR response
    2. Sort and cluster positions (tolerance: 0.03)
    3. Compute each cluster's centroid (mean) as that level's reference position
    4. Assign level to each entry based on nearest centroid
    5. Build hierarchy using stack-based approach

Functions:
    - extract_toc_with_visual_analysis(): Main extraction function
    - build_hierarchy_from_bbox(): Converts entries with X-positions to hierarchy
    - flatten_toc(): Flattens hierarchical TOC for storage

See Also:
    - utils.toc_extractor: Main entry point (routes here by default)
    - utils.toc_extractor_markdown: Alternative cost-free extraction
"""

from __future__ import annotations

import json
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Type, TypedDict, Union

from pydantic import BaseModel, Field

from .mistral_client import create_client
from .ocr_processor import run_ocr_with_annotations

# Module-level logger, named after this module via __name__; every function
# in this file reports progress and errors through it.
logger: logging.Logger = logging.getLogger(__name__)


class TocEntryBbox(BaseModel):
    """TOC entry with bounding box for visual detection.

    Attributes:
        text: Complete entry text as it appears in the table of contents.
            Example: 'Presentation' or 'What is virtue?' or 'Meno or on virtue'.
            DO NOT include leader dots or page number in this field.
        page_number: Actual page number as printed in the book (the visible number
            on the right in the TOC). Example: if the line says 'Presentation.....3',
            extract the number 3. This is the BOOK page number, not the PDF index.
        x_position: Horizontal position (X coordinate) of the text start, normalized
            between 0 and 1. This is the CRUCIAL COORDINATE for detecting indentation:
            - x ≈ 0.05-0.12 = left-aligned title, NOT indented (hierarchical level 1)
            - x ≈ 0.13-0.22 = title with SMALL indentation (hierarchical level 2)
            - x ≈ 0.23-0.35 = title with DOUBLE indentation (hierarchical level 3)
            Measure precisely where the first character of the title begins.
    """
    # NOTE(review): pydantic normally copies the class docstring and every
    # Field description into the generated JSON schema. This model is nested
    # in DocumentTocBbox, which is handed to run_ocr_with_annotations() as
    # document_annotation_format, so these texts presumably act as extraction
    # instructions for the OCR service -- confirm before rewording or
    # translating any of them.
    #
    # The French descriptions below carry the same content as the English
    # "Attributes" section of the docstring above.

    # Required entry title, without leader dots or page number.
    text: str = Field(..., description="""Texte COMPLET de l'entrée tel qu'il apparaît dans la table des matières.
    Exemple: 'Présentation' ou 'Qu'est-ce que la vertu ?' ou 'Ménon ou de la vertu'.
    NE PAS inclure les points de suite ni le numéro de page dans ce champ.""")
    # Required page number as printed in the book (not the PDF page index).
    page_number: int = Field(..., description="""Numéro de page réel tel qu'imprimé dans le livre (le numéro visible à droite dans la TOC).
    Exemple: si la ligne dit 'Présentation.....3', extraire le nombre 3.
    C'est le numéro de page du LIVRE, pas l'index PDF.""")
    # Required horizontal start of the title, normalized to 0..1; this is the
    # value build_hierarchy_from_bbox() clusters to derive indentation levels.
    # No range constraint is declared on the field itself.
    x_position: float = Field(..., description="""Position horizontale (coordonnée X) du début du texte, normalisée entre 0 et 1.
    C'est LA COORDONNÉE CRUCIALE pour détecter l'indentation:
    - x ≈ 0.05-0.12 = titre aligné à gauche, NON indenté (niveau hiérarchique 1)
    - x ≈ 0.13-0.22 = titre avec PETITE indentation (niveau hiérarchique 2)
    - x ≈ 0.23-0.35 = titre avec DOUBLE indentation (niveau hiérarchique 3)
    Mesurer précisément où commence le premier caractère du titre.""")


class DocumentTocBbox(BaseModel):
    """Schema for extracting all TOC entries with their positions.

    Attributes:
        entries: Complete list of ALL entries found in the table of contents.
            For EACH line in the TOC, extract:
            1. The title text (without leader dots)
            2. The page number (the number on the right)
            3. The exact horizontal X position of the title start (to detect indentation)

            Include ALL entries, even those that appear to be at the same visual level.
    """

    # This class is the document_annotation_format passed to
    # run_ocr_with_annotations() by extract_toc_with_visual_analysis(); the
    # parsed annotation is then read back through its "entries" key.
    #
    # NOTE(review): the docstring and the description below presumably end up
    # in the schema sent to the OCR service -- confirm before rewording. Also
    # note this description is English while the TocEntryBbox field
    # descriptions are French; harmonize only deliberately.

    # Required list holding one TocEntryBbox per line of the table of contents.
    entries: List[TocEntryBbox] = Field(
        ...,
        description="""Complete list of ALL entries found in the table of contents.
    For EACH line in the TOC, extract:
    1. The title text (without leader dots)
    2. The page number (the number on the right)
    3. The exact horizontal X position of the title start (to detect indentation)

    Include ALL entries, even those that appear to be at the same visual level.""",
    )


# TypedDict classes for structured return types
class VisualTOCMetadata(TypedDict):
    """Document-level metadata attached to a successful extraction result.

    The visual extraction path does not enrich metadata:
    extract_toc_with_visual_analysis() fills this structure with placeholder
    values only.

    Attributes:
        title: Document title (set to the PDF file stem by this module).
        author: Document author (set to "Unknown author" by this module).
        languages: List of languages present in the document (left empty by
            this module).
        summary: Brief document summary (left as an empty string by this
            module).
    """

    title: str
    author: str
    languages: List[str]
    summary: str


class VisualTOCNode(TypedDict):
    """One node of the hierarchical (tree-shaped) table of contents.

    Nodes are produced by build_hierarchy_from_bbox() and consumed by
    flatten_toc().

    Attributes:
        title: Entry title text (whitespace-stripped by the builder).
        page: Page number in the book.
        level: Hierarchical level (1 = top level, 2 = subsection, etc.),
            derived from the entry's X-position cluster.
        type: Entry type. build_hierarchy_from_bbox() always writes
            "section"; other values such as "chapter" are allowed by the type.
        children: Child nodes, i.e. the more deeply indented entries that
            follow this one; empty list for a leaf.
    """

    title: str
    page: int
    level: int
    type: str
    # Self-reference: resolved lazily thanks to postponed annotation evaluation.
    children: List[VisualTOCNode]


class VisualTOCFlatEntry(TypedDict):
    """Flattened TOC entry for storage, as emitted by flatten_toc().

    Each entry mirrors one VisualTOCNode with renamed keys and a reference to
    its parent instead of a nested children list.

    Attributes:
        title: Entry title text (copied from the node's "title").
        page_number: Page number in the book (copied from the node's "page").
        level: Hierarchical level (copied from the node's "level").
        entry_type: Entry type, e.g. "section" or "chapter" (copied from the
            node's "type").
        parent_title: Title of the parent entry, or None for a top-level
            entry.
    """

    title: str
    page_number: int
    level: int
    entry_type: str
    parent_title: Optional[str]


class VisualTOCResultSuccess(TypedDict):
    """Result returned by extract_toc_with_visual_analysis() on success.

    Attributes:
        success: Always True for the success case.
        metadata: Document metadata (placeholder values in visual mode).
        toc: Hierarchical TOC structure (list of top-level nodes).
        toc_flat: The same TOC flattened by flatten_toc().
        cost_ocr_annotated: Estimated OCR cost in euros, computed by the
            extractor as max_toc_pages * 0.003.
        method: Extraction method identifier; the extractor writes
            "visual_x_position".
    """

    success: bool
    metadata: VisualTOCMetadata
    toc: List[VisualTOCNode]
    toc_flat: List[VisualTOCFlatEntry]
    cost_ocr_annotated: float
    method: str


class VisualTOCResultError(TypedDict):
    """Result returned by extract_toc_with_visual_analysis() on failure.

    The extractor reports failures through this structure instead of raising:
    initialization errors, OCR errors, a missing annotation, and parsing
    errors all end up here.

    Attributes:
        success: Always False for the error case.
        error: Human-readable message describing the failure.
    """

    success: bool
    error: str


# Return type of extract_toc_with_visual_analysis(): either the success shape
# or the error shape. Callers tell them apart by checking the "success" key.
VisualTOCResult = Union[VisualTOCResultSuccess, VisualTOCResultError]


class VisualTOCEntryInternal(TypedDict):
    """Working representation of a TOC entry inside build_hierarchy_from_bbox().

    It combines the three fields delivered by the OCR annotation (text,
    page_number, x_position) with the values derived while processing
    (x_start, page, level).

    Attributes:
        text: Entry title text.
        page_number: Page number in the book.
        x_position: Normalized X position (0.0 to 1.0).
        x_start: Copy of x_position used for clustering (0.1 when the entry
            has no x_position).
        page: Copy of page_number used for the output node (0 when the entry
            has no page_number).
        level: Computed hierarchical level (1-based).
    """

    text: str
    page_number: int
    x_position: float
    x_start: float
    page: int
    level: int


def extract_toc_with_visual_analysis(
    pdf_path: Path,
    api_key: Optional[str] = None,
    max_toc_pages: int = 8,
) -> VisualTOCResult:
    """Extract a table of contents from the X positions of its entries.

    The first ``max_toc_pages`` pages of the PDF are sent to the annotated OCR
    call with ``DocumentTocBbox`` as the annotation schema. The returned
    entries are turned into a hierarchy by ``build_hierarchy_from_bbox`` and
    flattened by ``flatten_toc``.

    Args:
        pdf_path: Path to the PDF file.
        api_key: Mistral API key, forwarded to ``create_client`` (optional).
        max_toc_pages: Number of leading pages to analyze (default: 8). Page
            indices ``0 .. max_toc_pages - 1`` are requested.

    Returns:
        A dictionary of one of two shapes:
            - Success: ``success=True`` plus metadata, hierarchical TOC, flat
              TOC, estimated cost and the method identifier.
            - Error: ``success=False`` plus an ``error`` message.

    Raises:
        Failures of the client setup, file read, OCR call and annotation
        parsing are caught and reported through the error result rather than
        raised.

    Example:
        >>> from pathlib import Path
        >>> result = extract_toc_with_visual_analysis(Path("book.pdf"))
        >>> if result["success"]:
        ...     print(f"Extracted {len(result['toc'])} top-level entries")
        ... else:
        ...     print(f"Error: {result['error']}")
    """
    # Stage 1 -- set up the API client and load the PDF into memory.
    try:
        ocr_client = create_client(api_key)
        raw_pdf: bytes = pdf_path.read_bytes()
    except Exception as init_err:
        logger.error(f"Initialization error: {init_err}")
        return {"success": False, "error": str(init_err)}

    logger.info(f"Visual TOC extraction on {max_toc_pages} pages")

    # Stage 2 -- annotated OCR over the leading pages only.
    try:
        ocr_response = run_ocr_with_annotations(
            client=ocr_client,
            file_bytes=raw_pdf,
            filename=pdf_path.name,
            include_images=False,
            document_annotation_format=DocumentTocBbox,
            pages=list(range(max_toc_pages)),
        )
    except Exception as ocr_err:
        logger.error(f"OCR with annotations error: {ocr_err}")
        return {"success": False, "error": f"OCR failed: {str(ocr_err)}"}

    # Stage 3 -- a missing or empty annotation is reported as a failure.
    annotation: Any = getattr(ocr_response, "document_annotation", None)
    if not annotation:
        return {"success": False, "error": "No annotation returned"}

    # Stage 4 -- decode the annotation and assemble the result. Everything
    # stays inside one try so any malformed payload becomes an error result.
    try:
        # The annotation may arrive as a JSON string or as decoded data.
        payload: Any = (
            json.loads(annotation) if isinstance(annotation, str) else annotation
        )

        # A dict is expected to wrap the list under "entries"; anything else
        # is passed on unchanged as the entry list itself.
        raw_entries: List[Dict[str, Any]]
        if isinstance(payload, dict):
            raw_entries = payload.get("entries", [])
        else:
            raw_entries = payload

        hierarchy: List[VisualTOCNode] = build_hierarchy_from_bbox(raw_entries)

        # The count logged here is the number of top-level nodes.
        logger.info(f"TOC extracted visually: {len(hierarchy)} entries")

        # Visual mode yields no enriched metadata, only placeholders.
        placeholder_meta: VisualTOCMetadata = {
            "title": pdf_path.stem,
            "author": "Unknown author",
            "languages": [],
            "summary": "",
        }

        outcome: VisualTOCResultSuccess = {
            "success": True,
            "metadata": placeholder_meta,
            "toc": hierarchy,
            "toc_flat": flatten_toc(hierarchy),
            # Estimate based on the requested page count, not on usage data.
            "cost_ocr_annotated": max_toc_pages * 0.003,
            "method": "visual_x_position",
        }
        return outcome
    except Exception as parse_err:
        logger.error(f"Bbox parsing error: {parse_err}")
        return {"success": False, "error": f"Parsing failed: {str(parse_err)}"}


def _cluster_x_positions(sorted_xs: List[float], tolerance: float) -> List[float]:
    """Return one centroid (mean) per group of neighbouring X positions.

    Args:
        sorted_xs: Non-empty list of distinct X positions in ascending order.
        tolerance: A position joins the current group when it lies less than
            this far from the previous position.

    Returns:
        Centroids in ascending order; index 0 corresponds to level 1.
    """
    centroids: List[float] = []
    group: List[float] = [sorted_xs[0]]
    for x in sorted_xs[1:]:
        # Compared with the previous position (not the group's first one), so
        # a run of closely spaced values chains into a single level.
        if x - group[-1] < tolerance:
            group.append(x)
        else:
            centroids.append(sum(group) / len(group))
            group = [x]
    centroids.append(sum(group) / len(group))
    return centroids


def build_hierarchy_from_bbox(
    entries: List[Dict[str, Any]],
    tolerance: float = 0.03,
) -> List[VisualTOCNode]:
    """Build TOC hierarchy from X positions (indentation).

    Distinct X positions are sorted and grouped into indentation levels, each
    entry is assigned the level whose centroid is closest to its X position,
    and a tree is built with a stack: an entry becomes a child of the nearest
    preceding entry with a strictly smaller level.

    The input dictionaries are only read; they are never modified.

    Args:
        entries: List of entries. Each entry should have:
            - text: Entry title (missing or None becomes "")
            - page_number: Page number (missing or None becomes 0)
            - x_position: Normalized X coordinate, 0.0 to 1.0 (missing, None,
              non-numeric or non-finite values fall back to 0.1)
        tolerance: Maximum gap between neighbouring sorted X positions for
            them to share a level (default: 0.03). With a value <= 0 every
            distinct X position becomes its own level.

    Returns:
        Hierarchical TOC structure as a list of top-level nodes. Each node
        contains:
            - title: Entry title, whitespace-stripped
            - page: Page number
            - level: Hierarchical level (1, 2, 3, ...)
            - type: Entry type (always "section")
            - children: List of child nodes
        An empty input yields an empty list.

    Example:
        >>> entries = [
        ...     {"text": "Chapter 1", "page_number": 1, "x_position": 0.1},
        ...     {"text": "Section 1.1", "page_number": 2, "x_position": 0.2},
        ... ]
        >>> hierarchy = build_hierarchy_from_bbox(entries)
        >>> hierarchy[0]["children"][0]["title"]
        'Section 1.1'
    """
    if not entries:
        return []

    fallback_x = 0.1  # position assumed when an entry carries no usable X

    # Normalize into private records so the caller's dicts stay untouched.
    records: List[VisualTOCEntryInternal] = []
    for entry in entries:
        raw_x = entry.get("x_position")
        try:
            x_start = fallback_x if raw_x is None else float(raw_x)
        except (TypeError, ValueError):
            x_start = fallback_x
        # NaN or infinity would break sorting and nearest-centroid matching.
        if not (float("-inf") < x_start < float("inf")):
            x_start = fallback_x

        raw_page = entry.get("page_number")
        page_num = 0 if raw_page is None else raw_page

        raw_text = entry.get("text")
        text = "" if raw_text is None else str(raw_text)

        records.append(
            {
                "text": text,
                "page_number": page_num,
                "x_position": x_start,
                "x_start": x_start,
                "page": page_num,
                "level": 0,  # filled in below
            }
        )

    if not records:
        logger.warning("No X position detected")
        return []

    # Find the indentation levels present in this particular TOC.
    unique_xs: List[float] = sorted({record["x_start"] for record in records})
    x_levels: List[float] = _cluster_x_positions(unique_xs, tolerance)

    logger.info(
        "Indentation levels detected (X positions): %s",
        [f"{x:.3f}" for x in x_levels],
    )

    # Resolve the level once per distinct X position. Ties go to the smaller
    # (left-most) level because min() keeps the first minimum.
    level_of: Dict[float, int] = {
        x: min(range(len(x_levels)), key=lambda i, x=x: abs(x_levels[i] - x)) + 1
        for x in unique_xs
    }

    for record in records:
        record["level"] = level_of[record["x_start"]]
        logger.debug(
            "  '%s' -> X=%.3f -> level %d",
            record["text"],
            record["x_start"],
            record["level"],
        )

    # Build the tree: the stack holds the current chain of open ancestors.
    toc: List[VisualTOCNode] = []
    stack: List[VisualTOCNode] = []

    for record in records:
        node: VisualTOCNode = {
            "title": record["text"].strip(),
            "page": record["page"],
            "level": record["level"],
            "type": "section",
            "children": [],
        }

        # Close every ancestor that is at the same level or deeper.
        while stack and stack[-1]["level"] >= node["level"]:
            stack.pop()

        if stack:
            stack[-1]["children"].append(node)
        else:
            toc.append(node)

        stack.append(node)

    return toc


def flatten_toc(toc: List[VisualTOCNode]) -> List[VisualTOCFlatEntry]:
    """Turn a nested TOC into a flat, pre-ordered list of entries.

    Every node is emitted before its children, and siblings keep their
    original order. The tree structure is preserved through each entry's
    ``parent_title`` field, which holds the title of the enclosing node
    (``None`` for top-level nodes).

    Args:
        toc: Hierarchical TOC structure (list of VisualTOCNode).

    Returns:
        Flat list of TOC entries with parent references.

    Example:
        >>> toc = [{
        ...     "title": "Chapter 1",
        ...     "page": 1,
        ...     "level": 1,
        ...     "type": "section",
        ...     "children": [{
        ...         "title": "Section 1.1",
        ...         "page": 2,
        ...         "level": 2,
        ...         "type": "section",
        ...         "children": []
        ...     }]
        ... }]
        >>> flat = flatten_toc(toc)
        >>> len(flat)
        2
        >>> flat[1]["parent_title"]
        'Chapter 1'
    """
    rows: List[VisualTOCFlatEntry] = []
    exhausted = object()  # unique marker: the sibling iterator has run dry

    # Depth-first walk with an explicit stack. Each frame pairs an iterator
    # over one sibling list with the title of the node that owns that list.
    frames = [(iter(toc), None)]
    while frames:
        siblings, owner_title = frames[-1]
        node = next(siblings, exhausted)
        if node is exhausted:
            # This sibling list is finished; resume with the parent's list.
            frames.pop()
            continue

        row: VisualTOCFlatEntry = {
            "title": node["title"],
            "page_number": node["page"],
            "level": node["level"],
            "entry_type": node["type"],
            "parent_title": owner_title,
        }
        rows.append(row)

        # Descend before moving on to the next sibling (pre-order).
        if node.get("children"):
            frames.append((iter(node["children"]), node["title"]))

    return rows