linear-coding-agent/generations/library_rag/utils/toc_extractor_markdown.py

"""TOC extraction via Markdown indentation analysis.

This module provides a **cost-free** TOC extraction strategy that works on
already-generated Markdown text. Unlike the OCR annotation approach, this
method doesn't require additional API calls.

Strategy:
    1. Search for "Table des matières" heading in the first N lines
    2. Parse lines matching pattern: "Title.....Page" or "Title  Page"
    3. Detect hierarchy from leading whitespace (indentation)
    4. Build nested TOC structure using stack-based algorithm

When to Use:
    - When OCR has already been performed (markdown available)
    - When cost optimization is critical (no additional API calls)
    - For documents with clear indentation in the TOC

Limitations:
    - Requires French "Table des matières" header (can be extended)
    - Indentation detection may be less accurate than visual/bbox analysis
    - Only works if OCR preserved whitespace accurately

Indentation Levels:
    - 0-2 spaces: Level 1 (main chapters/parts)
    - 3-6 spaces: Level 2 (sections)
    - 7+ spaces: Level 3 (subsections)

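Output Structure:
    On success:
    {
        "success": True,
        "metadata": {},             # Always empty (parity with other extractors)
        "toc": [...],               # Hierarchical TOC
        "toc_flat": [...],          # Flat entries with levels
        "cost_ocr_annotated": 0.0,  # No additional cost
        "method": "markdown_indentation"
    }
    On failure:
    {
        "success": False,
        "error": "..."              # Human-readable reason
    }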

Example:
    >>> from utils.toc_extractor_markdown import extract_toc_from_markdown
    >>>
    >>> markdown = '''
    ... # Table des matières
    ... Introduction.............................5
    ... Première partie..........................10
    ...    Chapitre 1............................15
    ...    Chapitre 2............................25
    ... Deuxième partie..........................50
    ... '''
    >>> result = extract_toc_from_markdown(markdown)
    >>> if result["success"]:
    ...     print(f"Found {len(result['toc_flat'])} entries")
    Found 5 entries

Functions:
    - extract_toc_from_markdown(): Main extraction from markdown text
    - build_hierarchy(): Converts flat entries to nested structure

See Also:
    - utils.toc_extractor: Main entry point (routes to visual by default)
    - utils.toc_extractor_visual: More accurate X-position based extraction
"""

from __future__ import annotations

import logging
import re
from typing import Any, Dict, List, Optional, TypedDict, Union
from pathlib import Path

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


# Type definitions for internal data structures
class MarkdownTOCEntryRaw(TypedDict):
    """Raw TOC entry parsed from a single markdown line, before nesting.

    Produced by ``extract_toc_from_markdown`` and consumed by
    ``build_hierarchy``.
    """
    # Entry title, stripped of surrounding whitespace.
    title: str
    # Page number parsed from the digits at the end of the line.
    page_number: int
    # Hierarchy depth (1, 2 or 3) derived from ``leading_spaces``.
    level: int
    # Number of leading whitespace characters on the source line.
    leading_spaces: int


class MarkdownTOCNode(TypedDict):
    """Node of the nested TOC tree returned by ``build_hierarchy``."""
    # Entry title, copied from the raw entry.
    title: str
    # Page number, copied from the raw entry's ``page_number``.
    page: int
    # Hierarchy depth, copied from the raw entry's ``level``.
    level: int
    # Node kind; ``build_hierarchy`` sets it to "section" for every node.
    type: str
    # Nodes nested directly under this one (empty list for a leaf).
    children: List[MarkdownTOCNode]


class MarkdownTOCFlatEntry(TypedDict):
    """Flat (non-nested) TOC entry as exposed in the ``toc_flat`` result key."""
    # Entry title.
    title: str
    # Page number the entry points to.
    page_number: int
    # Hierarchy depth of the entry.
    level: int
    # Entry kind; ``extract_toc_from_markdown`` sets it to "section".
    entry_type: str
    # Title of the parent entry when known; None otherwise.
    parent_title: Optional[str]


class MarkdownTOCResultSuccess(TypedDict):
    """Result returned by ``extract_toc_from_markdown`` when entries were found."""
    success: bool  # Always True
    # Extra metadata; ``extract_toc_from_markdown`` returns an empty dict here.
    metadata: Dict[str, Any]
    # Nested TOC tree (root-level nodes).
    toc: List[MarkdownTOCNode]
    # Same entries as ``toc`` in document order, without nesting.
    toc_flat: List[MarkdownTOCFlatEntry]
    # Extra OCR annotation cost; ``extract_toc_from_markdown`` reports 0.0.
    cost_ocr_annotated: float
    # Name of the extraction strategy ("markdown_indentation" for this module).
    method: str


class MarkdownTOCResultError(TypedDict):
    """Result returned by ``extract_toc_from_markdown`` when extraction fails."""
    success: bool  # Always False
    # Human-readable reason for the failure.
    error: str


# Return type of ``extract_toc_from_markdown``: either the success or the
# error shape. Callers tell them apart through the ``success`` key.
MarkdownTOCResult = Union[MarkdownTOCResultSuccess, MarkdownTOCResultError]


def _indent_to_level(leading_spaces: int) -> int:
    """Map a leading-whitespace count to a TOC level (1, 2 or 3)."""
    # 0-2 spaces = level 1, 3-6 spaces = level 2, 7+ spaces = level 3
    if leading_spaces <= 2:
        return 1
    if leading_spaces <= 6:
        return 2
    return 3


def _parent_titles(entries: List[MarkdownTOCEntryRaw]) -> List[Optional[str]]:
    """Return, for each entry, the title of its nearest shallower predecessor.

    Uses the same stack rule as ``build_hierarchy`` so that the flat list and
    the nested tree always agree on who the parent of an entry is.
    """
    parents: List[Optional[str]] = []
    open_entries: List[MarkdownTOCEntryRaw] = []
    for entry in entries:
        # Drop every open entry that is not strictly shallower than this one.
        while open_entries and open_entries[-1]["level"] >= entry["level"]:
            open_entries.pop()
        parents.append(open_entries[-1]["title"] if open_entries else None)
        open_entries.append(entry)
    return parents


def extract_toc_from_markdown(
    markdown_text: str,
    max_lines: int = 200,
) -> MarkdownTOCResult:
    """Extract a table of contents by analyzing raw markdown text.

    The function looks for a "Table des matières" line (case-insensitive,
    accent optional) within the first ``max_lines`` lines, then scans up to
    100 of the lines that follow it. Each line shaped like "Title.....12" or
    "Title 12" becomes an entry whose level is derived from the amount of
    leading whitespace on the line. This is a cost-free alternative to OCR
    annotation-based extraction.

    Blank lines and lines starting with ``#`` or ``---`` are skipped, as are
    matches whose title is shorter than 3 characters or purely numeric.

    Args:
        markdown_text: Complete markdown text of the document.
        max_lines: Maximum number of lines to analyze (searches TOC at start).
            Both the heading and the entries must lie within this window.

    Returns:
        Dictionary with hierarchical TOC structure. On success, includes:
            - success: True
            - metadata: Empty dict (for consistency with other extractors)
            - toc: Hierarchical nested TOC structure
            - toc_flat: Flat list of entries with levels; ``parent_title``
              is the title of the enclosing entry, or None at root level
            - cost_ocr_annotated: 0.0 (no additional cost)
            - method: "markdown_indentation"
        On failure (no heading found, or no entry parsed), includes:
            - success: False
            - error: Error message string

    Example:
        >>> markdown = '''
        ... # Table des matières
        ... Introduction.....5
        ... Part One........10
        ...   Chapter 1.....15
        ... '''
        >>> result = extract_toc_from_markdown(markdown)
        >>> if result["success"]:
        ...     print(len(result["toc_flat"]))
        3
    """
    logger.info("Extraction TOC depuis markdown (analyse indentation)")

    lines: List[str] = markdown_text.split('\n')[:max_lines]

    # Find "Table des matières" section; entries start on the following line.
    toc_start: Optional[int] = None
    for i, line in enumerate(lines):
        if re.search(r'table\s+des\s+mati[èe]res', line, re.IGNORECASE):
            toc_start = i + 1
            logger.info(f"TOC trouvée à la ligne {i}")
            break

    if toc_start is None:
        logger.warning("Aucune table des matières trouvée dans le markdown")
        return MarkdownTOCResultError(
            success=False,
            error="Table des matières introuvable"
        )

    # Compiled once, outside the loop.
    # Preferred shape: "Title.....Page" (at least two leader dots).
    dotted_entry: re.Pattern[str] = re.compile(r'^(.+?)\s*\.{2,}\s*(\d+)\s*$')
    # Fallback shape: "Title  Page" (whitespace-separated trailing number).
    plain_entry: re.Pattern[str] = re.compile(r'^(.+?)\s+(\d+)\s*$')

    entries: List[MarkdownTOCEntryRaw] = []

    for line in lines[toc_start:toc_start + 100]:  # Max 100 lines of TOC
        line_stripped: str = line.strip()
        if not line_stripped or line_stripped.startswith(('#', '---')):
            continue

        match: Optional[re.Match[str]] = (
            dotted_entry.match(line_stripped) or plain_entry.match(line_stripped)
        )
        if match is None:
            continue

        title: str = match.group(1).strip()
        page: int = int(match.group(2))

        # Ignore lines too short or that don't look like titles
        if len(title) < 3 or title.isdigit():
            continue

        # Indentation must be measured on the unstripped line.
        leading_spaces: int = len(line) - len(line.lstrip())
        level: int = _indent_to_level(leading_spaces)

        entries.append(MarkdownTOCEntryRaw(
            title=title,
            page_number=page,
            level=level,
            leading_spaces=leading_spaces,
        ))

        logger.debug(
            "  '%s' → %d espaces → level %d (page %d)",
            title, leading_spaces, level, page,
        )

    if not entries:
        logger.warning("Aucune entrée TOC extraite")
        return MarkdownTOCResultError(
            success=False,
            error="Aucune entrée TOC trouvée"
        )

    logger.info(f"✅ {len(entries)} entrées extraites depuis markdown")

    # Build hierarchy
    toc: List[MarkdownTOCNode] = build_hierarchy(entries)

    return MarkdownTOCResultSuccess(
        success=True,
        metadata={},
        toc=toc,
        toc_flat=[
            MarkdownTOCFlatEntry(
                title=e["title"],
                page_number=e["page_number"],
                level=e["level"],
                entry_type="section",
                parent_title=parent,
            )
            for e, parent in zip(entries, _parent_titles(entries))
        ],
        cost_ocr_annotated=0.0,  # No additional cost, uses existing OCR
        method="markdown_indentation",
    )


def build_hierarchy(entries: List[MarkdownTOCEntryRaw]) -> List[MarkdownTOCNode]:
    """Nest flat TOC entries into a tree according to their ``level``.

    Entries are processed in order. Each one is attached to the most recent
    preceding entry whose level is strictly smaller; when no such entry
    exists it becomes a root node.

    Args:
        entries: Raw TOC entries in document order. Only ``title``,
            ``page_number`` and ``level`` are read.

    Returns:
        The root-level nodes; deeper entries are reachable through each
        node's ``children`` list.

    Example:
        >>> entries = [
        ...     {"title": "Part 1", "page_number": 1, "level": 1, "leading_spaces": 0},
        ...     {"title": "Chapter 1", "page_number": 5, "level": 2, "leading_spaces": 4},
        ... ]
        >>> hierarchy = build_hierarchy(entries)
        >>> len(hierarchy[0]["children"])
        1
    """
    roots: List[MarkdownTOCNode] = []
    # Chain of ancestors that can still receive children, shallowest first.
    open_nodes: List[MarkdownTOCNode] = []

    for raw in entries:
        current: MarkdownTOCNode = MarkdownTOCNode(
            title=raw["title"],
            page=raw["page_number"],
            level=raw["level"],
            type="section",
            children=[],
        )
        depth = current["level"]

        # Close every open node that is at the same depth or deeper: it can
        # no longer be an ancestor of anything that follows.
        while open_nodes:
            if open_nodes[-1]["level"] < depth:
                break
            open_nodes.pop()

        # Whatever remains on top is the parent; with nothing left the node
        # belongs at the root.
        siblings = open_nodes[-1]["children"] if open_nodes else roots
        siblings.append(current)

        open_nodes.append(current)

    return roots