linear-coding-agent/generations/library_rag/utils/image_extractor.py

"""Image extraction and storage from OCR API responses.

This module provides utilities for extracting and saving images from
Mistral OCR API responses. It is a companion module to markdown_builder,
handling the image-specific aspects of document processing.

Features:
    - **Image Writer Factory**: Creates reusable callbacks for image saving
    - **Batch Extraction**: Processes all images from an OCR response
    - **Protocol-based Design**: Flexible interface for custom implementations

Pipeline Position:
    OCR Response → **Image Extractor** → Saved images + paths for Markdown

Components:
    1. ImageWriterProtocol: Interface definition for image saving
    2. create_image_writer(): Factory for standard file-based writers
    3. extract_images(): Batch extraction from OCR responses

Integration:
    The image writer is designed to integrate with markdown_builder:

    >>> from utils.image_extractor import create_image_writer
    >>> from utils.markdown_builder import build_markdown
    >>>
    >>> writer = create_image_writer(Path("output/doc/images"))
    >>> markdown = build_markdown(ocr_response, image_writer=writer)

Standalone Usage:
    >>> from pathlib import Path
    >>> from utils.image_extractor import extract_images
    >>>
    >>> # Extract all images from OCR response
    >>> paths = extract_images(ocr_response, Path("output/my_doc"))
    >>> print(f"Extracted {len(paths)} images")

File Naming Convention:
    Images are named: page{N}_img{M}.png
    - N: Page number (1-based)
    - M: Image index within page (1-based)
    - Format: Always PNG (base64 from Mistral is PNG)

Note:
    - All indices are 1-based for consistency with page numbering
    - The images subdirectory is created automatically if needed
    - Base64 data without proper encoding is silently skipped
    - Large documents may produce many images; monitor disk space

See Also:
    - utils.markdown_builder: Uses ImageWriter for markdown generation
    - utils.mistral_client: Source of OCR responses with image data
"""

import base64
from pathlib import Path
from typing import Any, Callable, List, Optional, Protocol


class ImageWriterProtocol(Protocol):
    """Protocol for image writing callbacks.

    This protocol defines the interface for functions that save
    images extracted from OCR responses and return a relative
    path for markdown references.

    The protocol expects:
        - page_idx: 1-based page number
        - img_idx: 1-based image index within the page
        - image_b64: Base64-encoded image data

    Returns:
        Relative path to the saved image for markdown inclusion.

    Example:
        >>> def my_writer(page_idx: int, img_idx: int, image_b64: str) -> str:
        ...     # Custom saving logic
        ...     return f"images/page{page_idx}_img{img_idx}.png"
    """

    def __call__(self, page_idx: int, img_idx: int, image_b64: str) -> str:
        """Save image and return relative path for markdown reference."""
        ...


# Type alias for image writer callables
ImageWriter = Callable[[int, int, str], str]


def create_image_writer(images_dir: Path) -> ImageWriter:
    """Create a function for saving images to disk.

    This factory function creates a closure that saves base64-encoded
    images to the specified directory and returns relative paths
    suitable for markdown image references.

    Args:
        images_dir: Directory path where images will be saved.
            The directory will be created if it doesn't exist.

    Returns:
        A callable that accepts (page_idx, img_idx, image_b64) and
        returns the relative path to the saved image.

    Example:
        >>> from pathlib import Path
        >>> writer = create_image_writer(Path("output/images"))
        >>> path = writer(1, 0, "iVBORw0KGgoAAAANS...")
        >>> print(path)
        'images/page1_img0.png'
    """
    # Create directory if it doesn't exist
    images_dir.mkdir(parents=True, exist_ok=True)

    def writer(page_idx: int, img_idx: int, image_b64: str) -> str:
        """Save an image and return its relative path.

        Args:
            page_idx: Page number (1-based).
            img_idx: Image index within the page (1-based).
            image_b64: Base64-encoded image data.

        Returns:
            Relative path to the saved image file.
        """
        filename: str = f"page{page_idx}_img{img_idx}.png"
        filepath: Path = images_dir / filename

        # Decode and save
        image_data: bytes = base64.b64decode(image_b64)
        filepath.write_bytes(image_data)

        # Return relative path for markdown
        return f"images/{filename}"

    return writer


def extract_images(ocr_response: Any, output_dir: Path) -> List[str]:
    """Extract all images from an OCR response.

    Iterates through all pages in the OCR response, extracts any
    embedded images, decodes them from base64, and saves them
    to the output directory.

    Args:
        ocr_response: OCR response object from Mistral API.
            Expected to have a pages attribute, where each page
            may have an images list containing objects with
            image_base64 attributes.
        output_dir: Base output directory. Images will be saved
            to a subdirectory named "images".

    Returns:
        List of absolute file paths to the extracted images.

    Example:
        >>> from pathlib import Path
        >>> paths = extract_images(ocr_response, Path("output/my_doc"))
        >>> for path in paths:
        ...     print(path)
        'C:/output/my_doc/images/page1_img1.png'
        'C:/output/my_doc/images/page2_img1.png'

    Note:
        - Pages and images are 1-indexed in filenames
        - Images without base64 data are silently skipped
        - The images subdirectory is created automatically
    """
    images_dir: Path = output_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)

    extracted: List[str] = []

    for page_index, page in enumerate(ocr_response.pages, start=1):
        if not getattr(page, "images", None):
            continue

        for img_idx, img in enumerate(page.images, start=1):
            image_b64: Optional[str] = getattr(img, "image_base64", None)
            if not image_b64:
                continue

            filename: str = f"page{page_index}_img{img_idx}.png"
            filepath: Path = images_dir / filename

            # Decode and save
            image_data: bytes = base64.b64decode(image_b64)
            filepath.write_bytes(image_data)

            extracted.append(str(filepath))

    return extracted