- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
193 lines
6.5 KiB
Python
193 lines
6.5 KiB
Python
"""Image extraction and storage from OCR API responses.
|
|
|
|
This module provides utilities for extracting and saving images from
|
|
Mistral OCR API responses. It is a companion module to markdown_builder,
|
|
handling the image-specific aspects of document processing.
|
|
|
|
Features:
|
|
- **Image Writer Factory**: Creates reusable callbacks for image saving
|
|
- **Batch Extraction**: Processes all images from an OCR response
|
|
- **Protocol-based Design**: Flexible interface for custom implementations
|
|
|
|
Pipeline Position:
|
|
OCR Response → **Image Extractor** → Saved images + paths for Markdown
|
|
|
|
Components:
|
|
1. ImageWriterProtocol: Interface definition for image saving
|
|
2. create_image_writer(): Factory for standard file-based writers
|
|
3. extract_images(): Batch extraction from OCR responses
|
|
|
|
Integration:
|
|
The image writer is designed to integrate with markdown_builder:
|
|
|
|
>>> from utils.image_extractor import create_image_writer
|
|
>>> from utils.markdown_builder import build_markdown
|
|
>>>
|
|
>>> writer = create_image_writer(Path("output/doc/images"))
|
|
>>> markdown = build_markdown(ocr_response, image_writer=writer)
|
|
|
|
Standalone Usage:
|
|
>>> from pathlib import Path
|
|
>>> from utils.image_extractor import extract_images
|
|
>>>
|
|
>>> # Extract all images from OCR response
|
|
>>> paths = extract_images(ocr_response, Path("output/my_doc"))
|
|
>>> print(f"Extracted {len(paths)} images")
|
|
|
|
File Naming Convention:
|
|
Images are named: page{N}_img{M}.png
|
|
- N: Page number (1-based)
|
|
- M: Image index within page (1-based)
|
|
- Extension: always ``.png``; decoded bytes are written as-is, without re-encoding
|
|
|
|
Note:
|
|
- All indices are 1-based for consistency with page numbering
|
|
- The images subdirectory is created automatically if needed
|
|
- Images whose base64 data is missing or empty are silently skipped
|
|
- Large documents may produce many images; monitor disk space
|
|
|
|
See Also:
|
|
- utils.markdown_builder: Uses ImageWriter for markdown generation
|
|
- utils.mistral_client: Source of OCR responses with image data
|
|
"""
|
|
|
|
import base64
|
|
from pathlib import Path
|
|
from typing import Any, Callable, List, Optional, Protocol
|
|
|
|
|
|
class ImageWriterProtocol(Protocol):
    """Structural interface for callbacks that persist one OCR image.

    Any callable with this signature satisfies the protocol; no
    inheritance is required. An implementation receives the position
    of an image in the document together with its encoded content,
    stores it wherever it likes, and hands back the path that a
    markdown image reference should point to.

    Call arguments:
        page_idx: Page number, counted from 1.
        img_idx: Position of the image on that page, counted from 1.
        image_b64: The image content as a base64-encoded string.

    Returns:
        The relative path of the stored image, ready to be embedded
        in markdown.

    Example:
        >>> def my_writer(page_idx: int, img_idx: int, image_b64: str) -> str:
        ...     # Custom saving logic goes here
        ...     return f"images/page{page_idx}_img{img_idx}.png"
    """

    def __call__(self, page_idx: int, img_idx: int, image_b64: str) -> str:
        """Store the image and return its relative path for markdown."""
        ...
|
|
|
|
|
|
# Plain-callable form of an image writer:
# (page_idx, img_idx, image_b64) -> relative path of the saved image.
ImageWriter = Callable[[int, int, str], str]
|
|
|
|
|
|
def create_image_writer(images_dir: Path) -> "ImageWriter":
    """Create a callback that decodes base64 images and saves them to disk.

    The returned closure writes each image into ``images_dir`` under the
    name ``page{page_idx}_img{img_idx}.png`` and returns the path to use
    in a markdown image reference.

    Args:
        images_dir: Directory where images will be saved. It is created
            (including parents) when the factory is called, if needed.

    Returns:
        A callable accepting ``(page_idx, img_idx, image_b64)`` and
        returning ``"images/<filename>"``. The ``images/`` prefix is
        fixed and does not depend on the name of ``images_dir``.

    Raises:
        binascii.Error: Raised by the returned callable when the payload
            is incorrectly padded base64.

    Example:
        >>> from pathlib import Path
        >>> writer = create_image_writer(Path("output/images"))
        >>> path = writer(1, 1, "iVBORw0KGgoAAAANS...")
        >>> print(path)
        images/page1_img1.png
    """
    # Create the target directory up front so the callback only writes files.
    images_dir.mkdir(parents=True, exist_ok=True)

    def writer(page_idx: int, img_idx: int, image_b64: str) -> str:
        """Save one image and return its markdown-relative path.

        Args:
            page_idx: Page number (1-based).
            img_idx: Image index within the page (1-based).
            image_b64: Base64-encoded image data, either bare or wrapped
                in a ``data:<mime>;base64,<payload>`` URI.

        Returns:
            Relative path to the saved image file.
        """
        filename: str = f"page{page_idx}_img{img_idx}.png"

        # A data-URI header must be removed before decoding: b64decode's
        # default non-validating mode would otherwise treat the header's
        # letters as payload and prepend garbage bytes to the image.
        # Bare base64 never contains a comma, so it passes through untouched.
        header, separator, payload = image_b64.partition(",")
        if not (separator and header.startswith("data:")):
            payload = image_b64

        (images_dir / filename).write_bytes(base64.b64decode(payload))

        # The markdown file is expected to sit next to the images folder.
        return f"images/{filename}"

    return writer
|
|
|
|
|
|
def extract_images(ocr_response: Any, output_dir: Path) -> List[str]:
    """Extract and save every image embedded in an OCR response.

    Walks all pages of the response, decodes each embedded base64 image
    and writes it to ``output_dir / "images"``.

    Args:
        ocr_response: OCR response object. It must expose a ``pages``
            attribute (an iterable, or ``None`` for no pages). Each page
            may expose an ``images`` list whose items may carry an
            ``image_base64`` string, either bare base64 or a
            ``data:<mime>;base64,<payload>`` URI.
        output_dir: Base output directory. Images are saved to its
            ``images`` subdirectory, which is created if needed.

    Returns:
        Saved file paths as strings, in document order. Each path is
        ``output_dir / "images" / <filename>``, so it is absolute only
        when ``output_dir`` is absolute.

    Raises:
        binascii.Error: If an image payload is incorrectly padded base64.

    Example:
        >>> from pathlib import Path
        >>> paths = extract_images(ocr_response, Path("output/my_doc"))
        >>> print(f"Extracted {len(paths)} images")

    Note:
        - Files are named ``page{N}_img{M}.png`` with 1-based N and M.
        - Images without base64 data are silently skipped; M still counts
          them, so numbering reflects the position within the page.
    """
    images_dir: Path = output_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)

    extracted: List[str] = []

    # A missing ``pages`` attribute still raises (wrong object passed);
    # only an explicit None/empty value is treated as "no pages".
    for page_index, page in enumerate(ocr_response.pages or [], start=1):
        page_images = getattr(page, "images", None)
        if not page_images:
            continue

        for img_idx, img in enumerate(page_images, start=1):
            image_b64: Optional[str] = getattr(img, "image_base64", None)
            if not image_b64:
                continue

            # Remove a data-URI header before decoding: the default
            # non-validating b64decode would otherwise decode the header's
            # letters as data and corrupt the file. Bare base64 has no
            # comma, so it is left unchanged.
            header, separator, payload = image_b64.partition(",")
            if not (separator and header.startswith("data:")):
                payload = image_b64

            filepath: Path = images_dir / f"page{page_index}_img{img_idx}.png"
            filepath.write_bytes(base64.b64decode(payload))
            extracted.append(str(filepath))

    return extracted
|