Add Library RAG project and cleanup root directory

- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-30 11:57:12 +01:00
parent 48470236da
commit d2f7165120
84 changed files with 26517 additions and 2 deletions
--- a/generations/library_rag/utils/image_extractor.py
+++ b/generations/library_rag/utils/image_extractor.py
@@ -0,0 +1,192 @@
+"""Image extraction and storage from OCR API responses.
+
+This module provides utilities for extracting and saving images from
+Mistral OCR API responses. It is a companion module to markdown_builder,
+handling the image-specific aspects of document processing.
+
+Features:
+    - **Image Writer Factory**: Creates reusable callbacks for image saving
+    - **Batch Extraction**: Processes all images from an OCR response
+    - **Protocol-based Design**: Flexible interface for custom implementations
+
+Pipeline Position:
+    OCR Response → **Image Extractor** → Saved images + paths for Markdown
+
+Components:
+    1. ImageWriterProtocol: Interface definition for image saving
+    2. create_image_writer(): Factory for standard file-based writers
+    3. extract_images(): Batch extraction from OCR responses
+
+Integration:
+    The image writer is designed to integrate with markdown_builder:
+
+    >>> from utils.image_extractor import create_image_writer
+    >>> from utils.markdown_builder import build_markdown
+    >>>
+    >>> writer = create_image_writer(Path("output/doc/images"))
+    >>> markdown = build_markdown(ocr_response, image_writer=writer)
+
+Standalone Usage:
+    >>> from pathlib import Path
+    >>> from utils.image_extractor import extract_images
+    >>>
+    >>> # Extract all images from OCR response
+    >>> paths = extract_images(ocr_response, Path("output/my_doc"))
+    >>> print(f"Extracted {len(paths)} images")
+
+File Naming Convention:
+    Images are named: page{N}_img{M}.png
+    - N: Page number (1-based)
+    - M: Image index within page (1-based)
+    - Extension: always ".png"; the decoded bytes are written as-is,
+      without any image format conversion
+
+Note:
+    - All indices are 1-based for consistency with page numbering
+    - The images subdirectory is created automatically if needed
+    - Images with missing or empty base64 data are silently skipped by
+      extract_images(); malformed base64 is not detected or skipped
+    - Large documents may produce many images; monitor disk space
+
+See Also:
+    - utils.markdown_builder: Uses ImageWriter for markdown generation
+    - utils.mistral_client: Source of OCR responses with image data
+"""
+
+import base64
+from pathlib import Path
+from typing import Any, Callable, List, Optional, Protocol
+
+
+class ImageWriterProtocol(Protocol):
+    """Structural interface for callbacks that persist one OCR image.
+
+    Any callable with a matching signature satisfies this protocol; no
+    inheritance is required. An implementation stores the image it is
+    given and reports where a markdown document can find it.
+
+    Call arguments:
+        - page_idx: page number, counted from 1
+        - img_idx: position of the image on its page, counted from 1
+        - image_b64: the image content, base64-encoded
+
+    Returns:
+        The relative path under which the image was stored, ready to be
+        used as a markdown image reference.
+
+    Example:
+        >>> def my_writer(page_idx: int, img_idx: int, image_b64: str) -> str:
+        ...     # Custom saving logic
+        ...     return f"images/page{page_idx}_img{img_idx}.png"
+    """
+
+    def __call__(
+        self,
+        page_idx: int,
+        img_idx: int,
+        image_b64: str,
+    ) -> str:
+        """Store the image and hand back its markdown-relative path."""
+        ...
+
+
+# Type alias for image writer callables:
+# (page_idx, img_idx, image_b64) -> relative path for markdown.
+ImageWriter = Callable[[int, int, str], str]
+
+
+def create_image_writer(images_dir: Path) -> ImageWriter:
+    """Create a function for saving images to disk.
+
+    This factory function creates a closure that saves base64-encoded
+    images to the specified directory and returns relative paths
+    suitable for markdown image references.
+
+    Args:
+        images_dir: Directory path where images will be saved.
+            The directory (and any missing parents) is created
+            immediately, when the factory is called.
+
+    Returns:
+        A callable that accepts (page_idx, img_idx, image_b64) and
+        returns the relative path to the saved image. The returned path
+        always has the form "images/page{N}_img{M}.png", whatever the
+        actual name of images_dir is.
+
+    Raises:
+        OSError: If the directory cannot be created. The returned
+            callable raises OSError if the file cannot be written and
+            binascii.Error if the base64 payload is incorrectly padded.
+
+    Example:
+        >>> from pathlib import Path
+        >>> writer = create_image_writer(Path("output/images"))
+        >>> writer(1, 1, "iVBORw0KGgoAAAANS...")
+        'images/page1_img1.png'
+    """
+    # Create directory if it doesn't exist
+    images_dir.mkdir(parents=True, exist_ok=True)
+
+    def writer(page_idx: int, img_idx: int, image_b64: str) -> str:
+        """Save an image and return its relative path.
+
+        Args:
+            page_idx: Page number (1-based).
+            img_idx: Image index within the page (1-based).
+            image_b64: Base64-encoded image data, either as a bare
+                base64 string or as a data URI
+                ("data:<mime>;base64,<payload>").
+
+        Returns:
+            Relative path to the saved image file.
+        """
+        filename: str = f"page{page_idx}_img{img_idx}.png"
+
+        # OCR responses may deliver the image as a data URI. By default
+        # b64decode() silently drops non-alphabet characters, so the
+        # letters of the "data:...;base64," header would be decoded into
+        # garbage bytes in front of the real image. Keep only the payload.
+        if image_b64.startswith("data:"):
+            image_b64 = image_b64.partition(",")[2]
+
+        # Decode and save; the bytes are written unchanged (no conversion)
+        (images_dir / filename).write_bytes(base64.b64decode(image_b64))
+
+        # Return relative path for markdown
+        return f"images/{filename}"
+
+    return writer
+
+
+def extract_images(ocr_response: Any, output_dir: Path) -> List[str]:
+    """Extract all images from an OCR response.
+
+    Iterates through all pages in the OCR response, extracts any
+    embedded images, decodes them from base64, and saves them
+    to the "images" subdirectory of the output directory.
+
+    Args:
+        ocr_response: OCR response object from Mistral API.
+            Expected to have a pages attribute, where each page
+            may have an images list containing objects with
+            image_base64 attributes. The base64 value may be a bare
+            base64 string or a data URI ("data:<mime>;base64,<payload>").
+        output_dir: Base output directory. Images will be saved
+            to a subdirectory named "images".
+
+    Returns:
+        List of file paths (as strings) to the extracted images, in
+        page/image order. Paths are built from output_dir, so they are
+        absolute only if output_dir is absolute.
+
+    Raises:
+        OSError: If the directory or a file cannot be written.
+        binascii.Error: If a base64 payload is incorrectly padded.
+
+    Example:
+        >>> from pathlib import Path
+        >>> paths = extract_images(ocr_response, Path("output/my_doc"))
+        >>> for path in paths:
+        ...     print(path)
+        output/my_doc/images/page1_img1.png
+        output/my_doc/images/page2_img1.png
+
+    Note:
+        - Pages and images are 1-indexed in filenames
+        - Images without base64 data are silently skipped; they still
+          consume their index, so numbering follows the response order
+        - The images subdirectory is created automatically, even when
+          the response contains no images
+        - Files always get a ".png" extension; bytes are not converted
+    """
+    images_dir: Path = output_dir / "images"
+    images_dir.mkdir(parents=True, exist_ok=True)
+
+    extracted: List[str] = []
+
+    # Tolerate a response whose pages attribute is missing or None.
+    pages = getattr(ocr_response, "pages", None) or []
+
+    for page_index, page in enumerate(pages, start=1):
+        # A page without an images attribute (or with None) has nothing to save.
+        page_images = getattr(page, "images", None) or []
+
+        for img_idx, img in enumerate(page_images, start=1):
+            image_b64: Optional[str] = getattr(img, "image_base64", None)
+            if not image_b64:
+                continue
+
+            # The image may arrive as a data URI. By default b64decode()
+            # silently drops non-alphabet characters, so the letters of
+            # the "data:...;base64," header would be decoded into garbage
+            # bytes in front of the real image. Keep only the payload.
+            if image_b64.startswith("data:"):
+                image_b64 = image_b64.partition(",")[2]
+                if not image_b64:
+                    # Header without payload: same as "no base64 data".
+                    continue
+
+            filepath: Path = images_dir / f"page{page_index}_img{img_idx}.png"
+
+            # Decode and save
+            filepath.write_bytes(base64.b64decode(image_b64))
+
+            extracted.append(str(filepath))
+
+    return extracted