Add Library RAG project and cleanup root directory
- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
335
generations/library_rag/mcp_tools/parsing_tools.py
Normal file
335
generations/library_rag/mcp_tools/parsing_tools.py
Normal file
@@ -0,0 +1,335 @@
|
||||
"""Parsing tools for Library RAG MCP Server.
|
||||
|
||||
This module implements the parse_pdf tool with optimal pre-configured parameters
|
||||
for PDF ingestion into the Library RAG system.
|
||||
|
||||
The tool uses fixed optimal parameters:
|
||||
- llm_provider: "mistral" (API-based, fast)
|
||||
- llm_model: "mistral-medium-latest" (best quality/cost ratio)
|
||||
- use_semantic_chunking: True (LLM-based intelligent chunking)
|
||||
- use_ocr_annotations: True (3x cost but better TOC extraction)
|
||||
- ingest_to_weaviate: True (automatic vectorization and storage)
|
||||
|
||||
Example:
|
||||
The parse_pdf tool can be invoked via MCP with a simple path::
|
||||
|
||||
{
|
||||
"tool": "parse_pdf",
|
||||
"arguments": {
|
||||
"pdf_path": "/path/to/document.pdf"
|
||||
}
|
||||
}
|
||||
|
||||
Or with a URL::
|
||||
|
||||
{
|
||||
"tool": "parse_pdf",
|
||||
"arguments": {
|
||||
"pdf_path": "https://example.com/document.pdf"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Literal
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import httpx
|
||||
|
||||
from mcp_tools.schemas import ParsePdfInput, ParsePdfOutput
|
||||
|
||||
# Import pdf_pipeline for PDF processing
|
||||
from utils.pdf_pipeline import process_pdf, process_pdf_bytes
|
||||
from utils.types import LLMProvider
|
||||
|
||||
# Logger for this module
logger = logging.getLogger(__name__)

# =============================================================================
# Constants - Fixed Optimal Parameters
# =============================================================================

# LLM provider configuration (Mistral API for best results)
FIXED_LLM_PROVIDER: LLMProvider = "mistral"
FIXED_LLM_MODEL = "mistral-medium-latest"

# Processing options (optimal settings for quality)
# Semantic chunking: LLM-based splitting into argumentative units.
FIXED_USE_SEMANTIC_CHUNKING = True
# OCR annotations: roughly 3x OCR cost, but yields better TOC extraction.
FIXED_USE_OCR_ANNOTATIONS = True
# Automatically vectorize and store results in Weaviate after parsing.
FIXED_INGEST_TO_WEAVIATE = True

# Additional processing flags
FIXED_USE_LLM = True
# Note: The following flags are not supported by process_pdf() and should not be used
# FIXED_CLEAN_CHUNKS = True
# FIXED_EXTRACT_CONCEPTS = True
# FIXED_VALIDATE_OUTPUT = True
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def is_url(path: str) -> bool:
    """Determine whether *path* is an HTTP or HTTPS URL.

    Args:
        path: A filesystem path or URL string.

    Returns:
        True for http/https URLs; False for anything else, including
        local paths and strings that cannot be parsed at all.

    Example:
        >>> is_url("https://example.com/doc.pdf")
        True
        >>> is_url("/path/to/doc.pdf")
        False
    """
    try:
        scheme = urlparse(path).scheme
    except ValueError:
        # Unparseable input is treated as "not a URL" rather than an error.
        return False
    return scheme == "http" or scheme == "https"
|
||||
|
||||
|
||||
async def download_pdf(url: str, timeout: float = 60.0) -> bytes:
    """Fetch a PDF from *url* and return its raw bytes.

    Args:
        url: The URL to download from. Must be HTTP or HTTPS.
        timeout: Maximum time in seconds to wait for the download.
            Defaults to 60 seconds.

    Returns:
        Raw bytes content of the downloaded PDF file.

    Raises:
        httpx.HTTPError: If the download fails (network error, HTTP error, etc.).
        ValueError: If the URL is invalid or not accessible.

    Example:
        >>> pdf_bytes = await download_pdf("https://example.com/document.pdf")
        >>> len(pdf_bytes) > 0
        True
    """
    logger.info(f"Downloading PDF from: {url}")

    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        response = await client.get(url)
        response.raise_for_status()

        # Best-effort sanity check: warn (do not fail) when neither the
        # Content-Type header nor the URL suffix suggests a PDF.
        content_type = response.headers.get("content-type", "")
        looks_like_pdf = (
            "application/pdf" in content_type.lower()
            or url.lower().endswith(".pdf")
        )
        if not looks_like_pdf:
            logger.warning(
                f"URL may not be a PDF (Content-Type: {content_type}), proceeding anyway"
            )

        logger.info(f"Downloaded {len(response.content)} bytes from {url}")
        return response.content
|
||||
|
||||
|
||||
def extract_filename_from_url(url: str) -> str:
    """Derive a filename from a URL, suitable for saving a downloaded PDF.

    Args:
        url: The URL to extract a filename from.

    Returns:
        The last path component of the URL. If that component has no
        extension, ".pdf" is appended. Falls back to "downloaded.pdf"
        when the URL has no usable path component at all.

    Example:
        >>> extract_filename_from_url("https://example.com/documents/kant.pdf")
        'kant.pdf'
        >>> extract_filename_from_url("https://example.com/api/download")
        'download.pdf'
        >>> extract_filename_from_url("https://example.com/")
        'downloaded.pdf'
    """
    parsed = urlparse(url)
    path = parsed.path

    if path:
        # The last path component is the best candidate for a filename.
        filename = path.split("/")[-1]
        if filename and "." in filename:
            return filename
        if filename:
            # Bug fix: the original returned the literal f"(unknown).pdf"
            # (an f-string with no placeholder), discarding the extracted
            # name. Append ".pdf" to the extension-less component instead.
            return f"{filename}.pdf"

    return "downloaded.pdf"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Main Tool Implementation
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _error_output(error: str) -> ParsePdfOutput:
    """Build a zeroed ParsePdfOutput that carries only an error message.

    Factored out because the handler previously duplicated this
    eleven-field constructor call verbatim in three error paths.
    """
    return ParsePdfOutput(
        success=False,
        document_name="",
        source_id="",
        pages=0,
        chunks_count=0,
        cost_ocr=0.0,
        cost_llm=0.0,
        cost_total=0.0,
        output_dir="",
        metadata={},
        error=error,
    )


async def parse_pdf_handler(input_data: ParsePdfInput) -> ParsePdfOutput:
    """Process a PDF document with optimal pre-configured parameters.

    This is the main handler for the parse_pdf MCP tool. It processes PDFs
    through the Library RAG pipeline with the following fixed optimal settings:

    - LLM: Mistral API (mistral-medium-latest) for fast, high-quality processing
    - OCR: Mistral OCR with annotations (better TOC extraction, 3x cost)
    - Chunking: Semantic LLM-based chunking (argumentative units)
    - Ingestion: Automatic Weaviate vectorization and storage

    The tool accepts either a local file path or a URL. URLs are automatically
    downloaded before processing.

    Args:
        input_data: Validated input containing pdf_path (local path or URL).

    Returns:
        ParsePdfOutput containing processing results including:
        - success: Whether processing completed successfully
        - document_name: Name of the processed document
        - source_id: Unique identifier for retrieval
        - pages: Number of pages processed
        - chunks_count: Number of chunks created
        - cost_ocr: OCR cost in EUR
        - cost_llm: LLM cost in EUR
        - cost_total: Total processing cost
        - output_dir: Directory containing output files
        - metadata: Extracted document metadata
        - error: Error message if processing failed

    Example:
        >>> input_data = ParsePdfInput(pdf_path="/docs/aristotle.pdf")
        >>> result = await parse_pdf_handler(input_data)
        >>> result.success
        True
        >>> result.chunks_count > 0
        True
    """
    pdf_path = input_data.pdf_path
    logger.info(f"parse_pdf called with: {pdf_path}")

    try:
        if is_url(pdf_path):
            # URL input: download first, then process from bytes.
            logger.info(f"Detected URL input, downloading: {pdf_path}")
            pdf_bytes = await download_pdf(pdf_path)
            filename = extract_filename_from_url(pdf_path)

            result = process_pdf_bytes(
                file_bytes=pdf_bytes,
                filename=filename,
                output_dir=Path("output"),
                llm_provider=FIXED_LLM_PROVIDER,
                use_llm=FIXED_USE_LLM,
                llm_model=FIXED_LLM_MODEL,
                use_semantic_chunking=FIXED_USE_SEMANTIC_CHUNKING,
                use_ocr_annotations=FIXED_USE_OCR_ANNOTATIONS,
                ingest_to_weaviate=FIXED_INGEST_TO_WEAVIATE,
            )
        else:
            # Local file input: validate existence before processing.
            local_path = Path(pdf_path)
            if not local_path.exists():
                logger.error(f"PDF file not found: {pdf_path}")
                return _error_output(f"PDF file not found: {pdf_path}")

            logger.info(f"Processing local file: {local_path}")
            result = process_pdf(
                pdf_path=local_path,
                output_dir=Path("output"),
                use_llm=FIXED_USE_LLM,
                llm_provider=FIXED_LLM_PROVIDER,
                llm_model=FIXED_LLM_MODEL,
                use_semantic_chunking=FIXED_USE_SEMANTIC_CHUNKING,
                use_ocr_annotations=FIXED_USE_OCR_ANNOTATIONS,
                ingest_to_weaviate=FIXED_INGEST_TO_WEAVIATE,
            )

        # Convert pipeline result (a dict) to the output schema.
        success = result.get("success", False)
        document_name = result.get("document_name", "")
        # source_id defaults to the document name when the pipeline did
        # not produce a separate identifier.
        source_id = result.get("source_id", document_name)

        # Extract costs; cost_total falls back to the sum of components.
        cost_ocr = result.get("cost_ocr", 0.0)
        cost_llm = result.get("cost_llm", 0.0)
        cost_total = result.get("cost_total", cost_ocr + cost_llm)

        # Normalize metadata: an explicit None becomes an empty dict.
        metadata_raw = result.get("metadata", {})
        if metadata_raw is None:
            metadata_raw = {}

        output = ParsePdfOutput(
            success=success,
            document_name=document_name,
            source_id=source_id,
            pages=result.get("pages", 0),
            chunks_count=result.get("chunks_count", 0),
            cost_ocr=cost_ocr,
            cost_llm=cost_llm,
            cost_total=cost_total,
            output_dir=str(result.get("output_dir", "")),
            metadata=metadata_raw,
            error=result.get("error"),
        )

        if success:
            logger.info(
                f"Successfully processed {document_name}: "
                f"{output.chunks_count} chunks, {output.cost_total:.4f} EUR"
            )
        else:
            logger.error(f"Failed to process {pdf_path}: {output.error}")

        return output

    except httpx.HTTPError as e:
        # Download-specific failure (network error, bad status, timeout).
        logger.error(f"HTTP error downloading PDF: {e}")
        return _error_output(f"Failed to download PDF: {e}")
    except Exception as e:
        # Catch-all boundary: this is an MCP tool handler, so errors are
        # reported in the output schema rather than propagated.
        logger.error(f"Error processing PDF: {e}", exc_info=True)
        return _error_output(f"Processing error: {str(e)}")
|
||||
Reference in New Issue
Block a user