Add Library RAG project and cleanup root directory

- Add complete Library RAG application (Flask + MCP server)
  - PDF processing pipeline with OCR and LLM extraction
  - Weaviate vector database integration (BGE-M3 embeddings)
  - Flask web interface with search and document management
  - MCP server for Claude Desktop integration
  - Comprehensive test suite (134 tests)

- Clean up root directory
  - Remove obsolete documentation files
  - Remove backup and temporary files
  - Update autonomous agent configuration

- Update prompts
  - Enhance initializer bis prompt with better instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-30 11:57:12 +01:00
parent 48470236da
commit d2f7165120
84 changed files with 26517 additions and 2 deletions

View File

@@ -0,0 +1,106 @@
"""
MCP Tools for Library RAG Server.
This package contains all tool implementations for the Library RAG MCP server:
- Parsing tools: PDF ingestion with optimal parameters
- Retrieval tools: Semantic search and document management
- Exceptions: Custom exception classes for structured error handling
- Logging: Structured JSON logging configuration
"""
from mcp_tools.schemas import (
ParsePdfInput,
ParsePdfOutput,
SearchChunksInput,
SearchChunksOutput,
SearchSummariesInput,
SearchSummariesOutput,
GetDocumentInput,
GetDocumentOutput,
ListDocumentsInput,
ListDocumentsOutput,
GetChunksByDocumentInput,
GetChunksByDocumentOutput,
FilterByAuthorInput,
FilterByAuthorOutput,
DeleteDocumentInput,
DeleteDocumentOutput,
)
from mcp_tools.exceptions import (
MCPToolError,
WeaviateConnectionError,
PDFProcessingError,
DocumentNotFoundError,
ValidationError,
LLMProcessingError,
DownloadError,
)
from mcp_tools.logging_config import (
setup_mcp_logging,
get_tool_logger,
ToolInvocationLogger,
log_tool_invocation,
log_weaviate_query,
redact_sensitive_data,
redact_dict,
)
from mcp_tools.parsing_tools import parse_pdf_handler
from mcp_tools.retrieval_tools import (
search_chunks_handler,
search_summaries_handler,
get_document_handler,
list_documents_handler,
get_chunks_by_document_handler,
filter_by_author_handler,
delete_document_handler,
)
__all__ = [
# Parsing tools
"parse_pdf_handler",
# Retrieval tools
"search_chunks_handler",
"search_summaries_handler",
"get_document_handler",
"list_documents_handler",
"get_chunks_by_document_handler",
"filter_by_author_handler",
"delete_document_handler",
# Parsing schemas
"ParsePdfInput",
"ParsePdfOutput",
# Retrieval schemas
"SearchChunksInput",
"SearchChunksOutput",
"SearchSummariesInput",
"SearchSummariesOutput",
"GetDocumentInput",
"GetDocumentOutput",
"ListDocumentsInput",
"ListDocumentsOutput",
"GetChunksByDocumentInput",
"GetChunksByDocumentOutput",
"FilterByAuthorInput",
"FilterByAuthorOutput",
"DeleteDocumentInput",
"DeleteDocumentOutput",
# Exceptions
"MCPToolError",
"WeaviateConnectionError",
"PDFProcessingError",
"DocumentNotFoundError",
"ValidationError",
"LLMProcessingError",
"DownloadError",
# Logging
"setup_mcp_logging",
"get_tool_logger",
"ToolInvocationLogger",
"log_tool_invocation",
"log_weaviate_query",
"redact_sensitive_data",
"redact_dict",
]

View File

@@ -0,0 +1,297 @@
"""Custom exception classes for Library RAG MCP Server.
This module defines custom exception classes used throughout the MCP server
for structured error handling and consistent error responses.
Exception Hierarchy:
MCPToolError (base)
├── WeaviateConnectionError - Database connection failures
├── PDFProcessingError - PDF parsing/OCR failures
├── DocumentNotFoundError - Document/chunk retrieval failures
└── ValidationError - Input validation failures
Example:
Raise and catch custom exceptions::
from mcp_tools.exceptions import WeaviateConnectionError
try:
client = connect_to_weaviate()
except Exception as e:
raise WeaviateConnectionError("Failed to connect") from e
"""
from __future__ import annotations
from typing import Any, Dict, Optional
class MCPToolError(Exception):
    """Base class for every custom error raised by the MCP server.

    Carries structured information (code, details, wrapped cause) that can
    be serialized into an MCP error response via :meth:`to_dict`.

    Attributes:
        message: Human-readable error description.
        error_code: Machine-readable error code for categorization.
        details: Additional context about the error.
        original_error: The underlying exception if this wraps another error.
    """

    def __init__(
        self,
        message: str,
        *,
        error_code: str = "MCP_ERROR",
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Store the structured error fields and initialize Exception.

        Args:
            message: Human-readable error description.
            error_code: Machine-readable error code (default: "MCP_ERROR").
            details: Additional context about the error (optional).
            original_error: The underlying exception if wrapping (optional).
        """
        super().__init__(message)
        self.message = message
        self.error_code = error_code
        self.original_error = original_error
        self.details = details or {}

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the error into a JSON-compatible dictionary.

        Returns:
            Dictionary with error information suitable for MCP responses.
        """
        payload: Dict[str, Any] = {
            "error": True,
            "error_code": self.error_code,
            "message": self.message,
        }
        if self.details:
            payload["details"] = self.details
        if self.original_error:
            payload["original_error"] = str(self.original_error)
        return payload

    def __str__(self) -> str:
        """Render as "[CODE] message", appending the cause when present."""
        base = f"[{self.error_code}] {self.message}"
        if self.original_error:
            return f"{base} (caused by: {self.original_error})"
        return base
class WeaviateConnectionError(MCPToolError):
    """Error raised when the Weaviate vector database cannot be reached.

    Signals that the MCP server failed to establish or keep a connection
    to Weaviate. The error code is fixed to "WEAVIATE_CONNECTION_ERROR".

    Example:
        >>> raise WeaviateConnectionError(
        ...     "Cannot connect to Weaviate at localhost:8080",
        ...     details={"host": "localhost", "port": 8080}
        ... )
    """

    def __init__(
        self,
        message: str = "Failed to connect to Weaviate",
        *,
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Forward to MCPToolError with the connection-specific error code.

        Args:
            message: Error description (default: "Failed to connect to Weaviate").
            details: Additional context (host, port, etc.).
            original_error: The underlying connection exception.
        """
        # Only the error code is fixed here; everything else passes through.
        super().__init__(message, error_code="WEAVIATE_CONNECTION_ERROR",
                         details=details, original_error=original_error)
class PDFProcessingError(MCPToolError):
    """Error raised when PDF parsing, OCR, or ingestion fails.

    Covers any failure inside the PDF processing pipeline. The error code
    is fixed to "PDF_PROCESSING_ERROR".

    Example:
        >>> raise PDFProcessingError(
        ...     "OCR failed for page 5",
        ...     details={"page": 5, "pdf_path": "/docs/test.pdf"}
        ... )
    """

    def __init__(
        self,
        message: str = "PDF processing failed",
        *,
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Forward to MCPToolError with the PDF-specific error code.

        Args:
            message: Error description (default: "PDF processing failed").
            details: Additional context (pdf_path, page, step, etc.).
            original_error: The underlying processing exception.
        """
        # Only the error code is fixed here; everything else passes through.
        super().__init__(message, error_code="PDF_PROCESSING_ERROR",
                         details=details, original_error=original_error)
class DocumentNotFoundError(MCPToolError):
    """Error raised when a requested document, chunk, or summary is missing.

    Used by retrieval operations when nothing matches the requested
    identifier in Weaviate. The error code is fixed to "DOCUMENT_NOT_FOUND".

    Example:
        >>> raise DocumentNotFoundError(
        ...     "Document not found",
        ...     details={"source_id": "platon-menon"}
        ... )
    """

    def __init__(
        self,
        message: str = "Document not found",
        *,
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Forward to MCPToolError with the not-found error code.

        Args:
            message: Error description (default: "Document not found").
            details: Additional context (source_id, query, etc.).
            original_error: The underlying exception if any.
        """
        # Only the error code is fixed here; everything else passes through.
        super().__init__(message, error_code="DOCUMENT_NOT_FOUND",
                         details=details, original_error=original_error)
class ValidationError(MCPToolError):
    """Error raised when user input fails validation.

    Covers invalid paths, malformed parameters, and similar caller mistakes.
    The error code is fixed to "VALIDATION_ERROR".

    Example:
        >>> raise ValidationError(
        ...     "Invalid PDF path",
        ...     details={"path": "/nonexistent/file.pdf", "reason": "File not found"}
        ... )
    """

    def __init__(
        self,
        message: str = "Validation failed",
        *,
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Forward to MCPToolError with the validation error code.

        Args:
            message: Error description (default: "Validation failed").
            details: Additional context (field, value, reason, etc.).
            original_error: The underlying validation exception.
        """
        # Only the error code is fixed here; everything else passes through.
        super().__init__(message, error_code="VALIDATION_ERROR",
                         details=details, original_error=original_error)
class LLMProcessingError(MCPToolError):
    """Error raised when an LLM call fails.

    Covers failures of the configured LLM (Mistral or Ollama) during
    metadata extraction, chunking, or any other LLM-backed step. The
    error code is fixed to "LLM_PROCESSING_ERROR".

    Example:
        >>> raise LLMProcessingError(
        ...     "LLM timeout during metadata extraction",
        ...     details={"provider": "ollama", "model": "mistral", "step": "metadata"}
        ... )
    """

    def __init__(
        self,
        message: str = "LLM processing failed",
        *,
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Forward to MCPToolError with the LLM-specific error code.

        Args:
            message: Error description (default: "LLM processing failed").
            details: Additional context (provider, model, step, etc.).
            original_error: The underlying LLM exception.
        """
        # Only the error code is fixed here; everything else passes through.
        super().__init__(message, error_code="LLM_PROCESSING_ERROR",
                         details=details, original_error=original_error)
class DownloadError(MCPToolError):
    """Error raised when fetching a file from a URL fails.

    Used when the MCP server cannot download a PDF from a caller-provided
    URL. The error code is fixed to "DOWNLOAD_ERROR".

    Example:
        >>> raise DownloadError(
        ...     "Failed to download PDF",
        ...     details={"url": "https://example.com/doc.pdf", "status_code": 404}
        ... )
    """

    def __init__(
        self,
        message: str = "File download failed",
        *,
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Forward to MCPToolError with the download error code.

        Args:
            message: Error description (default: "File download failed").
            details: Additional context (url, status_code, etc.).
            original_error: The underlying HTTP exception.
        """
        # Only the error code is fixed here; everything else passes through.
        super().__init__(message, error_code="DOWNLOAD_ERROR",
                         details=details, original_error=original_error)

View File

@@ -0,0 +1,462 @@
"""Structured JSON logging configuration for Library RAG MCP Server.
This module provides structured JSON logging with sensitive data filtering
and tool invocation tracking.
Features:
- JSON-formatted log output for machine parsing
- Sensitive data filtering (API keys, passwords)
- Tool invocation logging with timing
- Configurable log levels via environment variable
Example:
Configure logging at server startup::
from mcp_tools.logging_config import setup_mcp_logging, get_tool_logger
# Setup logging
logger = setup_mcp_logging(log_level="INFO")
# Get tool-specific logger
tool_logger = get_tool_logger("search_chunks")
tool_logger.info("Processing query", extra={"query": "justice"})
"""
from __future__ import annotations
import json
import logging
import os
import re
import sys
import time
from contextlib import contextmanager
from datetime import datetime, timezone
from functools import wraps
from pathlib import Path
from typing import Any, Callable, Dict, Generator, Literal, Optional, TypeVar, cast
# Type variable for decorator return type preservation
F = TypeVar("F", bound=Callable[..., Any])
# =============================================================================
# Sensitive Data Patterns
# =============================================================================
# Each entry pairs a compiled regex with its replacement; the captured prefix
# (e.g. "api_key=") is kept and only the secret value itself is masked.
SENSITIVE_PATTERNS = [
    # API keys
    (re.compile(r'(api[_-]?key\s*[=:]\s*)["\']?[\w-]{20,}["\']?', re.I), r"\1***REDACTED***"),
    (re.compile(r'(bearer\s+)[\w-]{20,}', re.I), r"\1***REDACTED***"),
    (re.compile(r'(authorization\s*[=:]\s*)["\']?[\w-]{20,}["\']?', re.I), r"\1***REDACTED***"),
    # Mistral API key format
    (re.compile(r'(MISTRAL_API_KEY\s*[=:]\s*)["\']?[\w-]+["\']?', re.I), r"\1***REDACTED***"),
    # Generic secrets
    (re.compile(r'(password\s*[=:]\s*)["\']?[^\s"\']+["\']?', re.I), r"\1***REDACTED***"),
    (re.compile(r'(secret\s*[=:]\s*)["\']?[\w-]+["\']?', re.I), r"\1***REDACTED***"),
    (re.compile(r'(token\s*[=:]\s*)["\']?[\w-]{20,}["\']?', re.I), r"\1***REDACTED***"),
]


def redact_sensitive_data(message: str) -> str:
    """Remove sensitive data from log messages.

    Applies every pattern in SENSITIVE_PATTERNS in order. Note that the
    api-key/bearer/token patterns only match secrets of 20+ characters.

    Args:
        message: The log message to sanitize.

    Returns:
        Sanitized message with sensitive data redacted.

    Example:
        >>> redact_sensitive_data("api_key=abcdefghij0123456789")
        'api_key=***REDACTED***'
    """
    # Fixed: the previous docstring example used a 14-character key, which
    # the {20,} quantifier does NOT redact — the example result was wrong.
    result = message
    for pattern, replacement in SENSITIVE_PATTERNS:
        result = pattern.sub(replacement, result)
    return result
def redact_dict(data: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *data* with sensitive entries replaced.

    Keys that look secret-bearing (contain "key", "secret", "token" or
    "password", or match a known sensitive name after lowercasing and
    dash-to-underscore normalization) have their values replaced with
    ``***REDACTED***``. Nested dicts are processed recursively and string
    values are scrubbed via redact_sensitive_data().

    Args:
        data: Dictionary that may contain sensitive data.

    Returns:
        New dictionary with sensitive values redacted.
    """
    sensitive_keys = {
        "api_key", "apikey", "api-key",
        "password", "passwd", "pwd",
        "secret", "token", "auth",
        "authorization", "bearer",
        "mistral_api_key", "MISTRAL_API_KEY",
    }
    suspicious_fragments = ("key", "secret", "token", "password")

    def _scrub(key: str, value: Any) -> Any:
        # Normalize the key so "API-Key" and "api_key" are treated alike.
        normalized = key.lower().replace("-", "_")
        if normalized in sensitive_keys or any(frag in normalized for frag in suspicious_fragments):
            return "***REDACTED***"
        if isinstance(value, dict):
            return redact_dict(value)
        if isinstance(value, str):
            return redact_sensitive_data(value)
        return value

    return {key: _scrub(key, value) for key, value in data.items()}
# =============================================================================
# JSON Log Formatter
# =============================================================================
class JSONLogFormatter(logging.Formatter):
    """Formatter that renders each log record as a single line of JSON.

    Sensitive data is redacted from both the message and any extra fields
    before serialization.

    JSON Structure:
        {
            "timestamp": "2024-12-24T10:30:00.000Z",
            "level": "INFO",
            "logger": "library-rag-mcp.search_chunks",
            "message": "Processing query",
            "tool": "search_chunks",
            "duration_ms": 123,
            ...extra fields...
        }
    """

    # Built-in LogRecord attributes; anything NOT in this set (and not
    # underscore-prefixed) is treated as an "extra" field and serialized.
    _STANDARD_ATTRS = {
        "name", "msg", "args", "levelname", "levelno", "pathname",
        "filename", "module", "lineno", "funcName", "created",
        "msecs", "relativeCreated", "thread", "threadName",
        "processName", "process", "exc_info", "exc_text", "stack_info",
        "message", "taskName",
    }

    def format(self, record: logging.LogRecord) -> str:
        """Format the log record as JSON.

        Args:
            record: The log record to format.

        Returns:
            JSON-formatted log string.
        """
        entry: Dict[str, Any] = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "message": redact_sensitive_data(record.getMessage()),
        }
        if record.exc_info:
            entry["exception"] = self.formatException(record.exc_info)
        # Copy caller-supplied extra fields, redacting as we go.
        for attr, value in record.__dict__.items():
            if attr in self._STANDARD_ATTRS or attr.startswith("_"):
                continue
            if isinstance(value, dict):
                entry[attr] = redact_dict(value)
            elif isinstance(value, str):
                entry[attr] = redact_sensitive_data(value)
            else:
                entry[attr] = value
        return json.dumps(entry, default=str, ensure_ascii=False)
# =============================================================================
# Logging Setup
# =============================================================================
def setup_mcp_logging(
    log_level: str = "INFO",
    log_dir: Optional[Path] = None,
    json_format: bool = True,
) -> logging.Logger:
    """Configure structured logging for the MCP server.

    Sets up logging with JSON formatting to both file and stderr.
    Uses stderr for console output since stdout is used for MCP communication.
    The logger itself is set to DEBUG and filtering happens per handler: the
    file handler records everything while the stderr handler honours
    *log_level*.

    Args:
        log_level: Logging level for the stderr handler (DEBUG, INFO,
            WARNING, ERROR, CRITICAL). Unknown names fall back to INFO.
        log_dir: Directory for log files. Defaults to "logs" (created if
            missing).
        json_format: Use JSON formatting (default True).

    Returns:
        Configured logger instance for the MCP server.

    Example:
        >>> logger = setup_mcp_logging(log_level="DEBUG")
        >>> logger.info("Server started", extra={"port": 8080})
    """
    # Determine log directory
    if log_dir is None:
        log_dir = Path("logs")
    log_dir.mkdir(parents=True, exist_ok=True)

    # Get or create the root MCP logger. The logger level is DEBUG so every
    # record reaches the handlers; previously the logger was set to
    # *log_level*, which silently dropped DEBUG records before the file
    # handler (meant to "log everything") ever saw them.
    logger = logging.getLogger("library-rag-mcp")
    logger.setLevel(logging.DEBUG)
    console_level = getattr(logging, log_level.upper(), logging.INFO)

    # Close and remove existing handlers to avoid duplicate output and
    # leaked file descriptors when setup is called more than once.
    for handler in logger.handlers:
        handler.close()
    logger.handlers.clear()

    # Create formatters
    if json_format:
        formatter: logging.Formatter = JSONLogFormatter()
    else:
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )

    # File handler: captures everything, including DEBUG.
    file_handler = logging.FileHandler(
        log_dir / "mcp_server.log",
        encoding="utf-8",
    )
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    # Stderr handler (for console output - stdout is for MCP); filters at
    # the requested level.
    stderr_handler = logging.StreamHandler(sys.stderr)
    stderr_handler.setLevel(console_level)
    stderr_handler.setFormatter(formatter)
    logger.addHandler(stderr_handler)

    # Prevent propagation to root logger
    logger.propagate = False
    return logger
def get_tool_logger(tool_name: str) -> logging.Logger:
    """Return the child logger dedicated to one MCP tool.

    The logger is named "library-rag-mcp.<tool_name>", so its records are
    handled by whatever handlers setup_mcp_logging() configured on the
    parent "library-rag-mcp" logger.

    Args:
        tool_name: Name of the MCP tool (e.g., "search_chunks", "parse_pdf").

    Returns:
        Logger instance for the tool.

    Example:
        >>> logger = get_tool_logger("search_chunks")
        >>> logger.info("Query processed", extra={"results": 10})
    """
    qualified_name = f"library-rag-mcp.{tool_name}"
    return logging.getLogger(qualified_name)
# =============================================================================
# Tool Invocation Logging
# =============================================================================
class ToolInvocationLogger:
    """Context manager that logs one tool invocation with wall-clock timing.

    On entry it records the (redacted) inputs and a start event; on exit it
    emits either a success event with duration or an error event carrying
    the exception details. Exceptions are never suppressed.

    Example:
        >>> with ToolInvocationLogger("search_chunks", {"query": "justice"}) as inv:
        ...     result = do_search()
        ...     inv.set_result({"count": 10})
    """

    def __init__(
        self,
        tool_name: str,
        inputs: Dict[str, Any],
        logger: Optional[logging.Logger] = None,
    ) -> None:
        """Initialize the invocation logger.

        Args:
            tool_name: Name of the tool being invoked.
            inputs: Tool input parameters (will be redacted).
            logger: Logger to use. Defaults to tool-specific logger.
        """
        self.tool_name = tool_name
        self.inputs = redact_dict(inputs)
        self.logger = logger if logger is not None else get_tool_logger(tool_name)
        self.start_time: float = 0.0
        self.result: Optional[Dict[str, Any]] = None
        self.error: Optional[Exception] = None

    def __enter__(self) -> "ToolInvocationLogger":
        """Record the start time and emit the invocation_start event."""
        self.start_time = time.perf_counter()
        self.logger.info(
            f"Tool invocation started: {self.tool_name}",
            extra={
                "tool": self.tool_name,
                "event": "invocation_start",
                "inputs": self.inputs,
            },
        )
        return self

    def __exit__(
        self,
        exc_type: Optional[type],
        exc_val: Optional[BaseException],
        exc_tb: Any,
    ) -> Literal[False]:
        """Emit the completion (or failure) event with elapsed time."""
        elapsed_ms = round((time.perf_counter() - self.start_time) * 1000, 2)

        if exc_val is not None:
            # Failure path: log with traceback, then let the exception travel.
            self.logger.error(
                f"Tool invocation failed: {self.tool_name}",
                extra={
                    "tool": self.tool_name,
                    "event": "invocation_error",
                    "duration_ms": elapsed_ms,
                    "error_type": exc_type.__name__ if exc_type else "Unknown",
                    "error_message": str(exc_val),
                },
                exc_info=True,
            )
            return False  # never suppress the exception

        # Success path.
        success_extra: Dict[str, Any] = {
            "tool": self.tool_name,
            "event": "invocation_success",
            "duration_ms": elapsed_ms,
        }
        if self.result:
            success_extra["result_summary"] = self._summarize_result()
        self.logger.info(
            f"Tool invocation completed: {self.tool_name}",
            extra=success_extra,
        )
        return False

    def set_result(self, result: Dict[str, Any]) -> None:
        """Store the tool result so __exit__ can log a summary of it.

        Args:
            result: The tool result dictionary.
        """
        self.result = result

    def _summarize_result(self) -> Dict[str, Any]:
        """Pick the well-known metric fields out of the stored result.

        Returns:
            Dictionary with key result metrics (counts, success status, etc.)
        """
        if not self.result:
            return {}
        summary: Dict[str, Any] = {}
        # Scalar fields copied straight through when present (order matters
        # for downstream log readers, so it mirrors the historical output).
        for field in ("success", "total_count"):
            if field in self.result:
                summary[field] = self.result[field]
        hits = self.result.get("results")
        if "results" in self.result and isinstance(hits, list):
            summary["result_count"] = len(hits)
        for field in ("chunks_count", "cost_total", "found"):
            if field in self.result:
                summary[field] = self.result[field]
        if self.result.get("error"):
            summary["error"] = self.result["error"]
        return summary
@contextmanager
def log_tool_invocation(
    tool_name: str,
    inputs: Dict[str, Any],
) -> Generator[ToolInvocationLogger, None, None]:
    """Log one tool invocation around the wrapped block.

    Thin convenience wrapper: builds a ToolInvocationLogger and drives its
    context-manager protocol so callers only need a single ``with``.

    Args:
        tool_name: Name of the tool being invoked.
        inputs: Tool input parameters.

    Yields:
        ToolInvocationLogger instance for setting results.

    Example:
        >>> with log_tool_invocation("search_chunks", {"query": "test"}) as inv:
        ...     result = search(query)
        ...     inv.set_result(result)
    """
    with ToolInvocationLogger(tool_name, inputs) as invocation:
        yield invocation
def log_weaviate_query(
    operation: str,
    collection: str,
    filters: Optional[Dict[str, Any]] = None,
    result_count: Optional[int] = None,
    duration_ms: Optional[float] = None,
) -> None:
    """Emit a DEBUG record describing one Weaviate database query.

    All queries go through the dedicated "library-rag-mcp.weaviate" logger
    with a fixed payload shape so they can be filtered and aggregated.

    Args:
        operation: Query operation type (fetch, near_text, aggregate, etc.).
        collection: Weaviate collection name.
        filters: Query filters applied (optional, redacted before logging).
        result_count: Number of results returned (optional).
        duration_ms: Query duration in milliseconds (optional).

    Example:
        >>> log_weaviate_query(
        ...     operation="near_text",
        ...     collection="Chunk",
        ...     filters={"author": "Platon"},
        ...     result_count=10,
        ...     duration_ms=45.2
        ... )
    """
    weaviate_logger = logging.getLogger("library-rag-mcp.weaviate")
    payload: Dict[str, Any] = {
        "event": "weaviate_query",
        "operation": operation,
        "collection": collection,
    }
    # Optional fields are only attached when the caller supplied them.
    if filters:
        payload["filters"] = redact_dict(filters)
    if result_count is not None:
        payload["result_count"] = result_count
    if duration_ms is not None:
        payload["duration_ms"] = round(duration_ms, 2)
    weaviate_logger.debug(f"Weaviate {operation} on {collection}", extra=payload)

View File

@@ -0,0 +1,335 @@
"""Parsing tools for Library RAG MCP Server.
This module implements the parse_pdf tool with optimal pre-configured parameters
for PDF ingestion into the Library RAG system.
The tool uses fixed optimal parameters:
- llm_provider: "mistral" (API-based, fast)
- llm_model: "mistral-medium-latest" (best quality/cost ratio)
- use_semantic_chunking: True (LLM-based intelligent chunking)
- use_ocr_annotations: True (3x cost but better TOC extraction)
- ingest_to_weaviate: True (automatic vectorization and storage)
Example:
The parse_pdf tool can be invoked via MCP with a simple path::
{
"tool": "parse_pdf",
"arguments": {
"pdf_path": "/path/to/document.pdf"
}
}
Or with a URL::
{
"tool": "parse_pdf",
"arguments": {
"pdf_path": "https://example.com/document.pdf"
}
}
"""
from __future__ import annotations
import logging
import tempfile
from pathlib import Path
from typing import Any, Dict, Literal
from urllib.parse import urlparse
import httpx
from mcp_tools.schemas import ParsePdfInput, ParsePdfOutput
# Import pdf_pipeline for PDF processing
from utils.pdf_pipeline import process_pdf, process_pdf_bytes
from utils.types import LLMProvider
# Logger for this module
logger = logging.getLogger(__name__)
# =============================================================================
# Constants - Fixed Optimal Parameters
# =============================================================================
# These values are deliberately hard-coded: the parse_pdf MCP tool exposes
# only a path/URL argument and always processes with this fixed configuration.
# LLM provider configuration (Mistral API for best results)
FIXED_LLM_PROVIDER: LLMProvider = "mistral"
FIXED_LLM_MODEL = "mistral-medium-latest"
# Processing options (optimal settings for quality)
FIXED_USE_SEMANTIC_CHUNKING = True
FIXED_USE_OCR_ANNOTATIONS = True
FIXED_INGEST_TO_WEAVIATE = True
# Additional processing flags
FIXED_USE_LLM = True
# Note: The following flags are not supported by process_pdf() and should not be used
# FIXED_CLEAN_CHUNKS = True
# FIXED_EXTRACT_CONCEPTS = True
# FIXED_VALIDATE_OUTPUT = True
# =============================================================================
# Helper Functions
# =============================================================================
def is_url(path: str) -> bool:
    """Tell whether *path* is an HTTP(S) URL rather than a local path.

    Args:
        path: The path or URL string to check.

    Returns:
        True if the path is a valid HTTP/HTTPS URL, False otherwise.

    Example:
        >>> is_url("https://example.com/doc.pdf")
        True
        >>> is_url("/path/to/doc.pdf")
        False
    """
    try:
        scheme = urlparse(path).scheme
    except ValueError:
        # Malformed input that urlparse refuses is treated as a local path.
        return False
    return scheme in ("http", "https")
async def download_pdf(url: str, timeout: float = 60.0) -> bytes:
    """Fetch a PDF over HTTP(S) and return its raw bytes.

    Follows redirects and raises on non-2xx responses. An unexpected
    Content-Type is only logged as a warning rather than rejected, since
    many servers mislabel PDF downloads.

    Args:
        url: The URL to download from. Must be HTTP or HTTPS.
        timeout: Maximum time in seconds to wait for download.
            Defaults to 60 seconds.

    Returns:
        Raw bytes content of the downloaded PDF file.

    Raises:
        httpx.HTTPError: If the download fails (network error, HTTP error, etc.).
        ValueError: If the URL is invalid or not accessible.

    Example:
        >>> pdf_bytes = await download_pdf("https://example.com/document.pdf")
        >>> len(pdf_bytes) > 0
        True
    """
    logger.info(f"Downloading PDF from: {url}")
    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        response = await client.get(url)
        response.raise_for_status()
        content_type = response.headers.get("content-type", "")
        looks_like_pdf = (
            "application/pdf" in content_type.lower()
            or url.lower().endswith(".pdf")
        )
        if not looks_like_pdf:
            logger.warning(
                f"URL may not be a PDF (Content-Type: {content_type}), proceeding anyway"
            )
        logger.info(f"Downloaded {len(response.content)} bytes from {url}")
        return response.content
def extract_filename_from_url(url: str) -> str:
    """Derive a PDF filename from a URL.

    The last path component is used when available; a component without an
    extension gets ".pdf" appended; when the URL has no usable path
    component a generic fallback name is returned.

    Args:
        url: The URL to extract filename from.

    Returns:
        Extracted filename, always carrying an extension. Falls back to
        "downloaded.pdf" if no filename can be extracted.

    Example:
        >>> extract_filename_from_url("https://example.com/documents/kant.pdf")
        'kant.pdf'
        >>> extract_filename_from_url("https://example.com/api/download")
        'download.pdf'
        >>> extract_filename_from_url("https://example.com/")
        'downloaded.pdf'
    """
    parsed = urlparse(url)
    path = parsed.path
    if path:
        # Get the last path component
        filename = path.split("/")[-1]
        if filename and "." in filename:
            return filename
        if filename:
            # Bug fix: this previously returned the literal "(unknown).pdf"
            # (a placeholder-less f-string); append ".pdf" to the
            # extension-less component instead.
            return f"{filename}.pdf"
    return "downloaded.pdf"
# =============================================================================
# Main Tool Implementation
# =============================================================================
def _error_parse_output(error: str) -> ParsePdfOutput:
    """Build a zeroed ParsePdfOutput describing a failed run.

    Factored out because the handler previously duplicated this eleven-field
    construction in three separate error paths.

    Args:
        error: Human-readable failure description.

    Returns:
        ParsePdfOutput with success=False, empty identifiers, zero costs,
        and the given error message.
    """
    return ParsePdfOutput(
        success=False,
        document_name="",
        source_id="",
        pages=0,
        chunks_count=0,
        cost_ocr=0.0,
        cost_llm=0.0,
        cost_total=0.0,
        output_dir="",
        metadata={},
        error=error,
    )


async def parse_pdf_handler(input_data: ParsePdfInput) -> ParsePdfOutput:
    """Process a PDF document with optimal pre-configured parameters.

    This is the main handler for the parse_pdf MCP tool. It processes PDFs
    through the Library RAG pipeline with the following fixed optimal settings:
    - LLM: Mistral API (mistral-medium-latest) for fast, high-quality processing
    - OCR: Mistral OCR with annotations (better TOC extraction, 3x cost)
    - Chunking: Semantic LLM-based chunking (argumentative units)
    - Ingestion: Automatic Weaviate vectorization and storage

    The tool accepts either a local file path or a URL. URLs are automatically
    downloaded before processing.

    Args:
        input_data: Validated input containing pdf_path (local path or URL).

    Returns:
        ParsePdfOutput containing processing results including:
        - success: Whether processing completed successfully
        - document_name: Name of the processed document
        - source_id: Unique identifier for retrieval
        - pages: Number of pages processed
        - chunks_count: Number of chunks created
        - cost_ocr: OCR cost in EUR
        - cost_llm: LLM cost in EUR
        - cost_total: Total processing cost
        - output_dir: Directory containing output files
        - metadata: Extracted document metadata
        - error: Error message if processing failed

    Example:
        >>> input_data = ParsePdfInput(pdf_path="/docs/aristotle.pdf")
        >>> result = await parse_pdf_handler(input_data)
        >>> result.success
        True
        >>> result.chunks_count > 0
        True
    """
    pdf_path = input_data.pdf_path
    logger.info(f"parse_pdf called with: {pdf_path}")
    try:
        # Determine if input is a URL or local path
        if is_url(pdf_path):
            # Download PDF from URL, then process the raw bytes.
            logger.info(f"Detected URL input, downloading: {pdf_path}")
            pdf_bytes = await download_pdf(pdf_path)
            filename = extract_filename_from_url(pdf_path)
            result = process_pdf_bytes(
                file_bytes=pdf_bytes,
                filename=filename,
                output_dir=Path("output"),
                llm_provider=FIXED_LLM_PROVIDER,
                use_llm=FIXED_USE_LLM,
                llm_model=FIXED_LLM_MODEL,
                use_semantic_chunking=FIXED_USE_SEMANTIC_CHUNKING,
                use_ocr_annotations=FIXED_USE_OCR_ANNOTATIONS,
                ingest_to_weaviate=FIXED_INGEST_TO_WEAVIATE,
            )
        else:
            # Process local file
            local_path = Path(pdf_path)
            if not local_path.exists():
                logger.error(f"PDF file not found: {pdf_path}")
                return _error_parse_output(f"PDF file not found: {pdf_path}")
            logger.info(f"Processing local file: {local_path}")
            result = process_pdf(
                pdf_path=local_path,
                output_dir=Path("output"),
                use_llm=FIXED_USE_LLM,
                llm_provider=FIXED_LLM_PROVIDER,
                llm_model=FIXED_LLM_MODEL,
                use_semantic_chunking=FIXED_USE_SEMANTIC_CHUNKING,
                use_ocr_annotations=FIXED_USE_OCR_ANNOTATIONS,
                ingest_to_weaviate=FIXED_INGEST_TO_WEAVIATE,
            )

        # Convert pipeline result to output schema
        success = result.get("success", False)
        document_name = result.get("document_name", "")
        # source_id falls back to the document name when the pipeline did
        # not supply a dedicated identifier.
        source_id = result.get("source_id", document_name)
        # Extract costs; total defaults to the sum of the two components.
        cost_ocr = result.get("cost_ocr", 0.0)
        cost_llm = result.get("cost_llm", 0.0)
        cost_total = result.get("cost_total", cost_ocr + cost_llm)
        # Extract metadata, normalizing an explicit None to an empty dict.
        metadata_raw = result.get("metadata", {})
        if metadata_raw is None:
            metadata_raw = {}

        # Build output
        output = ParsePdfOutput(
            success=success,
            document_name=document_name,
            source_id=source_id,
            pages=result.get("pages", 0),
            chunks_count=result.get("chunks_count", 0),
            cost_ocr=cost_ocr,
            cost_llm=cost_llm,
            cost_total=cost_total,
            output_dir=str(result.get("output_dir", "")),
            metadata=metadata_raw,
            error=result.get("error"),
        )
        if success:
            logger.info(
                f"Successfully processed {document_name}: "
                f"{output.chunks_count} chunks, {output.cost_total:.4f} EUR"
            )
        else:
            logger.error(f"Failed to process {pdf_path}: {output.error}")
        return output
    except httpx.HTTPError as e:
        logger.error(f"HTTP error downloading PDF: {e}")
        return _error_parse_output(f"Failed to download PDF: {e}")
    except Exception as e:
        logger.error(f"Error processing PDF: {e}", exc_info=True)
        return _error_parse_output(f"Processing error: {str(e)}")

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,361 @@
"""
Pydantic schemas for MCP tool inputs and outputs.
All schemas use strict validation and include field descriptions
for automatic JSON schema generation in MCP tool definitions.
"""
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Field
# =============================================================================
# Parsing Tool Schemas
# =============================================================================
class ParsePdfInput(BaseModel):
    """Input schema for parse_pdf tool.

    A single required field: the PDF location, given either as a local
    filesystem path or as a URL to download.
    """

    # Non-empty by construction (min_length=1).
    pdf_path: str = Field(..., min_length=1, description="Path to the PDF file to process, or URL to download")
class ParsePdfOutput(BaseModel):
    """Output schema for parse_pdf tool."""
    # Outcome flag; when False, `error` carries the failure message.
    success: bool = Field(..., description="Whether processing succeeded")
    # Identity of the processed document.
    document_name: str = Field(..., description="Name of the processed document")
    source_id: str = Field(..., description="Unique identifier for the document")
    # Volume statistics for the ingestion run.
    pages: int = Field(..., description="Number of pages processed")
    chunks_count: int = Field(..., description="Number of chunks created")
    # Processing costs, all denominated in EUR.
    cost_ocr: float = Field(..., description="OCR processing cost in EUR")
    cost_llm: float = Field(..., description="LLM processing cost in EUR")
    cost_total: float = Field(..., description="Total processing cost in EUR")
    # Where the pipeline wrote its artifacts.
    output_dir: str = Field(..., description="Directory containing output files")
    # Free-form metadata dict; defaults to empty rather than None.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Extracted metadata (title, author, language, year)",
    )
    # None on success.
    error: Optional[str] = Field(None, description="Error message if failed")
# =============================================================================
# Retrieval Tool Schemas
# =============================================================================
class ChunkResult(BaseModel):
    """A single chunk result from search.

    One retrieved passage plus the provenance metadata needed to cite
    it (work, author, section path, canonical reference).
    """
    # The retrieved passage itself.
    text: str = Field(..., description="Chunk text content")
    # Vector-search score; higher is closer.
    similarity: float = Field(..., description="Similarity score (0-1)")
    source_id: str = Field(..., description="Source document ID (e.g., 'peirce_collected_papers')")
    # Optional scholarly citation form, when the source provides one.
    canonical_reference: Optional[str] = Field(None, description="Academic citation reference (e.g., 'CP 5.628', 'Ménon 80a')")
    section_path: str = Field(..., description="Hierarchical section path")
    chapter_title: Optional[str] = Field(None, description="Chapter title if available")
    # Bibliographic attribution.
    work_title: str = Field(..., description="Title of the work")
    work_author: str = Field(..., description="Author of the work")
    # Ordering key for reassembling chunks in document order.
    order_index: int = Field(..., description="Position in document")
class SearchChunksInput(BaseModel):
    """Input schema for search_chunks tool.

    A semantic query plus result-set controls and optional metadata
    filters; a filter left as None is not applied.
    """

    # Free-text query, bounded at 1000 characters.
    query: str = Field(..., min_length=1, max_length=1000, description="Semantic search query")
    # Result-set sizing and quality floor.
    limit: int = Field(default=10, ge=1, le=500, description="Maximum number of results to return")
    min_similarity: float = Field(default=0.0, ge=0.0, le=1.0, description="Minimum similarity threshold (0-1)")
    # Optional metadata filters.
    author_filter: Optional[str] = Field(None, description="Filter by author name")
    work_filter: Optional[str] = Field(None, description="Filter by work title")
    language_filter: Optional[str] = Field(None, description="Filter by language code (e.g., 'fr', 'en')")
class SearchChunksOutput(BaseModel):
    """Output schema for search_chunks tool."""

    # Matching chunks, empty when nothing cleared the filters.
    results: List[ChunkResult] = Field(default_factory=list, description="List of matching chunks")
    total_count: int = Field(..., description="Total number of results")
    # Echo of the caller's query, for traceability.
    query: str = Field(..., description="Original query")
class SummaryResult(BaseModel):
    """A single summary result from search."""
    # The summary passage itself.
    text: str = Field(..., description="Summary text")
    # Vector-search score; higher is closer.
    similarity: float = Field(..., description="Similarity score (0-1)")
    title: str = Field(..., description="Section title")
    section_path: str = Field(..., description="Hierarchical section path")
    # Depth in the document outline (1 is the top chapter level).
    level: int = Field(..., description="Hierarchy level (1=chapter, 2=section, etc.)")
    # Defaults to an empty list rather than None.
    concepts: List[str] = Field(default_factory=list, description="Key concepts")
    # Links the summary back to its source document.
    document_source_id: str = Field(..., description="Source document ID")
class SearchSummariesInput(BaseModel):
    """Input schema for search_summaries tool.

    Semantic query over section summaries, optionally restricted to a
    band of hierarchy levels (None leaves that bound open).
    """

    # Free-text query, bounded at 1000 characters.
    query: str = Field(..., min_length=1, max_length=1000, description="Semantic search query")
    limit: int = Field(default=10, ge=1, le=100, description="Maximum number of results to return")
    # Optional hierarchy-depth bounds, both within 1-5.
    min_level: Optional[int] = Field(None, ge=1, le=5, description="Minimum hierarchy level (1=chapter)")
    max_level: Optional[int] = Field(None, ge=1, le=5, description="Maximum hierarchy level")
class SearchSummariesOutput(BaseModel):
    """Output schema for search_summaries tool."""

    # Matching summaries, empty when nothing matched.
    results: List[SummaryResult] = Field(default_factory=list, description="List of matching summaries")
    total_count: int = Field(..., description="Total number of results")
    # Echo of the caller's query, for traceability.
    query: str = Field(..., description="Original query")
class GetDocumentInput(BaseModel):
    """Input schema for get_document tool.

    Looks up one document by its source ID; chunk inclusion is opt-in
    and capped by chunk_limit.
    """

    # Non-empty document identifier.
    source_id: str = Field(..., min_length=1, description="Document source ID (e.g., 'platon-menon')")
    include_chunks: bool = Field(default=False, description="Include document chunks in response")
    # Only consulted when include_chunks is True.
    chunk_limit: int = Field(default=50, ge=1, le=500, description="Maximum chunks to return if include_chunks=True")
class DocumentInfo(BaseModel):
    """Document information."""
    # Stable identifier used across chunk/summary records.
    source_id: str = Field(..., description="Unique document identifier")
    # Bibliographic attribution.
    work_title: str = Field(..., description="Title of the work")
    work_author: str = Field(..., description="Author of the work")
    edition: Optional[str] = Field(None, description="Edition information")
    pages: int = Field(..., description="Number of pages")
    language: str = Field(..., description="Document language")
    # Optional structural views; shape of these dicts is produced
    # upstream by the parsing pipeline.
    toc: Optional[Dict[str, Any]] = Field(None, description="Table of contents")
    hierarchy: Optional[Dict[str, Any]] = Field(None, description="Document hierarchy")
class GetDocumentOutput(BaseModel):
    """Output schema for get_document tool."""

    # None when the lookup missed; see `found` / `error`.
    document: Optional[DocumentInfo] = Field(None, description="Document information")
    # Populated only when the caller asked for chunks.
    chunks: List[ChunkResult] = Field(default_factory=list, description="Document chunks (if requested)")
    chunks_total: int = Field(default=0, description="Total number of chunks in document")
    found: bool = Field(..., description="Whether document was found")
    error: Optional[str] = Field(None, description="Error message if not found")
class ListDocumentsInput(BaseModel):
    """Input schema for list_documents tool.

    Optional metadata filters (None disables a filter) plus standard
    limit/offset pagination.
    """

    # Optional filters.
    author_filter: Optional[str] = Field(None, description="Filter by author name")
    work_filter: Optional[str] = Field(None, description="Filter by work title")
    language_filter: Optional[str] = Field(None, description="Filter by language code")
    # Pagination controls.
    limit: int = Field(default=50, ge=1, le=250, description="Maximum number of results")
    offset: int = Field(default=0, ge=0, description="Offset for pagination")
class DocumentSummary(BaseModel):
    """Summary of a document for listing."""
    # Stable identifier used across chunk/summary records.
    source_id: str = Field(..., description="Unique document identifier")
    # Bibliographic attribution.
    work_title: str = Field(..., description="Title of the work")
    work_author: str = Field(..., description="Author of the work")
    # Size indicators for the listing view.
    pages: int = Field(..., description="Number of pages")
    chunks_count: int = Field(..., description="Number of chunks")
    language: str = Field(..., description="Document language")
class ListDocumentsOutput(BaseModel):
    """Output schema for list_documents tool."""

    # One page of results; may be empty.
    documents: List[DocumentSummary] = Field(default_factory=list, description="List of documents")
    total_count: int = Field(..., description="Total number of documents")
    # Echo of the pagination actually applied.
    limit: int = Field(..., description="Applied limit")
    offset: int = Field(..., description="Applied offset")
class GetChunksByDocumentInput(BaseModel):
    """Input schema for get_chunks_by_document tool.

    Fetches chunks for one document in order, paginated, optionally
    restricted to a subtree of the section hierarchy.
    """

    # Non-empty document identifier.
    source_id: str = Field(..., min_length=1, description="Document source ID")
    # Pagination controls.
    limit: int = Field(default=50, ge=1, le=500, description="Maximum number of chunks to return")
    offset: int = Field(default=0, ge=0, description="Offset for pagination")
    # None means no section restriction.
    section_filter: Optional[str] = Field(None, description="Filter by section path prefix")
class GetChunksByDocumentOutput(BaseModel):
    """Output schema for get_chunks_by_document tool."""

    # One page of chunks in document order; may be empty.
    chunks: List[ChunkResult] = Field(default_factory=list, description="Ordered list of chunks")
    total_count: int = Field(..., description="Total chunks in document")
    document_source_id: str = Field(..., description="Document source ID")
    # Echo of the pagination actually applied.
    limit: int = Field(..., description="Applied limit")
    offset: int = Field(..., description="Applied offset")
class WorkInfo(BaseModel):
    """Information about a work."""
    # Bibliographic attribution.
    title: str = Field(..., description="Work title")
    author: str = Field(..., description="Author name")
    year: Optional[int] = Field(None, description="Publication year")
    language: str = Field(..., description="Language code")
    genre: Optional[str] = Field(None, description="Genre classification")
class AuthorWorkResult(BaseModel):
    """Work with its documents for author filtering."""

    work: WorkInfo = Field(..., description="Work information")
    # May be empty if no ingested documents back this work.
    documents: List[DocumentSummary] = Field(default_factory=list, description="Documents for this work")
    total_chunks: int = Field(..., description="Total chunks across all documents")
class FilterByAuthorInput(BaseModel):
    """Input schema for filter_by_author tool."""

    # Non-empty author name to match.
    author: str = Field(..., min_length=1, description="Author name to search for")
    # Chunk counting is on by default.
    include_chunk_counts: bool = Field(default=True, description="Include chunk counts in results")
class FilterByAuthorOutput(BaseModel):
    """Output schema for filter_by_author tool."""

    # Echo of the author the caller searched for.
    author: str = Field(..., description="Searched author name")
    works: List[AuthorWorkResult] = Field(default_factory=list, description="Works by this author")
    # Aggregate counts across everything matched.
    total_works: int = Field(..., description="Total number of works")
    total_documents: int = Field(..., description="Total number of documents")
    total_chunks: int = Field(..., description="Total number of chunks")
class DeleteDocumentInput(BaseModel):
    """Input schema for delete_document tool.

    Destructive operation: requires an explicit confirm=True in
    addition to the target source_id.
    """

    # Non-empty identifier of the document to remove.
    source_id: str = Field(..., min_length=1, description="Document source ID to delete")
    # Safety latch; defaults to False so accidental calls are no-ops.
    confirm: bool = Field(default=False, description="Must be True to confirm deletion")
class DeleteDocumentOutput(BaseModel):
    """Output schema for delete_document tool."""
    # Outcome flag; when False, `error` carries the failure message.
    success: bool = Field(..., description="Whether deletion succeeded")
    source_id: str = Field(..., description="Deleted document source ID")
    # How many records were removed from each collection.
    chunks_deleted: int = Field(..., description="Number of chunks deleted")
    summaries_deleted: int = Field(..., description="Number of summaries deleted")
    # None on success.
    error: Optional[str] = Field(None, description="Error message if failed")