Add Library RAG project and cleanup root directory

- Add complete Library RAG application (Flask + MCP server)
  - PDF processing pipeline with OCR and LLM extraction
  - Weaviate vector database integration (BGE-M3 embeddings)
  - Flask web interface with search and document management
  - MCP server for Claude Desktop integration
  - Comprehensive test suite (134 tests)

- Clean up root directory
  - Remove obsolete documentation files
  - Remove backup and temporary files
  - Update autonomous agent configuration

- Update prompts
  - Enhance initializer bis prompt with better instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-30 11:57:12 +01:00
parent 48470236da
commit d2f7165120
84 changed files with 26517 additions and 2 deletions

View File

@@ -0,0 +1,106 @@
"""
MCP Tools for Library RAG Server.
This package contains all tool implementations for the Library RAG MCP server:
- Parsing tools: PDF ingestion with optimal parameters
- Retrieval tools: Semantic search and document management
- Exceptions: Custom exception classes for structured error handling
- Logging: Structured JSON logging configuration
"""
from mcp_tools.schemas import (
ParsePdfInput,
ParsePdfOutput,
SearchChunksInput,
SearchChunksOutput,
SearchSummariesInput,
SearchSummariesOutput,
GetDocumentInput,
GetDocumentOutput,
ListDocumentsInput,
ListDocumentsOutput,
GetChunksByDocumentInput,
GetChunksByDocumentOutput,
FilterByAuthorInput,
FilterByAuthorOutput,
DeleteDocumentInput,
DeleteDocumentOutput,
)
from mcp_tools.exceptions import (
MCPToolError,
WeaviateConnectionError,
PDFProcessingError,
DocumentNotFoundError,
ValidationError,
LLMProcessingError,
DownloadError,
)
from mcp_tools.logging_config import (
setup_mcp_logging,
get_tool_logger,
ToolInvocationLogger,
log_tool_invocation,
log_weaviate_query,
redact_sensitive_data,
redact_dict,
)
from mcp_tools.parsing_tools import parse_pdf_handler
from mcp_tools.retrieval_tools import (
search_chunks_handler,
search_summaries_handler,
get_document_handler,
list_documents_handler,
get_chunks_by_document_handler,
filter_by_author_handler,
delete_document_handler,
)
__all__ = [
# Parsing tools
"parse_pdf_handler",
# Retrieval tools
"search_chunks_handler",
"search_summaries_handler",
"get_document_handler",
"list_documents_handler",
"get_chunks_by_document_handler",
"filter_by_author_handler",
"delete_document_handler",
# Parsing schemas
"ParsePdfInput",
"ParsePdfOutput",
# Retrieval schemas
"SearchChunksInput",
"SearchChunksOutput",
"SearchSummariesInput",
"SearchSummariesOutput",
"GetDocumentInput",
"GetDocumentOutput",
"ListDocumentsInput",
"ListDocumentsOutput",
"GetChunksByDocumentInput",
"GetChunksByDocumentOutput",
"FilterByAuthorInput",
"FilterByAuthorOutput",
"DeleteDocumentInput",
"DeleteDocumentOutput",
# Exceptions
"MCPToolError",
"WeaviateConnectionError",
"PDFProcessingError",
"DocumentNotFoundError",
"ValidationError",
"LLMProcessingError",
"DownloadError",
# Logging
"setup_mcp_logging",
"get_tool_logger",
"ToolInvocationLogger",
"log_tool_invocation",
"log_weaviate_query",
"redact_sensitive_data",
"redact_dict",
]

View File

@@ -0,0 +1,297 @@
"""Custom exception classes for Library RAG MCP Server.
This module defines custom exception classes used throughout the MCP server
for structured error handling and consistent error responses.
Exception Hierarchy:
MCPToolError (base)
├── WeaviateConnectionError - Database connection failures
├── PDFProcessingError - PDF parsing/OCR failures
├── DocumentNotFoundError - Document/chunk retrieval failures
└── ValidationError - Input validation failures
Example:
Raise and catch custom exceptions::
from mcp_tools.exceptions import WeaviateConnectionError
try:
client = connect_to_weaviate()
except Exception as e:
raise WeaviateConnectionError("Failed to connect") from e
"""
from __future__ import annotations
from typing import Any, Dict, Optional
class MCPToolError(Exception):
    """Base class for every custom error raised by the MCP server.

    Carries structured information (code, details, wrapped cause) that can
    be serialized into an MCP error response via :meth:`to_dict`.

    Attributes:
        message: Human-readable error description.
        error_code: Machine-readable error code for categorization.
        details: Additional context about the error.
        original_error: The underlying exception if this wraps another error.
    """

    def __init__(
        self,
        message: str,
        *,
        error_code: str = "MCP_ERROR",
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Store the structured error fields and initialize Exception.

        Args:
            message: Human-readable error description.
            error_code: Machine-readable error code (default: "MCP_ERROR").
            details: Additional context about the error (optional).
            original_error: The underlying exception if wrapping (optional).
        """
        super().__init__(message)
        self.message = message
        self.error_code = error_code
        self.original_error = original_error
        self.details = details or {}

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the error into a JSON-compatible dictionary.

        Returns:
            Dictionary with error information suitable for MCP responses.
        """
        payload: Dict[str, Any] = {
            "error": True,
            "error_code": self.error_code,
            "message": self.message,
        }
        if self.details:
            payload["details"] = self.details
        if self.original_error:
            payload["original_error"] = str(self.original_error)
        return payload

    def __str__(self) -> str:
        """Render as "[CODE] message", appending the cause when present."""
        base = f"[{self.error_code}] {self.message}"
        if self.original_error:
            return f"{base} (caused by: {self.original_error})"
        return base
class WeaviateConnectionError(MCPToolError):
    """Error raised when the Weaviate vector database cannot be reached.

    Signals that the MCP server failed to establish or keep a connection
    to Weaviate. The error code is fixed to "WEAVIATE_CONNECTION_ERROR".

    Example:
        >>> raise WeaviateConnectionError(
        ...     "Cannot connect to Weaviate at localhost:8080",
        ...     details={"host": "localhost", "port": 8080}
        ... )
    """

    def __init__(
        self,
        message: str = "Failed to connect to Weaviate",
        *,
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Forward to MCPToolError with the connection-specific error code.

        Args:
            message: Error description (default: "Failed to connect to Weaviate").
            details: Additional context (host, port, etc.).
            original_error: The underlying connection exception.
        """
        # Only the error code is fixed here; everything else passes through.
        super().__init__(message, error_code="WEAVIATE_CONNECTION_ERROR",
                         details=details, original_error=original_error)
class PDFProcessingError(MCPToolError):
    """Error raised when PDF parsing, OCR, or ingestion fails.

    Covers any failure inside the PDF processing pipeline. The error code
    is fixed to "PDF_PROCESSING_ERROR".

    Example:
        >>> raise PDFProcessingError(
        ...     "OCR failed for page 5",
        ...     details={"page": 5, "pdf_path": "/docs/test.pdf"}
        ... )
    """

    def __init__(
        self,
        message: str = "PDF processing failed",
        *,
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Forward to MCPToolError with the PDF-specific error code.

        Args:
            message: Error description (default: "PDF processing failed").
            details: Additional context (pdf_path, page, step, etc.).
            original_error: The underlying processing exception.
        """
        # Only the error code is fixed here; everything else passes through.
        super().__init__(message, error_code="PDF_PROCESSING_ERROR",
                         details=details, original_error=original_error)
class DocumentNotFoundError(MCPToolError):
    """Error raised when a requested document, chunk, or summary is missing.

    Used by retrieval operations when nothing matches the requested
    identifier in Weaviate. The error code is fixed to "DOCUMENT_NOT_FOUND".

    Example:
        >>> raise DocumentNotFoundError(
        ...     "Document not found",
        ...     details={"source_id": "platon-menon"}
        ... )
    """

    def __init__(
        self,
        message: str = "Document not found",
        *,
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Forward to MCPToolError with the not-found error code.

        Args:
            message: Error description (default: "Document not found").
            details: Additional context (source_id, query, etc.).
            original_error: The underlying exception if any.
        """
        # Only the error code is fixed here; everything else passes through.
        super().__init__(message, error_code="DOCUMENT_NOT_FOUND",
                         details=details, original_error=original_error)
class ValidationError(MCPToolError):
    """Error raised when user input fails validation.

    Covers invalid paths, malformed parameters, and similar caller mistakes.
    The error code is fixed to "VALIDATION_ERROR".

    Example:
        >>> raise ValidationError(
        ...     "Invalid PDF path",
        ...     details={"path": "/nonexistent/file.pdf", "reason": "File not found"}
        ... )
    """

    def __init__(
        self,
        message: str = "Validation failed",
        *,
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Forward to MCPToolError with the validation error code.

        Args:
            message: Error description (default: "Validation failed").
            details: Additional context (field, value, reason, etc.).
            original_error: The underlying validation exception.
        """
        # Only the error code is fixed here; everything else passes through.
        super().__init__(message, error_code="VALIDATION_ERROR",
                         details=details, original_error=original_error)
class LLMProcessingError(MCPToolError):
    """Error raised when an LLM call fails.

    Covers failures of the configured LLM (Mistral or Ollama) during
    metadata extraction, chunking, or any other LLM-backed step. The
    error code is fixed to "LLM_PROCESSING_ERROR".

    Example:
        >>> raise LLMProcessingError(
        ...     "LLM timeout during metadata extraction",
        ...     details={"provider": "ollama", "model": "mistral", "step": "metadata"}
        ... )
    """

    def __init__(
        self,
        message: str = "LLM processing failed",
        *,
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Forward to MCPToolError with the LLM-specific error code.

        Args:
            message: Error description (default: "LLM processing failed").
            details: Additional context (provider, model, step, etc.).
            original_error: The underlying LLM exception.
        """
        # Only the error code is fixed here; everything else passes through.
        super().__init__(message, error_code="LLM_PROCESSING_ERROR",
                         details=details, original_error=original_error)
class DownloadError(MCPToolError):
    """Error raised when fetching a file from a URL fails.

    Used when the MCP server cannot download a PDF from a caller-provided
    URL. The error code is fixed to "DOWNLOAD_ERROR".

    Example:
        >>> raise DownloadError(
        ...     "Failed to download PDF",
        ...     details={"url": "https://example.com/doc.pdf", "status_code": 404}
        ... )
    """

    def __init__(
        self,
        message: str = "File download failed",
        *,
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Forward to MCPToolError with the download error code.

        Args:
            message: Error description (default: "File download failed").
            details: Additional context (url, status_code, etc.).
            original_error: The underlying HTTP exception.
        """
        # Only the error code is fixed here; everything else passes through.
        super().__init__(message, error_code="DOWNLOAD_ERROR",
                         details=details, original_error=original_error)

View File

@@ -0,0 +1,462 @@
"""Structured JSON logging configuration for Library RAG MCP Server.
This module provides structured JSON logging with sensitive data filtering
and tool invocation tracking.
Features:
- JSON-formatted log output for machine parsing
- Sensitive data filtering (API keys, passwords)
- Tool invocation logging with timing
- Configurable log levels via environment variable
Example:
Configure logging at server startup::
from mcp_tools.logging_config import setup_mcp_logging, get_tool_logger
# Setup logging
logger = setup_mcp_logging(log_level="INFO")
# Get tool-specific logger
tool_logger = get_tool_logger("search_chunks")
tool_logger.info("Processing query", extra={"query": "justice"})
"""
from __future__ import annotations
import json
import logging
import os
import re
import sys
import time
from contextlib import contextmanager
from datetime import datetime, timezone
from functools import wraps
from pathlib import Path
from typing import Any, Callable, Dict, Generator, Literal, Optional, TypeVar, cast
# Type variable for decorator return type preservation
F = TypeVar("F", bound=Callable[..., Any])
# =============================================================================
# Sensitive Data Patterns
# =============================================================================
# Each entry pairs a compiled regex with its replacement; the captured prefix
# (e.g. "api_key=") is kept and only the secret value itself is masked.
SENSITIVE_PATTERNS = [
    # API keys
    (re.compile(r'(api[_-]?key\s*[=:]\s*)["\']?[\w-]{20,}["\']?', re.I), r"\1***REDACTED***"),
    (re.compile(r'(bearer\s+)[\w-]{20,}', re.I), r"\1***REDACTED***"),
    (re.compile(r'(authorization\s*[=:]\s*)["\']?[\w-]{20,}["\']?', re.I), r"\1***REDACTED***"),
    # Mistral API key format
    (re.compile(r'(MISTRAL_API_KEY\s*[=:]\s*)["\']?[\w-]+["\']?', re.I), r"\1***REDACTED***"),
    # Generic secrets
    (re.compile(r'(password\s*[=:]\s*)["\']?[^\s"\']+["\']?', re.I), r"\1***REDACTED***"),
    (re.compile(r'(secret\s*[=:]\s*)["\']?[\w-]+["\']?', re.I), r"\1***REDACTED***"),
    (re.compile(r'(token\s*[=:]\s*)["\']?[\w-]{20,}["\']?', re.I), r"\1***REDACTED***"),
]


def redact_sensitive_data(message: str) -> str:
    """Remove sensitive data from log messages.

    Applies every pattern in SENSITIVE_PATTERNS in order. Note that the
    api-key/bearer/token patterns only match secrets of 20+ characters.

    Args:
        message: The log message to sanitize.

    Returns:
        Sanitized message with sensitive data redacted.

    Example:
        >>> redact_sensitive_data("api_key=abcdefghij0123456789")
        'api_key=***REDACTED***'
    """
    # Fixed: the previous docstring example used a 14-character key, which
    # the {20,} quantifier does NOT redact — the example result was wrong.
    result = message
    for pattern, replacement in SENSITIVE_PATTERNS:
        result = pattern.sub(replacement, result)
    return result
def redact_dict(data: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *data* with sensitive entries replaced.

    Keys that look secret-bearing (contain "key", "secret", "token" or
    "password", or match a known sensitive name after lowercasing and
    dash-to-underscore normalization) have their values replaced with
    ``***REDACTED***``. Nested dicts are processed recursively and string
    values are scrubbed via redact_sensitive_data().

    Args:
        data: Dictionary that may contain sensitive data.

    Returns:
        New dictionary with sensitive values redacted.
    """
    sensitive_keys = {
        "api_key", "apikey", "api-key",
        "password", "passwd", "pwd",
        "secret", "token", "auth",
        "authorization", "bearer",
        "mistral_api_key", "MISTRAL_API_KEY",
    }
    suspicious_fragments = ("key", "secret", "token", "password")

    def _scrub(key: str, value: Any) -> Any:
        # Normalize the key so "API-Key" and "api_key" are treated alike.
        normalized = key.lower().replace("-", "_")
        if normalized in sensitive_keys or any(frag in normalized for frag in suspicious_fragments):
            return "***REDACTED***"
        if isinstance(value, dict):
            return redact_dict(value)
        if isinstance(value, str):
            return redact_sensitive_data(value)
        return value

    return {key: _scrub(key, value) for key, value in data.items()}
# =============================================================================
# JSON Log Formatter
# =============================================================================
class JSONLogFormatter(logging.Formatter):
    """Formatter that renders each log record as a single line of JSON.

    Sensitive data is redacted from both the message and any extra fields
    before serialization.

    JSON Structure:
        {
            "timestamp": "2024-12-24T10:30:00.000Z",
            "level": "INFO",
            "logger": "library-rag-mcp.search_chunks",
            "message": "Processing query",
            "tool": "search_chunks",
            "duration_ms": 123,
            ...extra fields...
        }
    """

    # Built-in LogRecord attributes; anything NOT in this set (and not
    # underscore-prefixed) is treated as an "extra" field and serialized.
    _STANDARD_ATTRS = {
        "name", "msg", "args", "levelname", "levelno", "pathname",
        "filename", "module", "lineno", "funcName", "created",
        "msecs", "relativeCreated", "thread", "threadName",
        "processName", "process", "exc_info", "exc_text", "stack_info",
        "message", "taskName",
    }

    def format(self, record: logging.LogRecord) -> str:
        """Format the log record as JSON.

        Args:
            record: The log record to format.

        Returns:
            JSON-formatted log string.
        """
        entry: Dict[str, Any] = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "message": redact_sensitive_data(record.getMessage()),
        }
        if record.exc_info:
            entry["exception"] = self.formatException(record.exc_info)
        # Copy caller-supplied extra fields, redacting as we go.
        for attr, value in record.__dict__.items():
            if attr in self._STANDARD_ATTRS or attr.startswith("_"):
                continue
            if isinstance(value, dict):
                entry[attr] = redact_dict(value)
            elif isinstance(value, str):
                entry[attr] = redact_sensitive_data(value)
            else:
                entry[attr] = value
        return json.dumps(entry, default=str, ensure_ascii=False)
# =============================================================================
# Logging Setup
# =============================================================================
def setup_mcp_logging(
    log_level: str = "INFO",
    log_dir: Optional[Path] = None,
    json_format: bool = True,
) -> logging.Logger:
    """Configure structured logging for the MCP server.

    Sets up logging with JSON formatting to both file and stderr.
    Uses stderr for console output since stdout is used for MCP communication.
    The logger itself is set to DEBUG and filtering happens per handler: the
    file handler records everything while the stderr handler honours
    *log_level*.

    Args:
        log_level: Logging level for the stderr handler (DEBUG, INFO,
            WARNING, ERROR, CRITICAL). Unknown names fall back to INFO.
        log_dir: Directory for log files. Defaults to "logs" (created if
            missing).
        json_format: Use JSON formatting (default True).

    Returns:
        Configured logger instance for the MCP server.

    Example:
        >>> logger = setup_mcp_logging(log_level="DEBUG")
        >>> logger.info("Server started", extra={"port": 8080})
    """
    # Determine log directory
    if log_dir is None:
        log_dir = Path("logs")
    log_dir.mkdir(parents=True, exist_ok=True)

    # Get or create the root MCP logger. The logger level is DEBUG so every
    # record reaches the handlers; previously the logger was set to
    # *log_level*, which silently dropped DEBUG records before the file
    # handler (meant to "log everything") ever saw them.
    logger = logging.getLogger("library-rag-mcp")
    logger.setLevel(logging.DEBUG)
    console_level = getattr(logging, log_level.upper(), logging.INFO)

    # Close and remove existing handlers to avoid duplicate output and
    # leaked file descriptors when setup is called more than once.
    for handler in logger.handlers:
        handler.close()
    logger.handlers.clear()

    # Create formatters
    if json_format:
        formatter: logging.Formatter = JSONLogFormatter()
    else:
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )

    # File handler: captures everything, including DEBUG.
    file_handler = logging.FileHandler(
        log_dir / "mcp_server.log",
        encoding="utf-8",
    )
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    # Stderr handler (for console output - stdout is for MCP); filters at
    # the requested level.
    stderr_handler = logging.StreamHandler(sys.stderr)
    stderr_handler.setLevel(console_level)
    stderr_handler.setFormatter(formatter)
    logger.addHandler(stderr_handler)

    # Prevent propagation to root logger
    logger.propagate = False
    return logger
def get_tool_logger(tool_name: str) -> logging.Logger:
    """Return the child logger dedicated to one MCP tool.

    The logger is named "library-rag-mcp.<tool_name>", so its records are
    handled by whatever handlers setup_mcp_logging() configured on the
    parent "library-rag-mcp" logger.

    Args:
        tool_name: Name of the MCP tool (e.g., "search_chunks", "parse_pdf").

    Returns:
        Logger instance for the tool.

    Example:
        >>> logger = get_tool_logger("search_chunks")
        >>> logger.info("Query processed", extra={"results": 10})
    """
    qualified_name = f"library-rag-mcp.{tool_name}"
    return logging.getLogger(qualified_name)
# =============================================================================
# Tool Invocation Logging
# =============================================================================
class ToolInvocationLogger:
    """Context manager that logs one tool invocation with wall-clock timing.

    On entry it records the (redacted) inputs and a start event; on exit it
    emits either a success event with duration or an error event carrying
    the exception details. Exceptions are never suppressed.

    Example:
        >>> with ToolInvocationLogger("search_chunks", {"query": "justice"}) as inv:
        ...     result = do_search()
        ...     inv.set_result({"count": 10})
    """

    def __init__(
        self,
        tool_name: str,
        inputs: Dict[str, Any],
        logger: Optional[logging.Logger] = None,
    ) -> None:
        """Initialize the invocation logger.

        Args:
            tool_name: Name of the tool being invoked.
            inputs: Tool input parameters (will be redacted).
            logger: Logger to use. Defaults to tool-specific logger.
        """
        self.tool_name = tool_name
        self.inputs = redact_dict(inputs)
        self.logger = logger if logger is not None else get_tool_logger(tool_name)
        self.start_time: float = 0.0
        self.result: Optional[Dict[str, Any]] = None
        self.error: Optional[Exception] = None

    def __enter__(self) -> "ToolInvocationLogger":
        """Record the start time and emit the invocation_start event."""
        self.start_time = time.perf_counter()
        self.logger.info(
            f"Tool invocation started: {self.tool_name}",
            extra={
                "tool": self.tool_name,
                "event": "invocation_start",
                "inputs": self.inputs,
            },
        )
        return self

    def __exit__(
        self,
        exc_type: Optional[type],
        exc_val: Optional[BaseException],
        exc_tb: Any,
    ) -> Literal[False]:
        """Emit the completion (or failure) event with elapsed time."""
        elapsed_ms = round((time.perf_counter() - self.start_time) * 1000, 2)

        if exc_val is not None:
            # Failure path: log with traceback, then let the exception travel.
            self.logger.error(
                f"Tool invocation failed: {self.tool_name}",
                extra={
                    "tool": self.tool_name,
                    "event": "invocation_error",
                    "duration_ms": elapsed_ms,
                    "error_type": exc_type.__name__ if exc_type else "Unknown",
                    "error_message": str(exc_val),
                },
                exc_info=True,
            )
            return False  # never suppress the exception

        # Success path.
        success_extra: Dict[str, Any] = {
            "tool": self.tool_name,
            "event": "invocation_success",
            "duration_ms": elapsed_ms,
        }
        if self.result:
            success_extra["result_summary"] = self._summarize_result()
        self.logger.info(
            f"Tool invocation completed: {self.tool_name}",
            extra=success_extra,
        )
        return False

    def set_result(self, result: Dict[str, Any]) -> None:
        """Store the tool result so __exit__ can log a summary of it.

        Args:
            result: The tool result dictionary.
        """
        self.result = result

    def _summarize_result(self) -> Dict[str, Any]:
        """Pick the well-known metric fields out of the stored result.

        Returns:
            Dictionary with key result metrics (counts, success status, etc.)
        """
        if not self.result:
            return {}
        summary: Dict[str, Any] = {}
        # Scalar fields copied straight through when present (order matters
        # for downstream log readers, so it mirrors the historical output).
        for field in ("success", "total_count"):
            if field in self.result:
                summary[field] = self.result[field]
        hits = self.result.get("results")
        if "results" in self.result and isinstance(hits, list):
            summary["result_count"] = len(hits)
        for field in ("chunks_count", "cost_total", "found"):
            if field in self.result:
                summary[field] = self.result[field]
        if self.result.get("error"):
            summary["error"] = self.result["error"]
        return summary
@contextmanager
def log_tool_invocation(
    tool_name: str,
    inputs: Dict[str, Any],
) -> Generator[ToolInvocationLogger, None, None]:
    """Log one tool invocation around the wrapped block.

    Thin convenience wrapper: builds a ToolInvocationLogger and drives its
    context-manager protocol so callers only need a single ``with``.

    Args:
        tool_name: Name of the tool being invoked.
        inputs: Tool input parameters.

    Yields:
        ToolInvocationLogger instance for setting results.

    Example:
        >>> with log_tool_invocation("search_chunks", {"query": "test"}) as inv:
        ...     result = search(query)
        ...     inv.set_result(result)
    """
    with ToolInvocationLogger(tool_name, inputs) as invocation:
        yield invocation
def log_weaviate_query(
    operation: str,
    collection: str,
    filters: Optional[Dict[str, Any]] = None,
    result_count: Optional[int] = None,
    duration_ms: Optional[float] = None,
) -> None:
    """Emit a DEBUG record describing one Weaviate database query.

    All queries go through the dedicated "library-rag-mcp.weaviate" logger
    with a fixed payload shape so they can be filtered and aggregated.

    Args:
        operation: Query operation type (fetch, near_text, aggregate, etc.).
        collection: Weaviate collection name.
        filters: Query filters applied (optional, redacted before logging).
        result_count: Number of results returned (optional).
        duration_ms: Query duration in milliseconds (optional).

    Example:
        >>> log_weaviate_query(
        ...     operation="near_text",
        ...     collection="Chunk",
        ...     filters={"author": "Platon"},
        ...     result_count=10,
        ...     duration_ms=45.2
        ... )
    """
    weaviate_logger = logging.getLogger("library-rag-mcp.weaviate")
    payload: Dict[str, Any] = {
        "event": "weaviate_query",
        "operation": operation,
        "collection": collection,
    }
    # Optional fields are only attached when the caller supplied them.
    if filters:
        payload["filters"] = redact_dict(filters)
    if result_count is not None:
        payload["result_count"] = result_count
    if duration_ms is not None:
        payload["duration_ms"] = round(duration_ms, 2)
    weaviate_logger.debug(f"Weaviate {operation} on {collection}", extra=payload)

View File

@@ -0,0 +1,335 @@
"""Parsing tools for Library RAG MCP Server.
This module implements the parse_pdf tool with optimal pre-configured parameters
for PDF ingestion into the Library RAG system.
The tool uses fixed optimal parameters:
- llm_provider: "mistral" (API-based, fast)
- llm_model: "mistral-medium-latest" (best quality/cost ratio)
- use_semantic_chunking: True (LLM-based intelligent chunking)
- use_ocr_annotations: True (3x cost but better TOC extraction)
- ingest_to_weaviate: True (automatic vectorization and storage)
Example:
The parse_pdf tool can be invoked via MCP with a simple path::
{
"tool": "parse_pdf",
"arguments": {
"pdf_path": "/path/to/document.pdf"
}
}
Or with a URL::
{
"tool": "parse_pdf",
"arguments": {
"pdf_path": "https://example.com/document.pdf"
}
}
"""
from __future__ import annotations
import logging
import tempfile
from pathlib import Path
from typing import Any, Dict, Literal
from urllib.parse import urlparse
import httpx
from mcp_tools.schemas import ParsePdfInput, ParsePdfOutput
# Import pdf_pipeline for PDF processing
from utils.pdf_pipeline import process_pdf, process_pdf_bytes
from utils.types import LLMProvider
# Logger for this module
logger = logging.getLogger(__name__)
# =============================================================================
# Constants - Fixed Optimal Parameters
# =============================================================================
# These values are deliberately hard-coded: the parse_pdf MCP tool exposes
# only a path/URL argument and always processes with this fixed configuration.
# LLM provider configuration (Mistral API for best results)
FIXED_LLM_PROVIDER: LLMProvider = "mistral"
FIXED_LLM_MODEL = "mistral-medium-latest"
# Processing options (optimal settings for quality)
FIXED_USE_SEMANTIC_CHUNKING = True
FIXED_USE_OCR_ANNOTATIONS = True
FIXED_INGEST_TO_WEAVIATE = True
# Additional processing flags
FIXED_USE_LLM = True
# Note: The following flags are not supported by process_pdf() and should not be used
# FIXED_CLEAN_CHUNKS = True
# FIXED_EXTRACT_CONCEPTS = True
# FIXED_VALIDATE_OUTPUT = True
# =============================================================================
# Helper Functions
# =============================================================================
def is_url(path: str) -> bool:
    """Tell whether *path* is an HTTP(S) URL rather than a local path.

    Args:
        path: The path or URL string to check.

    Returns:
        True if the path is a valid HTTP/HTTPS URL, False otherwise.

    Example:
        >>> is_url("https://example.com/doc.pdf")
        True
        >>> is_url("/path/to/doc.pdf")
        False
    """
    try:
        scheme = urlparse(path).scheme
    except ValueError:
        # Malformed input that urlparse refuses is treated as a local path.
        return False
    return scheme in ("http", "https")
async def download_pdf(url: str, timeout: float = 60.0) -> bytes:
    """Fetch a PDF over HTTP(S) and return its raw bytes.

    Follows redirects and raises on non-2xx responses. An unexpected
    Content-Type is only logged as a warning rather than rejected, since
    many servers mislabel PDF downloads.

    Args:
        url: The URL to download from. Must be HTTP or HTTPS.
        timeout: Maximum time in seconds to wait for download.
            Defaults to 60 seconds.

    Returns:
        Raw bytes content of the downloaded PDF file.

    Raises:
        httpx.HTTPError: If the download fails (network error, HTTP error, etc.).
        ValueError: If the URL is invalid or not accessible.

    Example:
        >>> pdf_bytes = await download_pdf("https://example.com/document.pdf")
        >>> len(pdf_bytes) > 0
        True
    """
    logger.info(f"Downloading PDF from: {url}")
    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        response = await client.get(url)
        response.raise_for_status()
        content_type = response.headers.get("content-type", "")
        looks_like_pdf = (
            "application/pdf" in content_type.lower()
            or url.lower().endswith(".pdf")
        )
        if not looks_like_pdf:
            logger.warning(
                f"URL may not be a PDF (Content-Type: {content_type}), proceeding anyway"
            )
        logger.info(f"Downloaded {len(response.content)} bytes from {url}")
        return response.content
def extract_filename_from_url(url: str) -> str:
    """Derive a PDF filename from a URL.

    The last path component is used when available; a component without an
    extension gets ".pdf" appended; when the URL has no usable path
    component a generic fallback name is returned.

    Args:
        url: The URL to extract filename from.

    Returns:
        Extracted filename, always carrying an extension. Falls back to
        "downloaded.pdf" if no filename can be extracted.

    Example:
        >>> extract_filename_from_url("https://example.com/documents/kant.pdf")
        'kant.pdf'
        >>> extract_filename_from_url("https://example.com/api/download")
        'download.pdf'
        >>> extract_filename_from_url("https://example.com/")
        'downloaded.pdf'
    """
    parsed = urlparse(url)
    path = parsed.path
    if path:
        # Get the last path component
        filename = path.split("/")[-1]
        if filename and "." in filename:
            return filename
        if filename:
            # Bug fix: this previously returned the literal "(unknown).pdf"
            # (a placeholder-less f-string); append ".pdf" to the
            # extension-less component instead.
            return f"{filename}.pdf"
    return "downloaded.pdf"
# =============================================================================
# Main Tool Implementation
# =============================================================================
def _error_parse_output(error: str) -> ParsePdfOutput:
    """Build a zeroed ParsePdfOutput describing a failed run.

    Factored out because the handler previously duplicated this eleven-field
    construction in three separate error paths.

    Args:
        error: Human-readable failure description.

    Returns:
        ParsePdfOutput with success=False, empty identifiers, zero costs,
        and the given error message.
    """
    return ParsePdfOutput(
        success=False,
        document_name="",
        source_id="",
        pages=0,
        chunks_count=0,
        cost_ocr=0.0,
        cost_llm=0.0,
        cost_total=0.0,
        output_dir="",
        metadata={},
        error=error,
    )


async def parse_pdf_handler(input_data: ParsePdfInput) -> ParsePdfOutput:
    """Process a PDF document with optimal pre-configured parameters.

    This is the main handler for the parse_pdf MCP tool. It processes PDFs
    through the Library RAG pipeline with the following fixed optimal settings:
    - LLM: Mistral API (mistral-medium-latest) for fast, high-quality processing
    - OCR: Mistral OCR with annotations (better TOC extraction, 3x cost)
    - Chunking: Semantic LLM-based chunking (argumentative units)
    - Ingestion: Automatic Weaviate vectorization and storage

    The tool accepts either a local file path or a URL. URLs are automatically
    downloaded before processing.

    Args:
        input_data: Validated input containing pdf_path (local path or URL).

    Returns:
        ParsePdfOutput containing processing results including:
        - success: Whether processing completed successfully
        - document_name: Name of the processed document
        - source_id: Unique identifier for retrieval
        - pages: Number of pages processed
        - chunks_count: Number of chunks created
        - cost_ocr: OCR cost in EUR
        - cost_llm: LLM cost in EUR
        - cost_total: Total processing cost
        - output_dir: Directory containing output files
        - metadata: Extracted document metadata
        - error: Error message if processing failed

    Example:
        >>> input_data = ParsePdfInput(pdf_path="/docs/aristotle.pdf")
        >>> result = await parse_pdf_handler(input_data)
        >>> result.success
        True
        >>> result.chunks_count > 0
        True
    """
    pdf_path = input_data.pdf_path
    logger.info(f"parse_pdf called with: {pdf_path}")
    try:
        # Determine if input is a URL or local path
        if is_url(pdf_path):
            # Download PDF from URL, then process the raw bytes.
            logger.info(f"Detected URL input, downloading: {pdf_path}")
            pdf_bytes = await download_pdf(pdf_path)
            filename = extract_filename_from_url(pdf_path)
            result = process_pdf_bytes(
                file_bytes=pdf_bytes,
                filename=filename,
                output_dir=Path("output"),
                llm_provider=FIXED_LLM_PROVIDER,
                use_llm=FIXED_USE_LLM,
                llm_model=FIXED_LLM_MODEL,
                use_semantic_chunking=FIXED_USE_SEMANTIC_CHUNKING,
                use_ocr_annotations=FIXED_USE_OCR_ANNOTATIONS,
                ingest_to_weaviate=FIXED_INGEST_TO_WEAVIATE,
            )
        else:
            # Process local file
            local_path = Path(pdf_path)
            if not local_path.exists():
                logger.error(f"PDF file not found: {pdf_path}")
                return _error_parse_output(f"PDF file not found: {pdf_path}")
            logger.info(f"Processing local file: {local_path}")
            result = process_pdf(
                pdf_path=local_path,
                output_dir=Path("output"),
                use_llm=FIXED_USE_LLM,
                llm_provider=FIXED_LLM_PROVIDER,
                llm_model=FIXED_LLM_MODEL,
                use_semantic_chunking=FIXED_USE_SEMANTIC_CHUNKING,
                use_ocr_annotations=FIXED_USE_OCR_ANNOTATIONS,
                ingest_to_weaviate=FIXED_INGEST_TO_WEAVIATE,
            )

        # Convert pipeline result to output schema
        success = result.get("success", False)
        document_name = result.get("document_name", "")
        # source_id falls back to the document name when the pipeline did
        # not supply a dedicated identifier.
        source_id = result.get("source_id", document_name)
        # Extract costs; total defaults to the sum of the two components.
        cost_ocr = result.get("cost_ocr", 0.0)
        cost_llm = result.get("cost_llm", 0.0)
        cost_total = result.get("cost_total", cost_ocr + cost_llm)
        # Extract metadata, normalizing an explicit None to an empty dict.
        metadata_raw = result.get("metadata", {})
        if metadata_raw is None:
            metadata_raw = {}

        # Build output
        output = ParsePdfOutput(
            success=success,
            document_name=document_name,
            source_id=source_id,
            pages=result.get("pages", 0),
            chunks_count=result.get("chunks_count", 0),
            cost_ocr=cost_ocr,
            cost_llm=cost_llm,
            cost_total=cost_total,
            output_dir=str(result.get("output_dir", "")),
            metadata=metadata_raw,
            error=result.get("error"),
        )
        if success:
            logger.info(
                f"Successfully processed {document_name}: "
                f"{output.chunks_count} chunks, {output.cost_total:.4f} EUR"
            )
        else:
            logger.error(f"Failed to process {pdf_path}: {output.error}")
        return output
    except httpx.HTTPError as e:
        logger.error(f"HTTP error downloading PDF: {e}")
        return _error_parse_output(f"Failed to download PDF: {e}")
    except Exception as e:
        logger.error(f"Error processing PDF: {e}", exc_info=True)
        return _error_parse_output(f"Processing error: {str(e)}")

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,361 @@
"""
Pydantic schemas for MCP tool inputs and outputs.
All schemas use strict validation and include field descriptions
for automatic JSON schema generation in MCP tool definitions.
"""
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Field
# =============================================================================
# Parsing Tool Schemas
# =============================================================================
class ParsePdfInput(BaseModel):
    """Input schema for parse_pdf tool.

    A single required field: the PDF location, given either as a local
    filesystem path or as a URL to download.
    """

    # Non-empty by construction (min_length=1).
    pdf_path: str = Field(..., min_length=1, description="Path to the PDF file to process, or URL to download")
class ParsePdfOutput(BaseModel):
    """Output schema for parse_pdf tool."""
    # Outcome flag; when False, `error` carries the failure message.
    success: bool = Field(..., description="Whether processing succeeded")
    # Identity of the processed document.
    document_name: str = Field(..., description="Name of the processed document")
    source_id: str = Field(..., description="Unique identifier for the document")
    # Volume statistics for the ingestion run.
    pages: int = Field(..., description="Number of pages processed")
    chunks_count: int = Field(..., description="Number of chunks created")
    # Processing costs, all denominated in EUR.
    cost_ocr: float = Field(..., description="OCR processing cost in EUR")
    cost_llm: float = Field(..., description="LLM processing cost in EUR")
    cost_total: float = Field(..., description="Total processing cost in EUR")
    # Where the pipeline wrote its artifacts.
    output_dir: str = Field(..., description="Directory containing output files")
    # Free-form metadata dict; defaults to empty rather than None.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Extracted metadata (title, author, language, year)",
    )
    # None on success.
    error: Optional[str] = Field(None, description="Error message if failed")
# =============================================================================
# Retrieval Tool Schemas
# =============================================================================
class ChunkResult(BaseModel):
    """A single chunk result from search.

    One retrieved passage plus the provenance metadata needed to cite
    it (work, author, section path, canonical reference).
    """
    # The retrieved passage itself.
    text: str = Field(..., description="Chunk text content")
    # Vector-search score; higher is closer.
    similarity: float = Field(..., description="Similarity score (0-1)")
    source_id: str = Field(..., description="Source document ID (e.g., 'peirce_collected_papers')")
    # Optional scholarly citation form, when the source provides one.
    canonical_reference: Optional[str] = Field(None, description="Academic citation reference (e.g., 'CP 5.628', 'Ménon 80a')")
    section_path: str = Field(..., description="Hierarchical section path")
    chapter_title: Optional[str] = Field(None, description="Chapter title if available")
    # Bibliographic attribution.
    work_title: str = Field(..., description="Title of the work")
    work_author: str = Field(..., description="Author of the work")
    # Ordering key for reassembling chunks in document order.
    order_index: int = Field(..., description="Position in document")
class SearchChunksInput(BaseModel):
    """Input schema for search_chunks tool.

    A semantic query plus result-set controls and optional metadata
    filters; a filter left as None is not applied.
    """

    # Free-text query, bounded at 1000 characters.
    query: str = Field(..., min_length=1, max_length=1000, description="Semantic search query")
    # Result-set sizing and quality floor.
    limit: int = Field(default=10, ge=1, le=500, description="Maximum number of results to return")
    min_similarity: float = Field(default=0.0, ge=0.0, le=1.0, description="Minimum similarity threshold (0-1)")
    # Optional metadata filters.
    author_filter: Optional[str] = Field(None, description="Filter by author name")
    work_filter: Optional[str] = Field(None, description="Filter by work title")
    language_filter: Optional[str] = Field(None, description="Filter by language code (e.g., 'fr', 'en')")
class SearchChunksOutput(BaseModel):
    """Output schema for search_chunks tool."""

    # Matching chunks, empty when nothing cleared the filters.
    results: List[ChunkResult] = Field(default_factory=list, description="List of matching chunks")
    total_count: int = Field(..., description="Total number of results")
    # Echo of the caller's query, for traceability.
    query: str = Field(..., description="Original query")
class SummaryResult(BaseModel):
    """A single summary result from search."""
    # The summary passage itself.
    text: str = Field(..., description="Summary text")
    # Vector-search score; higher is closer.
    similarity: float = Field(..., description="Similarity score (0-1)")
    title: str = Field(..., description="Section title")
    section_path: str = Field(..., description="Hierarchical section path")
    # Depth in the document outline (1 is the top chapter level).
    level: int = Field(..., description="Hierarchy level (1=chapter, 2=section, etc.)")
    # Defaults to an empty list rather than None.
    concepts: List[str] = Field(default_factory=list, description="Key concepts")
    # Links the summary back to its source document.
    document_source_id: str = Field(..., description="Source document ID")
class SearchSummariesInput(BaseModel):
    """Input schema for search_summaries tool.

    Semantic query over section summaries, optionally restricted to a
    band of hierarchy levels (None leaves that bound open).
    """

    # Free-text query, bounded at 1000 characters.
    query: str = Field(..., min_length=1, max_length=1000, description="Semantic search query")
    limit: int = Field(default=10, ge=1, le=100, description="Maximum number of results to return")
    # Optional hierarchy-depth bounds, both within 1-5.
    min_level: Optional[int] = Field(None, ge=1, le=5, description="Minimum hierarchy level (1=chapter)")
    max_level: Optional[int] = Field(None, ge=1, le=5, description="Maximum hierarchy level")
class SearchSummariesOutput(BaseModel):
    """Output schema for search_summaries tool."""

    # Matching summaries, empty when nothing matched.
    results: List[SummaryResult] = Field(default_factory=list, description="List of matching summaries")
    total_count: int = Field(..., description="Total number of results")
    # Echo of the caller's query, for traceability.
    query: str = Field(..., description="Original query")
class GetDocumentInput(BaseModel):
    """Input schema for get_document tool.

    Looks up one document by its source ID; chunk inclusion is opt-in
    and capped by chunk_limit.
    """

    # Non-empty document identifier.
    source_id: str = Field(..., min_length=1, description="Document source ID (e.g., 'platon-menon')")
    include_chunks: bool = Field(default=False, description="Include document chunks in response")
    # Only consulted when include_chunks is True.
    chunk_limit: int = Field(default=50, ge=1, le=500, description="Maximum chunks to return if include_chunks=True")
class DocumentInfo(BaseModel):
    """Document information."""
    # Stable identifier used across chunk/summary records.
    source_id: str = Field(..., description="Unique document identifier")
    # Bibliographic attribution.
    work_title: str = Field(..., description="Title of the work")
    work_author: str = Field(..., description="Author of the work")
    edition: Optional[str] = Field(None, description="Edition information")
    pages: int = Field(..., description="Number of pages")
    language: str = Field(..., description="Document language")
    # Optional structural views; shape of these dicts is produced
    # upstream by the parsing pipeline.
    toc: Optional[Dict[str, Any]] = Field(None, description="Table of contents")
    hierarchy: Optional[Dict[str, Any]] = Field(None, description="Document hierarchy")
class GetDocumentOutput(BaseModel):
    """Output schema for get_document tool."""

    # None when the lookup missed; see `found` / `error`.
    document: Optional[DocumentInfo] = Field(None, description="Document information")
    # Populated only when the caller asked for chunks.
    chunks: List[ChunkResult] = Field(default_factory=list, description="Document chunks (if requested)")
    chunks_total: int = Field(default=0, description="Total number of chunks in document")
    found: bool = Field(..., description="Whether document was found")
    error: Optional[str] = Field(None, description="Error message if not found")
class ListDocumentsInput(BaseModel):
    """Input schema for list_documents tool.

    Optional metadata filters (None disables a filter) plus standard
    limit/offset pagination.
    """

    # Optional filters.
    author_filter: Optional[str] = Field(None, description="Filter by author name")
    work_filter: Optional[str] = Field(None, description="Filter by work title")
    language_filter: Optional[str] = Field(None, description="Filter by language code")
    # Pagination controls.
    limit: int = Field(default=50, ge=1, le=250, description="Maximum number of results")
    offset: int = Field(default=0, ge=0, description="Offset for pagination")
class DocumentSummary(BaseModel):
    """Summary of a document for listing."""
    # Stable identifier used across chunk/summary records.
    source_id: str = Field(..., description="Unique document identifier")
    # Bibliographic attribution.
    work_title: str = Field(..., description="Title of the work")
    work_author: str = Field(..., description="Author of the work")
    # Size indicators for the listing view.
    pages: int = Field(..., description="Number of pages")
    chunks_count: int = Field(..., description="Number of chunks")
    language: str = Field(..., description="Document language")
class ListDocumentsOutput(BaseModel):
    """Output schema for list_documents tool."""

    # One page of results; may be empty.
    documents: List[DocumentSummary] = Field(default_factory=list, description="List of documents")
    total_count: int = Field(..., description="Total number of documents")
    # Echo of the pagination actually applied.
    limit: int = Field(..., description="Applied limit")
    offset: int = Field(..., description="Applied offset")
class GetChunksByDocumentInput(BaseModel):
    """Input schema for get_chunks_by_document tool.

    Fetches chunks for one document in order, paginated, optionally
    restricted to a subtree of the section hierarchy.
    """

    # Non-empty document identifier.
    source_id: str = Field(..., min_length=1, description="Document source ID")
    # Pagination controls.
    limit: int = Field(default=50, ge=1, le=500, description="Maximum number of chunks to return")
    offset: int = Field(default=0, ge=0, description="Offset for pagination")
    # None means no section restriction.
    section_filter: Optional[str] = Field(None, description="Filter by section path prefix")
class GetChunksByDocumentOutput(BaseModel):
    """Output schema for get_chunks_by_document tool."""

    # One page of chunks in document order; may be empty.
    chunks: List[ChunkResult] = Field(default_factory=list, description="Ordered list of chunks")
    total_count: int = Field(..., description="Total chunks in document")
    document_source_id: str = Field(..., description="Document source ID")
    # Echo of the pagination actually applied.
    limit: int = Field(..., description="Applied limit")
    offset: int = Field(..., description="Applied offset")
class WorkInfo(BaseModel):
    """Information about a work."""
    # Bibliographic attribution.
    title: str = Field(..., description="Work title")
    author: str = Field(..., description="Author name")
    year: Optional[int] = Field(None, description="Publication year")
    language: str = Field(..., description="Language code")
    genre: Optional[str] = Field(None, description="Genre classification")
class AuthorWorkResult(BaseModel):
    """Work with its documents for author filtering."""

    work: WorkInfo = Field(..., description="Work information")
    # May be empty if no ingested documents back this work.
    documents: List[DocumentSummary] = Field(default_factory=list, description="Documents for this work")
    total_chunks: int = Field(..., description="Total chunks across all documents")
class FilterByAuthorInput(BaseModel):
    """Input schema for filter_by_author tool."""

    # Non-empty author name to match.
    author: str = Field(..., min_length=1, description="Author name to search for")
    # Chunk counting is on by default.
    include_chunk_counts: bool = Field(default=True, description="Include chunk counts in results")
class FilterByAuthorOutput(BaseModel):
    """Output schema for filter_by_author tool."""

    # Echo of the author the caller searched for.
    author: str = Field(..., description="Searched author name")
    works: List[AuthorWorkResult] = Field(default_factory=list, description="Works by this author")
    # Aggregate counts across everything matched.
    total_works: int = Field(..., description="Total number of works")
    total_documents: int = Field(..., description="Total number of documents")
    total_chunks: int = Field(..., description="Total number of chunks")
class DeleteDocumentInput(BaseModel):
    """Input schema for delete_document tool.

    Destructive operation: requires an explicit confirm=True in
    addition to the target source_id.
    """

    # Non-empty identifier of the document to remove.
    source_id: str = Field(..., min_length=1, description="Document source ID to delete")
    # Safety latch; defaults to False so accidental calls are no-ops.
    confirm: bool = Field(default=False, description="Must be True to confirm deletion")
class DeleteDocumentOutput(BaseModel):
    """Output schema for delete_document tool."""
    # Outcome flag; when False, `error` carries the failure message.
    success: bool = Field(..., description="Whether deletion succeeded")
    source_id: str = Field(..., description="Deleted document source ID")
    # How many records were removed from each collection.
    chunks_deleted: int = Field(..., description="Number of chunks deleted")
    summaries_deleted: int = Field(..., description="Number of summaries deleted")
    # None on success.
    error: Optional[str] = Field(None, description="Error message if failed")