Add Library RAG project and cleanup root directory
- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
106
generations/library_rag/mcp_tools/__init__.py
Normal file
106
generations/library_rag/mcp_tools/__init__.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""
|
||||
MCP Tools for Library RAG Server.
|
||||
|
||||
This package contains all tool implementations for the Library RAG MCP server:
|
||||
- Parsing tools: PDF ingestion with optimal parameters
|
||||
- Retrieval tools: Semantic search and document management
|
||||
- Exceptions: Custom exception classes for structured error handling
|
||||
- Logging: Structured JSON logging configuration
|
||||
"""
|
||||
|
||||
from mcp_tools.schemas import (
|
||||
ParsePdfInput,
|
||||
ParsePdfOutput,
|
||||
SearchChunksInput,
|
||||
SearchChunksOutput,
|
||||
SearchSummariesInput,
|
||||
SearchSummariesOutput,
|
||||
GetDocumentInput,
|
||||
GetDocumentOutput,
|
||||
ListDocumentsInput,
|
||||
ListDocumentsOutput,
|
||||
GetChunksByDocumentInput,
|
||||
GetChunksByDocumentOutput,
|
||||
FilterByAuthorInput,
|
||||
FilterByAuthorOutput,
|
||||
DeleteDocumentInput,
|
||||
DeleteDocumentOutput,
|
||||
)
|
||||
|
||||
from mcp_tools.exceptions import (
|
||||
MCPToolError,
|
||||
WeaviateConnectionError,
|
||||
PDFProcessingError,
|
||||
DocumentNotFoundError,
|
||||
ValidationError,
|
||||
LLMProcessingError,
|
||||
DownloadError,
|
||||
)
|
||||
|
||||
from mcp_tools.logging_config import (
|
||||
setup_mcp_logging,
|
||||
get_tool_logger,
|
||||
ToolInvocationLogger,
|
||||
log_tool_invocation,
|
||||
log_weaviate_query,
|
||||
redact_sensitive_data,
|
||||
redact_dict,
|
||||
)
|
||||
|
||||
from mcp_tools.parsing_tools import parse_pdf_handler
|
||||
from mcp_tools.retrieval_tools import (
|
||||
search_chunks_handler,
|
||||
search_summaries_handler,
|
||||
get_document_handler,
|
||||
list_documents_handler,
|
||||
get_chunks_by_document_handler,
|
||||
filter_by_author_handler,
|
||||
delete_document_handler,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Parsing tools
|
||||
"parse_pdf_handler",
|
||||
# Retrieval tools
|
||||
"search_chunks_handler",
|
||||
"search_summaries_handler",
|
||||
"get_document_handler",
|
||||
"list_documents_handler",
|
||||
"get_chunks_by_document_handler",
|
||||
"filter_by_author_handler",
|
||||
"delete_document_handler",
|
||||
# Parsing schemas
|
||||
"ParsePdfInput",
|
||||
"ParsePdfOutput",
|
||||
# Retrieval schemas
|
||||
"SearchChunksInput",
|
||||
"SearchChunksOutput",
|
||||
"SearchSummariesInput",
|
||||
"SearchSummariesOutput",
|
||||
"GetDocumentInput",
|
||||
"GetDocumentOutput",
|
||||
"ListDocumentsInput",
|
||||
"ListDocumentsOutput",
|
||||
"GetChunksByDocumentInput",
|
||||
"GetChunksByDocumentOutput",
|
||||
"FilterByAuthorInput",
|
||||
"FilterByAuthorOutput",
|
||||
"DeleteDocumentInput",
|
||||
"DeleteDocumentOutput",
|
||||
# Exceptions
|
||||
"MCPToolError",
|
||||
"WeaviateConnectionError",
|
||||
"PDFProcessingError",
|
||||
"DocumentNotFoundError",
|
||||
"ValidationError",
|
||||
"LLMProcessingError",
|
||||
"DownloadError",
|
||||
# Logging
|
||||
"setup_mcp_logging",
|
||||
"get_tool_logger",
|
||||
"ToolInvocationLogger",
|
||||
"log_tool_invocation",
|
||||
"log_weaviate_query",
|
||||
"redact_sensitive_data",
|
||||
"redact_dict",
|
||||
]
|
||||
297
generations/library_rag/mcp_tools/exceptions.py
Normal file
297
generations/library_rag/mcp_tools/exceptions.py
Normal file
@@ -0,0 +1,297 @@
|
||||
"""Custom exception classes for Library RAG MCP Server.
|
||||
|
||||
This module defines custom exception classes used throughout the MCP server
|
||||
for structured error handling and consistent error responses.
|
||||
|
||||
Exception Hierarchy:
|
||||
MCPToolError (base)
|
||||
├── WeaviateConnectionError - Database connection failures
|
||||
├── PDFProcessingError - PDF parsing/OCR failures
|
||||
├── DocumentNotFoundError - Document/chunk retrieval failures
|
||||
└── ValidationError - Input validation failures
|
||||
|
||||
Example:
|
||||
Raise and catch custom exceptions::
|
||||
|
||||
from mcp_tools.exceptions import WeaviateConnectionError
|
||||
|
||||
try:
|
||||
client = connect_to_weaviate()
|
||||
except Exception as e:
|
||||
raise WeaviateConnectionError("Failed to connect") from e
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
|
||||
class MCPToolError(Exception):
    """Root of the MCP server's custom exception hierarchy.

    Every custom exception raised by the MCP tools derives from this
    class. It carries structured information (code, details, cause)
    that can be serialized into an MCP error response.

    Attributes:
        message: Human-readable error description.
        error_code: Machine-readable error code for categorization.
        details: Additional context about the error.
        original_error: The underlying exception if this wraps another error.
    """

    def __init__(
        self,
        message: str,
        *,
        error_code: str = "MCP_ERROR",
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Store the structured error information.

        Args:
            message: Human-readable error description.
            error_code: Machine-readable error code (default: "MCP_ERROR").
            details: Additional context about the error (optional).
            original_error: The underlying exception if wrapping (optional).
        """
        super().__init__(message)
        self.message = message
        self.error_code = error_code
        self.details = details or {}
        self.original_error = original_error

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the error into a JSON-friendly dictionary.

        Returns:
            Dictionary with error information suitable for MCP responses.
        """
        payload: Dict[str, Any] = {
            "error": True,
            "error_code": self.error_code,
            "message": self.message,
        }
        # Optional fields are included only when populated.
        if self.details:
            payload["details"] = self.details
        if self.original_error:
            payload["original_error"] = str(self.original_error)
        return payload

    def __str__(self) -> str:
        """Return "[CODE] message", appending the wrapped cause when present."""
        base = f"[{self.error_code}] {self.message}"
        if self.original_error:
            return f"{base} (caused by: {self.original_error})"
        return base
|
||||
|
||||
|
||||
class WeaviateConnectionError(MCPToolError):
    """Signals that the Weaviate vector database could not be reached.

    Raised when the MCP server fails to establish or maintain a
    connection to Weaviate.

    Example:
        >>> raise WeaviateConnectionError(
        ...     "Cannot connect to Weaviate at localhost:8080",
        ...     details={"host": "localhost", "port": 8080}
        ... )
    """

    def __init__(
        self,
        message: str = "Failed to connect to Weaviate",
        *,
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Create the error with connection-specific context.

        Args:
            message: Error description (default: "Failed to connect to Weaviate").
            details: Additional context (host, port, etc.).
            original_error: The underlying connection exception, if any.
        """
        # Fix the error code; everything else is forwarded to the base class.
        super().__init__(message, error_code="WEAVIATE_CONNECTION_ERROR",
                         details=details, original_error=original_error)
|
||||
|
||||
|
||||
class PDFProcessingError(MCPToolError):
    """Signals a failure anywhere in the PDF ingestion pipeline.

    Raised when parsing, OCR, or any other step of PDF processing
    fails inside the MCP server.

    Example:
        >>> raise PDFProcessingError(
        ...     "OCR failed for page 5",
        ...     details={"page": 5, "pdf_path": "/docs/test.pdf"}
        ... )
    """

    def __init__(
        self,
        message: str = "PDF processing failed",
        *,
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Create the error with PDF-pipeline context.

        Args:
            message: Error description (default: "PDF processing failed").
            details: Additional context (pdf_path, page, step, etc.).
            original_error: The underlying processing exception, if any.
        """
        # Fix the error code; everything else is forwarded to the base class.
        super().__init__(message, error_code="PDF_PROCESSING_ERROR",
                         details=details, original_error=original_error)
|
||||
|
||||
|
||||
class DocumentNotFoundError(MCPToolError):
    """Signals that a requested document, chunk, or summary is missing.

    Raised when a retrieval operation cannot locate the requested
    object in Weaviate.

    Example:
        >>> raise DocumentNotFoundError(
        ...     "Document not found",
        ...     details={"source_id": "platon-menon"}
        ... )
    """

    def __init__(
        self,
        message: str = "Document not found",
        *,
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Create the error with retrieval context.

        Args:
            message: Error description (default: "Document not found").
            details: Additional context (source_id, query, etc.).
            original_error: The underlying exception, if any.
        """
        # Fix the error code; everything else is forwarded to the base class.
        super().__init__(message, error_code="DOCUMENT_NOT_FOUND",
                         details=details, original_error=original_error)
|
||||
|
||||
|
||||
class ValidationError(MCPToolError):
    """Signals that user-supplied input failed validation.

    Raised when input does not meet the required criteria
    (e.g. invalid paths, bad parameters).

    Example:
        >>> raise ValidationError(
        ...     "Invalid PDF path",
        ...     details={"path": "/nonexistent/file.pdf", "reason": "File not found"}
        ... )
    """

    def __init__(
        self,
        message: str = "Validation failed",
        *,
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Create the error with validation context.

        Args:
            message: Error description (default: "Validation failed").
            details: Additional context (field, value, reason, etc.).
            original_error: The underlying validation exception, if any.
        """
        # Fix the error code; everything else is forwarded to the base class.
        super().__init__(message, error_code="VALIDATION_ERROR",
                         details=details, original_error=original_error)
|
||||
|
||||
|
||||
class LLMProcessingError(MCPToolError):
    """Signals that an LLM-backed operation failed.

    Raised when the LLM (Mistral or Ollama) fails during metadata
    extraction, chunking, or other LLM-based processing.

    Example:
        >>> raise LLMProcessingError(
        ...     "LLM timeout during metadata extraction",
        ...     details={"provider": "ollama", "model": "mistral", "step": "metadata"}
        ... )
    """

    def __init__(
        self,
        message: str = "LLM processing failed",
        *,
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Create the error with LLM context.

        Args:
            message: Error description (default: "LLM processing failed").
            details: Additional context (provider, model, step, etc.).
            original_error: The underlying LLM exception, if any.
        """
        # Fix the error code; everything else is forwarded to the base class.
        super().__init__(message, error_code="LLM_PROCESSING_ERROR",
                         details=details, original_error=original_error)
|
||||
|
||||
|
||||
class DownloadError(MCPToolError):
    """Signals that downloading a file from a URL failed.

    Raised when the MCP server cannot fetch a PDF from a provided URL.

    Example:
        >>> raise DownloadError(
        ...     "Failed to download PDF",
        ...     details={"url": "https://example.com/doc.pdf", "status_code": 404}
        ... )
    """

    def __init__(
        self,
        message: str = "File download failed",
        *,
        details: Optional[Dict[str, Any]] = None,
        original_error: Optional[Exception] = None,
    ) -> None:
        """Create the error with download context.

        Args:
            message: Error description (default: "File download failed").
            details: Additional context (url, status_code, etc.).
            original_error: The underlying HTTP exception, if any.
        """
        # Fix the error code; everything else is forwarded to the base class.
        super().__init__(message, error_code="DOWNLOAD_ERROR",
                         details=details, original_error=original_error)
|
||||
462
generations/library_rag/mcp_tools/logging_config.py
Normal file
462
generations/library_rag/mcp_tools/logging_config.py
Normal file
@@ -0,0 +1,462 @@
|
||||
"""Structured JSON logging configuration for Library RAG MCP Server.
|
||||
|
||||
This module provides structured JSON logging with sensitive data filtering
|
||||
and tool invocation tracking.
|
||||
|
||||
Features:
|
||||
- JSON-formatted log output for machine parsing
|
||||
- Sensitive data filtering (API keys, passwords)
|
||||
- Tool invocation logging with timing
|
||||
- Configurable log levels via environment variable
|
||||
|
||||
Example:
|
||||
Configure logging at server startup::
|
||||
|
||||
from mcp_tools.logging_config import setup_mcp_logging, get_tool_logger
|
||||
|
||||
# Setup logging
|
||||
logger = setup_mcp_logging(log_level="INFO")
|
||||
|
||||
# Get tool-specific logger
|
||||
tool_logger = get_tool_logger("search_chunks")
|
||||
tool_logger.info("Processing query", extra={"query": "justice"})
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from contextlib import contextmanager
|
||||
from datetime import datetime, timezone
|
||||
from functools import wraps
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, Generator, Literal, Optional, TypeVar, cast
|
||||
|
||||
# Type variable for decorator return type preservation
|
||||
F = TypeVar("F", bound=Callable[..., Any])
|
||||
|
||||
# =============================================================================
|
||||
# Sensitive Data Patterns
|
||||
# =============================================================================
|
||||
|
||||
# Patterns to detect sensitive data in log messages
|
||||
# Patterns to detect sensitive data in log messages. Each entry is a
# (compiled pattern, replacement) pair; \1 keeps the key prefix intact so the
# redacted output remains readable. Most value patterns require 20+ chars to
# avoid redacting short, clearly non-secret tokens.
SENSITIVE_PATTERNS = [
    # API keys
    (re.compile(r'(api[_-]?key\s*[=:]\s*)["\']?[\w-]{20,}["\']?', re.I), r"\1***REDACTED***"),
    (re.compile(r'(bearer\s+)[\w-]{20,}', re.I), r"\1***REDACTED***"),
    (re.compile(r'(authorization\s*[=:]\s*)["\']?[\w-]{20,}["\']?', re.I), r"\1***REDACTED***"),
    # Mistral API key format
    (re.compile(r'(MISTRAL_API_KEY\s*[=:]\s*)["\']?[\w-]+["\']?', re.I), r"\1***REDACTED***"),
    # Generic secrets
    (re.compile(r'(password\s*[=:]\s*)["\']?[^\s"\']+["\']?', re.I), r"\1***REDACTED***"),
    (re.compile(r'(secret\s*[=:]\s*)["\']?[\w-]+["\']?', re.I), r"\1***REDACTED***"),
    (re.compile(r'(token\s*[=:]\s*)["\']?[\w-]{20,}["\']?', re.I), r"\1***REDACTED***"),
]


def redact_sensitive_data(message: str) -> str:
    """Remove sensitive data from log messages.

    Applies every pattern in ``SENSITIVE_PATTERNS`` in order; the key name
    is preserved and the secret value is replaced by ``***REDACTED***``.

    Args:
        message: The log message to sanitize.

    Returns:
        Sanitized message with sensitive data redacted.

    Example:
        >>> redact_sensitive_data("api_key=sk-0123456789abcdef0123")
        'api_key=***REDACTED***'
    """
    result = message
    for pattern, replacement in SENSITIVE_PATTERNS:
        result = pattern.sub(replacement, result)
    return result
|
||||
|
||||
|
||||
def redact_dict(data: Dict[str, Any]) -> Dict[str, Any]:
    """Recursively redact sensitive data from a dictionary.

    Keys that exactly match a known sensitive name, or that contain a
    suspicious fragment ("key", "secret", "token", "password"), have
    their values replaced outright. Nested dicts are processed
    recursively and string values are run through
    :func:`redact_sensitive_data`.

    Args:
        data: Dictionary that may contain sensitive data.

    Returns:
        New dictionary with sensitive values redacted.
    """
    exact_matches = {
        "api_key", "apikey", "api-key",
        "password", "passwd", "pwd",
        "secret", "token", "auth",
        "authorization", "bearer",
        "mistral_api_key", "MISTRAL_API_KEY",
    }
    suspicious_fragments = ("key", "secret", "token", "password")

    cleaned: Dict[str, Any] = {}
    for name, value in data.items():
        # Normalize so "Api-Key", "api_key", etc. compare equal.
        normalized = name.lower().replace("-", "_")

        if normalized in exact_matches or any(frag in normalized for frag in suspicious_fragments):
            cleaned[name] = "***REDACTED***"
        elif isinstance(value, dict):
            cleaned[name] = redact_dict(value)
        elif isinstance(value, str):
            cleaned[name] = redact_sensitive_data(value)
        else:
            cleaned[name] = value

    return cleaned
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# JSON Log Formatter
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class JSONLogFormatter(logging.Formatter):
    """Formatter that emits each log record as one JSON line.

    Produces machine-parseable output with a fixed core structure and
    automatically redacts sensitive data from messages and extra fields.

    JSON Structure:
        {
            "timestamp": "2024-12-24T10:30:00.000Z",
            "level": "INFO",
            "logger": "library-rag-mcp.search_chunks",
            "message": "Processing query",
            "tool": "search_chunks",
            "duration_ms": 123,
            ...extra fields...
        }
    """

    def format(self, record: logging.LogRecord) -> str:
        """Serialize *record* as a single-line JSON string.

        Args:
            record: The log record to format.

        Returns:
            JSON-formatted log string.
        """
        # Core payload common to every record.
        payload: Dict[str, Any] = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "message": redact_sensitive_data(record.getMessage()),
        }

        if record.exc_info:
            payload["exception"] = self.formatException(record.exc_info)

        # Attributes the logging machinery itself puts on every LogRecord;
        # anything outside this set came from `extra=` and is forwarded.
        machinery_attrs = {
            "name", "msg", "args", "levelname", "levelno", "pathname",
            "filename", "module", "lineno", "funcName", "created",
            "msecs", "relativeCreated", "thread", "threadName",
            "processName", "process", "exc_info", "exc_text", "stack_info",
            "message", "taskName",
        }

        for attr, raw in record.__dict__.items():
            if attr in machinery_attrs or attr.startswith("_"):
                continue
            if isinstance(raw, dict):
                payload[attr] = redact_dict(raw)
            elif isinstance(raw, str):
                payload[attr] = redact_sensitive_data(raw)
            else:
                payload[attr] = raw

        # default=str keeps non-JSON-native values (Paths, datetimes, ...)
        # from crashing the logger.
        return json.dumps(payload, default=str, ensure_ascii=False)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Logging Setup
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def setup_mcp_logging(
    log_level: str = "INFO",
    log_dir: Optional[Path] = None,
    json_format: bool = True,
) -> logging.Logger:
    """Configure structured logging for the MCP server.

    Installs two handlers on the "library-rag-mcp" logger: a DEBUG-level
    file handler and a console handler on stderr (stdout is reserved for
    MCP protocol traffic).

    Args:
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL).
        log_dir: Directory for log files. Defaults to "logs".
        log_dir is created if missing.
        json_format: Use JSON formatting (default True).

    Returns:
        Configured logger instance for the MCP server.

    Example:
        >>> logger = setup_mcp_logging(log_level="DEBUG")
        >>> logger.info("Server started", extra={"port": 8080})
    """
    target_dir = Path("logs") if log_dir is None else log_dir
    target_dir.mkdir(parents=True, exist_ok=True)

    # Unknown level names silently fall back to INFO.
    resolved_level = getattr(logging, log_level.upper(), logging.INFO)

    logger = logging.getLogger("library-rag-mcp")
    logger.setLevel(resolved_level)
    # Drop any handlers from a previous call to avoid duplicate output.
    logger.handlers.clear()

    formatter: logging.Formatter
    if json_format:
        formatter = JSONLogFormatter()
    else:
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )

    # File handler records everything regardless of console level.
    file_handler = logging.FileHandler(
        target_dir / "mcp_server.log",
        encoding="utf-8",
    )
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    # Console output goes to stderr because stdout carries MCP messages.
    console_handler = logging.StreamHandler(sys.stderr)
    console_handler.setLevel(resolved_level)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # Keep records out of the root logger's handlers.
    logger.propagate = False

    return logger
|
||||
|
||||
|
||||
def get_tool_logger(tool_name: str) -> logging.Logger:
    """Return the logger dedicated to one MCP tool.

    The result is a child of the main "library-rag-mcp" logger, so it
    inherits the handlers installed by :func:`setup_mcp_logging` and the
    tool name appears in the logger field of each entry.

    Args:
        tool_name: Name of the MCP tool (e.g., "search_chunks", "parse_pdf").

    Returns:
        Logger instance for the tool.

    Example:
        >>> logger = get_tool_logger("search_chunks")
        >>> logger.info("Query processed", extra={"results": 10})
    """
    child_name = f"library-rag-mcp.{tool_name}"
    return logging.getLogger(child_name)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tool Invocation Logging
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class ToolInvocationLogger:
    """Context manager for logging tool invocations with timing.

    Automatically logs tool start, success/failure, and duration.
    Handles exception logging and provides structured output.

    Example:
        >>> with ToolInvocationLogger("search_chunks", {"query": "justice"}) as inv:
        ...     result = do_search()
        ...     inv.set_result({"count": 10})
    """

    def __init__(
        self,
        tool_name: str,
        inputs: Dict[str, Any],
        logger: Optional[logging.Logger] = None,
    ) -> None:
        """Initialize the invocation logger.

        Args:
            tool_name: Name of the tool being invoked.
            inputs: Tool input parameters (will be redacted).
            logger: Logger to use. Defaults to tool-specific logger.
        """
        self.tool_name = tool_name
        # Redact immediately so secrets never sit on the instance in clear text.
        self.inputs = redact_dict(inputs)
        self.logger = logger or get_tool_logger(tool_name)
        # Set by __enter__; perf_counter is monotonic, so wall-clock jumps
        # cannot skew the duration.
        self.start_time: float = 0.0
        # Populated via set_result() for the success-summary log entry.
        self.result: Optional[Dict[str, Any]] = None
        # NOTE(review): never assigned after __init__ in this class — appears
        # reserved for future use or external inspection; confirm before relying on it.
        self.error: Optional[Exception] = None

    def __enter__(self) -> "ToolInvocationLogger":
        """Start timing and log invocation start."""
        self.start_time = time.perf_counter()
        self.logger.info(
            f"Tool invocation started: {self.tool_name}",
            extra={
                "tool": self.tool_name,
                "event": "invocation_start",
                "inputs": self.inputs,
            },
        )
        return self

    def __exit__(
        self,
        exc_type: Optional[type],
        exc_val: Optional[BaseException],
        exc_tb: Any,
    ) -> Literal[False]:
        """Log invocation completion with timing.

        Always returns False so any exception raised in the managed block
        propagates to the caller after being logged.
        """
        duration_ms = (time.perf_counter() - self.start_time) * 1000

        if exc_val is not None:
            # Log error with full traceback (exc_info=True).
            self.logger.error(
                f"Tool invocation failed: {self.tool_name}",
                extra={
                    "tool": self.tool_name,
                    "event": "invocation_error",
                    "duration_ms": round(duration_ms, 2),
                    "error_type": exc_type.__name__ if exc_type else "Unknown",
                    "error_message": str(exc_val),
                },
                exc_info=True,
            )
            # Don't suppress the exception
            return False

        # Log success; the result summary is attached only if set_result()
        # was called with a truthy dict.
        extra: Dict[str, Any] = {
            "tool": self.tool_name,
            "event": "invocation_success",
            "duration_ms": round(duration_ms, 2),
        }
        if self.result:
            extra["result_summary"] = self._summarize_result()

        self.logger.info(
            f"Tool invocation completed: {self.tool_name}",
            extra=extra,
        )
        return False

    def set_result(self, result: Dict[str, Any]) -> None:
        """Set the result for logging summary.

        Args:
            result: The tool result dictionary.
        """
        self.result = result

    def _summarize_result(self) -> Dict[str, Any]:
        """Create a summary of the result for logging.

        Only well-known fields are copied, so large payloads (full chunks,
        documents) never end up in the log.

        Returns:
            Dictionary with key result metrics (counts, success status, etc.)
        """
        if not self.result:
            return {}

        summary: Dict[str, Any] = {}

        # Common summary fields
        if "success" in self.result:
            summary["success"] = self.result["success"]
        if "total_count" in self.result:
            summary["total_count"] = self.result["total_count"]
        if "results" in self.result and isinstance(self.result["results"], list):
            summary["result_count"] = len(self.result["results"])
        if "chunks_count" in self.result:
            summary["chunks_count"] = self.result["chunks_count"]
        if "cost_total" in self.result:
            summary["cost_total"] = self.result["cost_total"]
        if "found" in self.result:
            summary["found"] = self.result["found"]
        if "error" in self.result and self.result["error"]:
            summary["error"] = self.result["error"]

        return summary
|
||||
|
||||
|
||||
@contextmanager
def log_tool_invocation(
    tool_name: str,
    inputs: Dict[str, Any],
) -> Generator[ToolInvocationLogger, None, None]:
    """Context manager for logging tool invocations.

    Thin convenience wrapper that constructs a
    :class:`ToolInvocationLogger` and drives it as a context manager.

    Args:
        tool_name: Name of the tool being invoked.
        inputs: Tool input parameters.

    Yields:
        ToolInvocationLogger instance for setting results.

    Example:
        >>> with log_tool_invocation("search_chunks", {"query": "test"}) as inv:
        ...     result = search(query)
        ...     inv.set_result(result)
    """
    with ToolInvocationLogger(tool_name, inputs) as invocation:
        yield invocation
|
||||
|
||||
|
||||
def log_weaviate_query(
    operation: str,
    collection: str,
    filters: Optional[Dict[str, Any]] = None,
    result_count: Optional[int] = None,
    duration_ms: Optional[float] = None,
) -> None:
    """Log a Weaviate query operation at DEBUG level.

    Emits a structured record on the "library-rag-mcp.weaviate" logger so
    every database query is logged with the same shape.

    Args:
        operation: Query operation type (fetch, near_text, aggregate, etc.).
        collection: Weaviate collection name.
        filters: Query filters applied (optional; redacted before logging).
        result_count: Number of results returned (optional).
        duration_ms: Query duration in milliseconds (optional).

    Example:
        >>> log_weaviate_query(
        ...     operation="near_text",
        ...     collection="Chunk",
        ...     filters={"author": "Platon"},
        ...     result_count=10,
        ...     duration_ms=45.2
        ... )
    """
    query_logger = logging.getLogger("library-rag-mcp.weaviate")

    # Mandatory fields first; optional ones are added only when supplied
    # so absent values never appear as nulls in the JSON output.
    payload: Dict[str, Any] = {
        "event": "weaviate_query",
        "operation": operation,
        "collection": collection,
    }
    if filters:
        payload["filters"] = redact_dict(filters)
    if result_count is not None:
        payload["result_count"] = result_count
    if duration_ms is not None:
        payload["duration_ms"] = round(duration_ms, 2)

    query_logger.debug(f"Weaviate {operation} on {collection}", extra=payload)
|
||||
335
generations/library_rag/mcp_tools/parsing_tools.py
Normal file
335
generations/library_rag/mcp_tools/parsing_tools.py
Normal file
@@ -0,0 +1,335 @@
|
||||
"""Parsing tools for Library RAG MCP Server.
|
||||
|
||||
This module implements the parse_pdf tool with optimal pre-configured parameters
|
||||
for PDF ingestion into the Library RAG system.
|
||||
|
||||
The tool uses fixed optimal parameters:
|
||||
- llm_provider: "mistral" (API-based, fast)
|
||||
- llm_model: "mistral-medium-latest" (best quality/cost ratio)
|
||||
- use_semantic_chunking: True (LLM-based intelligent chunking)
|
||||
- use_ocr_annotations: True (3x cost but better TOC extraction)
|
||||
- ingest_to_weaviate: True (automatic vectorization and storage)
|
||||
|
||||
Example:
|
||||
The parse_pdf tool can be invoked via MCP with a simple path::
|
||||
|
||||
{
|
||||
"tool": "parse_pdf",
|
||||
"arguments": {
|
||||
"pdf_path": "/path/to/document.pdf"
|
||||
}
|
||||
}
|
||||
|
||||
Or with a URL::
|
||||
|
||||
{
|
||||
"tool": "parse_pdf",
|
||||
"arguments": {
|
||||
"pdf_path": "https://example.com/document.pdf"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Literal
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import httpx
|
||||
|
||||
from mcp_tools.schemas import ParsePdfInput, ParsePdfOutput
|
||||
|
||||
# Import pdf_pipeline for PDF processing
|
||||
from utils.pdf_pipeline import process_pdf, process_pdf_bytes
|
||||
from utils.types import LLMProvider
|
||||
|
||||
# Logger for this module
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# =============================================================================
|
||||
# Constants - Fixed Optimal Parameters
|
||||
# =============================================================================
|
||||
|
||||
# LLM provider configuration (Mistral API for best results)
|
||||
FIXED_LLM_PROVIDER: LLMProvider = "mistral"
|
||||
FIXED_LLM_MODEL = "mistral-medium-latest"
|
||||
|
||||
# Processing options (optimal settings for quality)
|
||||
FIXED_USE_SEMANTIC_CHUNKING = True
|
||||
FIXED_USE_OCR_ANNOTATIONS = True
|
||||
FIXED_INGEST_TO_WEAVIATE = True
|
||||
|
||||
# Additional processing flags
|
||||
FIXED_USE_LLM = True
|
||||
# Note: The following flags are not supported by process_pdf() and should not be used
|
||||
# FIXED_CLEAN_CHUNKS = True
|
||||
# FIXED_EXTRACT_CONCEPTS = True
|
||||
# FIXED_VALIDATE_OUTPUT = True
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def is_url(path: str) -> bool:
    """Return True when *path* is an HTTP or HTTPS URL.

    Args:
        path: Candidate path or URL string.

    Returns:
        True for http/https URLs; False for anything else, including
        local filesystem paths and strings urlparse cannot handle.

    Example:
        >>> is_url("https://example.com/doc.pdf")
        True
        >>> is_url("/path/to/doc.pdf")
        False
    """
    try:
        scheme = urlparse(path).scheme
    except ValueError:
        return False
    return scheme in ("http", "https")
|
||||
|
||||
|
||||
async def download_pdf(url: str, timeout: float = 60.0) -> bytes:
    """Fetch a PDF from *url* and return its raw bytes.

    Args:
        url: HTTP or HTTPS URL to download from.
        timeout: Maximum time in seconds to wait for the download.
            Defaults to 60 seconds.

    Returns:
        Raw bytes content of the downloaded PDF file.

    Raises:
        httpx.HTTPError: If the download fails (network error, HTTP error, etc.).
        ValueError: If the URL is invalid or not accessible.

    Example:
        >>> pdf_bytes = await download_pdf("https://example.com/document.pdf")
        >>> len(pdf_bytes) > 0
        True
    """
    logger.info(f"Downloading PDF from: {url}")

    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as http:
        response = await http.get(url)
        response.raise_for_status()

        # Best-effort sanity check only: a mismatched Content-Type is
        # logged but never fatal, since many servers mislabel PDFs.
        content_type = response.headers.get("content-type", "")
        looks_like_pdf = (
            "application/pdf" in content_type.lower()
            or url.lower().endswith(".pdf")
        )
        if not looks_like_pdf:
            logger.warning(
                f"URL may not be a PDF (Content-Type: {content_type}), proceeding anyway"
            )

        logger.info(f"Downloaded {len(response.content)} bytes from {url}")
        return response.content
|
||||
|
||||
|
||||
def extract_filename_from_url(url: str) -> str:
    """Extract a filename from a URL.

    The last path component of the URL is used as the filename. If it has
    no extension, ".pdf" is appended; if the URL has no usable path
    component at all, a generic fallback name is returned.

    Args:
        url: The URL to extract filename from.

    Returns:
        Extracted filename with .pdf extension. Falls back to "downloaded.pdf"
        if no filename can be extracted.

    Example:
        >>> extract_filename_from_url("https://example.com/documents/kant.pdf")
        'kant.pdf'
        >>> extract_filename_from_url("https://example.com/api/download")
        'download.pdf'
        >>> extract_filename_from_url("https://example.com/")
        'downloaded.pdf'
    """
    parsed = urlparse(url)
    path = parsed.path

    if path:
        # Get the last path component
        filename = path.split("/")[-1]
        if filename and "." in filename:
            return filename
        if filename:
            # Extension-less name: assume PDF content and append ".pdf".
            # Bug fix: this previously returned the literal "(unknown).pdf"
            # (an f-string with no placeholder), discarding the extracted name.
            return f"{filename}.pdf"

    return "downloaded.pdf"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Main Tool Implementation
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _failed_parse_output(error: str) -> ParsePdfOutput:
    """Return a zeroed ParsePdfOutput carrying *error* for a failed run.

    All failure paths in parse_pdf_handler produce the same empty payload
    (only the error message differs); centralising the construction here
    removes the triplicated boilerplate.
    """
    return ParsePdfOutput(
        success=False,
        document_name="",
        source_id="",
        pages=0,
        chunks_count=0,
        cost_ocr=0.0,
        cost_llm=0.0,
        cost_total=0.0,
        output_dir="",
        metadata={},
        error=error,
    )


async def parse_pdf_handler(input_data: ParsePdfInput) -> ParsePdfOutput:
    """Process a PDF document with optimal pre-configured parameters.

    This is the main handler for the parse_pdf MCP tool. It processes PDFs
    through the Library RAG pipeline with the following fixed optimal settings:

    - LLM: Mistral API (mistral-medium-latest) for fast, high-quality processing
    - OCR: Mistral OCR with annotations (better TOC extraction, 3x cost)
    - Chunking: Semantic LLM-based chunking (argumentative units)
    - Ingestion: Automatic Weaviate vectorization and storage

    The tool accepts either a local file path or a URL. URLs are automatically
    downloaded before processing.

    Args:
        input_data: Validated input containing pdf_path (local path or URL).

    Returns:
        ParsePdfOutput containing processing results including:
        - success: Whether processing completed successfully
        - document_name: Name of the processed document
        - source_id: Unique identifier for retrieval
        - pages: Number of pages processed
        - chunks_count: Number of chunks created
        - cost_ocr: OCR cost in EUR
        - cost_llm: LLM cost in EUR
        - cost_total: Total processing cost
        - output_dir: Directory containing output files
        - metadata: Extracted document metadata
        - error: Error message if processing failed

    Example:
        >>> input_data = ParsePdfInput(pdf_path="/docs/aristotle.pdf")
        >>> result = await parse_pdf_handler(input_data)
        >>> result.success
        True
        >>> result.chunks_count > 0
        True
    """
    pdf_path = input_data.pdf_path
    logger.info(f"parse_pdf called with: {pdf_path}")

    try:
        # Determine if input is a URL or local path
        if is_url(pdf_path):
            # Download PDF from URL, then process from bytes
            logger.info(f"Detected URL input, downloading: {pdf_path}")
            pdf_bytes = await download_pdf(pdf_path)
            filename = extract_filename_from_url(pdf_path)

            result = process_pdf_bytes(
                file_bytes=pdf_bytes,
                filename=filename,
                output_dir=Path("output"),
                llm_provider=FIXED_LLM_PROVIDER,
                use_llm=FIXED_USE_LLM,
                llm_model=FIXED_LLM_MODEL,
                use_semantic_chunking=FIXED_USE_SEMANTIC_CHUNKING,
                use_ocr_annotations=FIXED_USE_OCR_ANNOTATIONS,
                ingest_to_weaviate=FIXED_INGEST_TO_WEAVIATE,
            )
        else:
            # Process local file
            local_path = Path(pdf_path)
            if not local_path.exists():
                logger.error(f"PDF file not found: {pdf_path}")
                return _failed_parse_output(f"PDF file not found: {pdf_path}")

            logger.info(f"Processing local file: {local_path}")
            result = process_pdf(
                pdf_path=local_path,
                output_dir=Path("output"),
                use_llm=FIXED_USE_LLM,
                llm_provider=FIXED_LLM_PROVIDER,
                llm_model=FIXED_LLM_MODEL,
                use_semantic_chunking=FIXED_USE_SEMANTIC_CHUNKING,
                use_ocr_annotations=FIXED_USE_OCR_ANNOTATIONS,
                ingest_to_weaviate=FIXED_INGEST_TO_WEAVIATE,
            )

        # Convert pipeline result to output schema
        success = result.get("success", False)
        document_name = result.get("document_name", "")
        source_id = result.get("source_id", document_name)

        # Extract costs (total falls back to the sum of the two stages)
        cost_ocr = result.get("cost_ocr", 0.0)
        cost_llm = result.get("cost_llm", 0.0)
        cost_total = result.get("cost_total", cost_ocr + cost_llm)

        # Extract metadata (the pipeline may return an explicit None)
        metadata_raw = result.get("metadata", {})
        if metadata_raw is None:
            metadata_raw = {}

        # Build output
        output = ParsePdfOutput(
            success=success,
            document_name=document_name,
            source_id=source_id,
            pages=result.get("pages", 0),
            chunks_count=result.get("chunks_count", 0),
            cost_ocr=cost_ocr,
            cost_llm=cost_llm,
            cost_total=cost_total,
            output_dir=str(result.get("output_dir", "")),
            metadata=metadata_raw,
            error=result.get("error"),
        )

        if success:
            logger.info(
                f"Successfully processed {document_name}: "
                f"{output.chunks_count} chunks, {output.cost_total:.4f} EUR"
            )
        else:
            logger.error(f"Failed to process {pdf_path}: {output.error}")

        return output

    except httpx.HTTPError as e:
        logger.error(f"HTTP error downloading PDF: {e}")
        return _failed_parse_output(f"Failed to download PDF: {e}")
    except Exception as e:
        logger.error(f"Error processing PDF: {e}", exc_info=True)
        return _failed_parse_output(f"Processing error: {str(e)}")
|
||||
1552
generations/library_rag/mcp_tools/retrieval_tools.py
Normal file
1552
generations/library_rag/mcp_tools/retrieval_tools.py
Normal file
File diff suppressed because it is too large
Load Diff
361
generations/library_rag/mcp_tools/schemas.py
Normal file
361
generations/library_rag/mcp_tools/schemas.py
Normal file
@@ -0,0 +1,361 @@
|
||||
"""
|
||||
Pydantic schemas for MCP tool inputs and outputs.
|
||||
|
||||
All schemas use strict validation and include field descriptions
|
||||
for automatic JSON schema generation in MCP tool definitions.
|
||||
"""
|
||||
|
||||
from typing import Any, Dict, List, Optional
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Parsing Tool Schemas
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class ParsePdfInput(BaseModel):
    """Input schema for parse_pdf tool.

    The single required field accepts either a local filesystem path or
    an HTTP/HTTPS URL pointing at the PDF to ingest.
    """

    pdf_path: str = Field(
        ...,
        description="Path to the PDF file to process, or URL to download",
        min_length=1,
    )
|
||||
|
||||
|
||||
class ParsePdfOutput(BaseModel):
    """Output schema for parse_pdf tool.

    Reports the outcome of a pipeline run: success flag, document
    identifiers, page/chunk statistics, per-stage costs in EUR, the
    output directory, extracted metadata, and an error message on failure.
    """

    success: bool = Field(..., description="Whether processing succeeded")
    document_name: str = Field(..., description="Name of the processed document")
    source_id: str = Field(..., description="Unique identifier for the document")
    pages: int = Field(..., description="Number of pages processed")
    chunks_count: int = Field(..., description="Number of chunks created")
    cost_ocr: float = Field(..., description="OCR processing cost in EUR")
    cost_llm: float = Field(..., description="LLM processing cost in EUR")
    cost_total: float = Field(..., description="Total processing cost in EUR")
    output_dir: str = Field(..., description="Directory containing output files")
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Extracted metadata (title, author, language, year)",
    )
    error: Optional[str] = Field(None, description="Error message if failed")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Retrieval Tool Schemas
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class ChunkResult(BaseModel):
    """A single chunk result from search.

    Combines the chunk text and its similarity score with provenance
    fields (source document, section path, work title/author) and the
    chunk's position within the document.
    """

    text: str = Field(..., description="Chunk text content")
    similarity: float = Field(..., description="Similarity score (0-1)")
    source_id: str = Field(..., description="Source document ID (e.g., 'peirce_collected_papers')")
    canonical_reference: Optional[str] = Field(None, description="Academic citation reference (e.g., 'CP 5.628', 'Ménon 80a')")
    section_path: str = Field(..., description="Hierarchical section path")
    chapter_title: Optional[str] = Field(None, description="Chapter title if available")
    work_title: str = Field(..., description="Title of the work")
    work_author: str = Field(..., description="Author of the work")
    order_index: int = Field(..., description="Position in document")
|
||||
|
||||
|
||||
class SearchChunksInput(BaseModel):
    """Input schema for search_chunks tool.

    A semantic query plus optional result limiting (limit,
    min_similarity) and metadata filters (author, work, language).
    """

    query: str = Field(
        ...,
        description="Semantic search query",
        min_length=1,
        max_length=1000,
    )
    limit: int = Field(
        default=10,
        description="Maximum number of results to return",
        ge=1,
        le=500,
    )
    min_similarity: float = Field(
        default=0.0,
        description="Minimum similarity threshold (0-1)",
        ge=0.0,
        le=1.0,
    )
    author_filter: Optional[str] = Field(
        None,
        description="Filter by author name",
    )
    work_filter: Optional[str] = Field(
        None,
        description="Filter by work title",
    )
    language_filter: Optional[str] = Field(
        None,
        description="Filter by language code (e.g., 'fr', 'en')",
    )
|
||||
|
||||
|
||||
class SearchChunksOutput(BaseModel):
    """Output schema for search_chunks tool.

    Echoes the original query alongside the matched chunks and the
    total result count.
    """

    results: List[ChunkResult] = Field(
        default_factory=list,
        description="List of matching chunks",
    )
    total_count: int = Field(..., description="Total number of results")
    query: str = Field(..., description="Original query")
|
||||
|
||||
|
||||
class SummaryResult(BaseModel):
    """A single summary result from search.

    A section-level summary with its similarity score, position in the
    document hierarchy (title, section path, level), key concepts, and
    the source document it belongs to.
    """

    text: str = Field(..., description="Summary text")
    similarity: float = Field(..., description="Similarity score (0-1)")
    title: str = Field(..., description="Section title")
    section_path: str = Field(..., description="Hierarchical section path")
    level: int = Field(..., description="Hierarchy level (1=chapter, 2=section, etc.)")
    concepts: List[str] = Field(default_factory=list, description="Key concepts")
    document_source_id: str = Field(..., description="Source document ID")
|
||||
|
||||
|
||||
class SearchSummariesInput(BaseModel):
    """Input schema for search_summaries tool.

    A semantic query with a result limit and optional hierarchy-level
    bounds (min_level/max_level) to restrict matches to chapters,
    sections, etc.
    """

    query: str = Field(
        ...,
        description="Semantic search query",
        min_length=1,
        max_length=1000,
    )
    limit: int = Field(
        default=10,
        description="Maximum number of results to return",
        ge=1,
        le=100,
    )
    min_level: Optional[int] = Field(
        None,
        description="Minimum hierarchy level (1=chapter)",
        ge=1,
        le=5,
    )
    max_level: Optional[int] = Field(
        None,
        description="Maximum hierarchy level",
        ge=1,
        le=5,
    )
|
||||
|
||||
|
||||
class SearchSummariesOutput(BaseModel):
    """Output schema for search_summaries tool.

    Echoes the original query alongside the matched summaries and the
    total result count.
    """

    results: List[SummaryResult] = Field(
        default_factory=list,
        description="List of matching summaries",
    )
    total_count: int = Field(..., description="Total number of results")
    query: str = Field(..., description="Original query")
|
||||
|
||||
|
||||
class GetDocumentInput(BaseModel):
    """Input schema for get_document tool.

    Identifies a document by source_id; chunks are only included in the
    response when include_chunks is True, capped at chunk_limit.
    """

    source_id: str = Field(
        ...,
        description="Document source ID (e.g., 'platon-menon')",
        min_length=1,
    )
    include_chunks: bool = Field(
        default=False,
        description="Include document chunks in response",
    )
    chunk_limit: int = Field(
        default=50,
        description="Maximum chunks to return if include_chunks=True",
        ge=1,
        le=500,
    )
|
||||
|
||||
|
||||
class DocumentInfo(BaseModel):
    """Document information.

    Full metadata record for a single document: identifier, work
    title/author, edition, page count, language, and the optional table
    of contents and hierarchy structures.
    """

    source_id: str = Field(..., description="Unique document identifier")
    work_title: str = Field(..., description="Title of the work")
    work_author: str = Field(..., description="Author of the work")
    edition: Optional[str] = Field(None, description="Edition information")
    pages: int = Field(..., description="Number of pages")
    language: str = Field(..., description="Document language")
    toc: Optional[Dict[str, Any]] = Field(None, description="Table of contents")
    hierarchy: Optional[Dict[str, Any]] = Field(None, description="Document hierarchy")
|
||||
|
||||
|
||||
class GetDocumentOutput(BaseModel):
    """Output schema for get_document tool.

    When found is True, document holds the metadata and chunks holds the
    requested chunk page (chunks_total is the document-wide count); when
    found is False, error explains why.
    """

    document: Optional[DocumentInfo] = Field(None, description="Document information")
    chunks: List[ChunkResult] = Field(
        default_factory=list,
        description="Document chunks (if requested)",
    )
    chunks_total: int = Field(
        default=0,
        description="Total number of chunks in document",
    )
    found: bool = Field(..., description="Whether document was found")
    error: Optional[str] = Field(None, description="Error message if not found")
|
||||
|
||||
|
||||
class ListDocumentsInput(BaseModel):
    """Input schema for list_documents tool.

    Optional author/work/language filters plus limit/offset pagination.
    """

    author_filter: Optional[str] = Field(None, description="Filter by author name")
    work_filter: Optional[str] = Field(None, description="Filter by work title")
    language_filter: Optional[str] = Field(None, description="Filter by language code")
    limit: int = Field(
        default=50,
        description="Maximum number of results",
        ge=1,
        le=250,
    )
    offset: int = Field(
        default=0,
        description="Offset for pagination",
        ge=0,
    )
|
||||
|
||||
|
||||
class DocumentSummary(BaseModel):
    """Summary of a document for listing.

    Compact per-document record used in list responses: identifier,
    work title/author, page and chunk counts, and language.
    """

    source_id: str = Field(..., description="Unique document identifier")
    work_title: str = Field(..., description="Title of the work")
    work_author: str = Field(..., description="Author of the work")
    pages: int = Field(..., description="Number of pages")
    chunks_count: int = Field(..., description="Number of chunks")
    language: str = Field(..., description="Document language")
|
||||
|
||||
|
||||
class ListDocumentsOutput(BaseModel):
    """Output schema for list_documents tool.

    One page of document summaries, with the total count and the
    limit/offset that were applied.
    """

    documents: List[DocumentSummary] = Field(
        default_factory=list,
        description="List of documents",
    )
    total_count: int = Field(..., description="Total number of documents")
    limit: int = Field(..., description="Applied limit")
    offset: int = Field(..., description="Applied offset")
|
||||
|
||||
|
||||
class GetChunksByDocumentInput(BaseModel):
    """Input schema for get_chunks_by_document tool.

    Identifies a document by source_id, with limit/offset pagination and
    an optional section-path prefix filter.
    """

    source_id: str = Field(
        ...,
        description="Document source ID",
        min_length=1,
    )
    limit: int = Field(
        default=50,
        description="Maximum number of chunks to return",
        ge=1,
        le=500,
    )
    offset: int = Field(
        default=0,
        description="Offset for pagination",
        ge=0,
    )
    section_filter: Optional[str] = Field(
        None,
        description="Filter by section path prefix",
    )
|
||||
|
||||
|
||||
class GetChunksByDocumentOutput(BaseModel):
    """Output schema for get_chunks_by_document tool.

    One page of ordered chunks for a document, plus the document-wide
    chunk count and the limit/offset that were applied.
    """

    chunks: List[ChunkResult] = Field(
        default_factory=list,
        description="Ordered list of chunks",
    )
    total_count: int = Field(..., description="Total chunks in document")
    document_source_id: str = Field(..., description="Document source ID")
    limit: int = Field(..., description="Applied limit")
    offset: int = Field(..., description="Applied offset")
|
||||
|
||||
|
||||
class WorkInfo(BaseModel):
    """Information about a work.

    Bibliographic record: title, author, optional publication year,
    language code, and optional genre.
    """

    title: str = Field(..., description="Work title")
    author: str = Field(..., description="Author name")
    year: Optional[int] = Field(None, description="Publication year")
    language: str = Field(..., description="Language code")
    genre: Optional[str] = Field(None, description="Genre classification")
|
||||
|
||||
|
||||
class AuthorWorkResult(BaseModel):
    """Work with its documents for author filtering.

    Pairs a work's bibliographic info with the documents that belong to
    it and the chunk count aggregated across those documents.
    """

    work: WorkInfo = Field(..., description="Work information")
    documents: List[DocumentSummary] = Field(
        default_factory=list,
        description="Documents for this work",
    )
    total_chunks: int = Field(..., description="Total chunks across all documents")
|
||||
|
||||
|
||||
class FilterByAuthorInput(BaseModel):
    """Input schema for filter_by_author tool.

    The author name to search for, plus a flag controlling whether chunk
    counts are included in the results.
    """

    author: str = Field(
        ...,
        description="Author name to search for",
        min_length=1,
    )
    include_chunk_counts: bool = Field(
        default=True,
        description="Include chunk counts in results",
    )
|
||||
|
||||
|
||||
class FilterByAuthorOutput(BaseModel):
    """Output schema for filter_by_author tool.

    Echoes the searched author name alongside their works and aggregate
    counts of works, documents, and chunks.
    """

    author: str = Field(..., description="Searched author name")
    works: List[AuthorWorkResult] = Field(
        default_factory=list,
        description="Works by this author",
    )
    total_works: int = Field(..., description="Total number of works")
    total_documents: int = Field(..., description="Total number of documents")
    total_chunks: int = Field(..., description="Total number of chunks")
|
||||
|
||||
|
||||
class DeleteDocumentInput(BaseModel):
    """Input schema for delete_document tool.

    Identifies the document to delete; confirm defaults to False so a
    deletion must be explicitly acknowledged by the caller.
    """

    source_id: str = Field(
        ...,
        description="Document source ID to delete",
        min_length=1,
    )
    confirm: bool = Field(
        default=False,
        description="Must be True to confirm deletion",
    )
|
||||
|
||||
|
||||
class DeleteDocumentOutput(BaseModel):
    """Output schema for delete_document tool.

    Reports whether deletion succeeded, the target document's ID, how
    many chunks and summaries were removed, and an error on failure.
    """

    success: bool = Field(..., description="Whether deletion succeeded")
    source_id: str = Field(..., description="Deleted document source ID")
    chunks_deleted: int = Field(..., description="Number of chunks deleted")
    summaries_deleted: int = Field(..., description="Number of summaries deleted")
    error: Optional[str] = Field(None, description="Error message if failed")
|
||||
Reference in New Issue
Block a user