- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
362 lines
12 KiB
Python
362 lines
12 KiB
Python
"""
|
|
Pydantic schemas for MCP tool inputs and outputs.
|
|
|
|
All schemas use strict validation and include field descriptions
|
|
for automatic JSON schema generation in MCP tool definitions.
|
|
"""
|
|
|
|
from typing import Any, Dict, List, Optional
|
|
from pydantic import BaseModel, Field
|
|
|
|
|
|
# =============================================================================
|
|
# Parsing Tool Schemas
|
|
# =============================================================================
|
|
|
|
|
|
class ParsePdfInput(BaseModel):
|
|
"""Input schema for parse_pdf tool."""
|
|
|
|
pdf_path: str = Field(
|
|
...,
|
|
description="Path to the PDF file to process, or URL to download",
|
|
min_length=1,
|
|
)
|
|
|
|
|
|
class ParsePdfOutput(BaseModel):
|
|
"""Output schema for parse_pdf tool."""
|
|
|
|
success: bool = Field(..., description="Whether processing succeeded")
|
|
document_name: str = Field(..., description="Name of the processed document")
|
|
source_id: str = Field(..., description="Unique identifier for the document")
|
|
pages: int = Field(..., description="Number of pages processed")
|
|
chunks_count: int = Field(..., description="Number of chunks created")
|
|
cost_ocr: float = Field(..., description="OCR processing cost in EUR")
|
|
cost_llm: float = Field(..., description="LLM processing cost in EUR")
|
|
cost_total: float = Field(..., description="Total processing cost in EUR")
|
|
output_dir: str = Field(..., description="Directory containing output files")
|
|
metadata: Dict[str, Any] = Field(
|
|
default_factory=dict,
|
|
description="Extracted metadata (title, author, language, year)",
|
|
)
|
|
error: Optional[str] = Field(None, description="Error message if failed")
|
|
|
|
|
|
# =============================================================================
|
|
# Retrieval Tool Schemas
|
|
# =============================================================================
|
|
|
|
|
|
class ChunkResult(BaseModel):
|
|
"""A single chunk result from search."""
|
|
|
|
text: str = Field(..., description="Chunk text content")
|
|
similarity: float = Field(..., description="Similarity score (0-1)")
|
|
source_id: str = Field(..., description="Source document ID (e.g., 'peirce_collected_papers')")
|
|
canonical_reference: Optional[str] = Field(None, description="Academic citation reference (e.g., 'CP 5.628', 'Ménon 80a')")
|
|
section_path: str = Field(..., description="Hierarchical section path")
|
|
chapter_title: Optional[str] = Field(None, description="Chapter title if available")
|
|
work_title: str = Field(..., description="Title of the work")
|
|
work_author: str = Field(..., description="Author of the work")
|
|
order_index: int = Field(..., description="Position in document")
|
|
|
|
|
|
class SearchChunksInput(BaseModel):
|
|
"""Input schema for search_chunks tool."""
|
|
|
|
query: str = Field(
|
|
...,
|
|
description="Semantic search query",
|
|
min_length=1,
|
|
max_length=1000,
|
|
)
|
|
limit: int = Field(
|
|
default=10,
|
|
description="Maximum number of results to return",
|
|
ge=1,
|
|
le=500,
|
|
)
|
|
min_similarity: float = Field(
|
|
default=0.0,
|
|
description="Minimum similarity threshold (0-1)",
|
|
ge=0.0,
|
|
le=1.0,
|
|
)
|
|
author_filter: Optional[str] = Field(
|
|
None,
|
|
description="Filter by author name",
|
|
)
|
|
work_filter: Optional[str] = Field(
|
|
None,
|
|
description="Filter by work title",
|
|
)
|
|
language_filter: Optional[str] = Field(
|
|
None,
|
|
description="Filter by language code (e.g., 'fr', 'en')",
|
|
)
|
|
|
|
|
|
class SearchChunksOutput(BaseModel):
|
|
"""Output schema for search_chunks tool."""
|
|
|
|
results: List[ChunkResult] = Field(
|
|
default_factory=list,
|
|
description="List of matching chunks",
|
|
)
|
|
total_count: int = Field(..., description="Total number of results")
|
|
query: str = Field(..., description="Original query")
|
|
|
|
|
|
class SummaryResult(BaseModel):
|
|
"""A single summary result from search."""
|
|
|
|
text: str = Field(..., description="Summary text")
|
|
similarity: float = Field(..., description="Similarity score (0-1)")
|
|
title: str = Field(..., description="Section title")
|
|
section_path: str = Field(..., description="Hierarchical section path")
|
|
level: int = Field(..., description="Hierarchy level (1=chapter, 2=section, etc.)")
|
|
concepts: List[str] = Field(default_factory=list, description="Key concepts")
|
|
document_source_id: str = Field(..., description="Source document ID")
|
|
|
|
|
|
class SearchSummariesInput(BaseModel):
|
|
"""Input schema for search_summaries tool."""
|
|
|
|
query: str = Field(
|
|
...,
|
|
description="Semantic search query",
|
|
min_length=1,
|
|
max_length=1000,
|
|
)
|
|
limit: int = Field(
|
|
default=10,
|
|
description="Maximum number of results to return",
|
|
ge=1,
|
|
le=100,
|
|
)
|
|
min_level: Optional[int] = Field(
|
|
None,
|
|
description="Minimum hierarchy level (1=chapter)",
|
|
ge=1,
|
|
le=5,
|
|
)
|
|
max_level: Optional[int] = Field(
|
|
None,
|
|
description="Maximum hierarchy level",
|
|
ge=1,
|
|
le=5,
|
|
)
|
|
|
|
|
|
class SearchSummariesOutput(BaseModel):
|
|
"""Output schema for search_summaries tool."""
|
|
|
|
results: List[SummaryResult] = Field(
|
|
default_factory=list,
|
|
description="List of matching summaries",
|
|
)
|
|
total_count: int = Field(..., description="Total number of results")
|
|
query: str = Field(..., description="Original query")
|
|
|
|
|
|
class GetDocumentInput(BaseModel):
|
|
"""Input schema for get_document tool."""
|
|
|
|
source_id: str = Field(
|
|
...,
|
|
description="Document source ID (e.g., 'platon-menon')",
|
|
min_length=1,
|
|
)
|
|
include_chunks: bool = Field(
|
|
default=False,
|
|
description="Include document chunks in response",
|
|
)
|
|
chunk_limit: int = Field(
|
|
default=50,
|
|
description="Maximum chunks to return if include_chunks=True",
|
|
ge=1,
|
|
le=500,
|
|
)
|
|
|
|
|
|
class DocumentInfo(BaseModel):
|
|
"""Document information."""
|
|
|
|
source_id: str = Field(..., description="Unique document identifier")
|
|
work_title: str = Field(..., description="Title of the work")
|
|
work_author: str = Field(..., description="Author of the work")
|
|
edition: Optional[str] = Field(None, description="Edition information")
|
|
pages: int = Field(..., description="Number of pages")
|
|
language: str = Field(..., description="Document language")
|
|
toc: Optional[Dict[str, Any]] = Field(None, description="Table of contents")
|
|
hierarchy: Optional[Dict[str, Any]] = Field(None, description="Document hierarchy")
|
|
|
|
|
|
class GetDocumentOutput(BaseModel):
|
|
"""Output schema for get_document tool."""
|
|
|
|
document: Optional[DocumentInfo] = Field(None, description="Document information")
|
|
chunks: List[ChunkResult] = Field(
|
|
default_factory=list,
|
|
description="Document chunks (if requested)",
|
|
)
|
|
chunks_total: int = Field(
|
|
default=0,
|
|
description="Total number of chunks in document",
|
|
)
|
|
found: bool = Field(..., description="Whether document was found")
|
|
error: Optional[str] = Field(None, description="Error message if not found")
|
|
|
|
|
|
class ListDocumentsInput(BaseModel):
|
|
"""Input schema for list_documents tool."""
|
|
|
|
author_filter: Optional[str] = Field(None, description="Filter by author name")
|
|
work_filter: Optional[str] = Field(None, description="Filter by work title")
|
|
language_filter: Optional[str] = Field(None, description="Filter by language code")
|
|
limit: int = Field(
|
|
default=50,
|
|
description="Maximum number of results",
|
|
ge=1,
|
|
le=250,
|
|
)
|
|
offset: int = Field(
|
|
default=0,
|
|
description="Offset for pagination",
|
|
ge=0,
|
|
)
|
|
|
|
|
|
class DocumentSummary(BaseModel):
|
|
"""Summary of a document for listing."""
|
|
|
|
source_id: str = Field(..., description="Unique document identifier")
|
|
work_title: str = Field(..., description="Title of the work")
|
|
work_author: str = Field(..., description="Author of the work")
|
|
pages: int = Field(..., description="Number of pages")
|
|
chunks_count: int = Field(..., description="Number of chunks")
|
|
language: str = Field(..., description="Document language")
|
|
|
|
|
|
class ListDocumentsOutput(BaseModel):
|
|
"""Output schema for list_documents tool."""
|
|
|
|
documents: List[DocumentSummary] = Field(
|
|
default_factory=list,
|
|
description="List of documents",
|
|
)
|
|
total_count: int = Field(..., description="Total number of documents")
|
|
limit: int = Field(..., description="Applied limit")
|
|
offset: int = Field(..., description="Applied offset")
|
|
|
|
|
|
class GetChunksByDocumentInput(BaseModel):
|
|
"""Input schema for get_chunks_by_document tool."""
|
|
|
|
source_id: str = Field(
|
|
...,
|
|
description="Document source ID",
|
|
min_length=1,
|
|
)
|
|
limit: int = Field(
|
|
default=50,
|
|
description="Maximum number of chunks to return",
|
|
ge=1,
|
|
le=500,
|
|
)
|
|
offset: int = Field(
|
|
default=0,
|
|
description="Offset for pagination",
|
|
ge=0,
|
|
)
|
|
section_filter: Optional[str] = Field(
|
|
None,
|
|
description="Filter by section path prefix",
|
|
)
|
|
|
|
|
|
class GetChunksByDocumentOutput(BaseModel):
|
|
"""Output schema for get_chunks_by_document tool."""
|
|
|
|
chunks: List[ChunkResult] = Field(
|
|
default_factory=list,
|
|
description="Ordered list of chunks",
|
|
)
|
|
total_count: int = Field(..., description="Total chunks in document")
|
|
document_source_id: str = Field(..., description="Document source ID")
|
|
limit: int = Field(..., description="Applied limit")
|
|
offset: int = Field(..., description="Applied offset")
|
|
|
|
|
|
class WorkInfo(BaseModel):
|
|
"""Information about a work."""
|
|
|
|
title: str = Field(..., description="Work title")
|
|
author: str = Field(..., description="Author name")
|
|
year: Optional[int] = Field(None, description="Publication year")
|
|
language: str = Field(..., description="Language code")
|
|
genre: Optional[str] = Field(None, description="Genre classification")
|
|
|
|
|
|
class AuthorWorkResult(BaseModel):
|
|
"""Work with its documents for author filtering."""
|
|
|
|
work: WorkInfo = Field(..., description="Work information")
|
|
documents: List[DocumentSummary] = Field(
|
|
default_factory=list,
|
|
description="Documents for this work",
|
|
)
|
|
total_chunks: int = Field(..., description="Total chunks across all documents")
|
|
|
|
|
|
class FilterByAuthorInput(BaseModel):
|
|
"""Input schema for filter_by_author tool."""
|
|
|
|
author: str = Field(
|
|
...,
|
|
description="Author name to search for",
|
|
min_length=1,
|
|
)
|
|
include_chunk_counts: bool = Field(
|
|
default=True,
|
|
description="Include chunk counts in results",
|
|
)
|
|
|
|
|
|
class FilterByAuthorOutput(BaseModel):
|
|
"""Output schema for filter_by_author tool."""
|
|
|
|
author: str = Field(..., description="Searched author name")
|
|
works: List[AuthorWorkResult] = Field(
|
|
default_factory=list,
|
|
description="Works by this author",
|
|
)
|
|
total_works: int = Field(..., description="Total number of works")
|
|
total_documents: int = Field(..., description="Total number of documents")
|
|
total_chunks: int = Field(..., description="Total number of chunks")
|
|
|
|
|
|
class DeleteDocumentInput(BaseModel):
|
|
"""Input schema for delete_document tool."""
|
|
|
|
source_id: str = Field(
|
|
...,
|
|
description="Document source ID to delete",
|
|
min_length=1,
|
|
)
|
|
confirm: bool = Field(
|
|
default=False,
|
|
description="Must be True to confirm deletion",
|
|
)
|
|
|
|
|
|
class DeleteDocumentOutput(BaseModel):
|
|
"""Output schema for delete_document tool."""
|
|
|
|
success: bool = Field(..., description="Whether deletion succeeded")
|
|
source_id: str = Field(..., description="Deleted document source ID")
|
|
chunks_deleted: int = Field(..., description="Number of chunks deleted")
|
|
summaries_deleted: int = Field(..., description="Number of summaries deleted")
|
|
error: Optional[str] = Field(None, description="Error message if failed")
|