Add Library RAG project and cleanup root directory

- Add complete Library RAG application (Flask + MCP server)
  - PDF processing pipeline with OCR and LLM extraction
  - Weaviate vector database integration (BGE-M3 embeddings)
  - Flask web interface with search and document management
  - MCP server for Claude Desktop integration
  - Comprehensive test suite (134 tests)

- Clean up root directory
  - Remove obsolete documentation files
  - Remove backup and temporary files
  - Update autonomous agent configuration

- Update prompts
  - Enhance initializer bis prompt with better instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-30 11:57:12 +01:00
parent 48470236da
commit d2f7165120
84 changed files with 26517 additions and 2 deletions

View File

@@ -0,0 +1,592 @@
"""
Library RAG MCP Server - PDF Ingestion & Semantic Retrieval.
This module provides an MCP (Model Context Protocol) server that exposes
Library RAG capabilities as tools for LLMs. It provides:
- 1 parsing tool: parse_pdf (PDF ingestion with optimal parameters)
- 7 retrieval tools: semantic search and document management
The server uses stdio transport for communication with LLM clients
like Claude Desktop.
Example:
Run the server directly::
python mcp_server.py
Or configure in Claude Desktop claude_desktop_config.json::
{
"mcpServers": {
"library-rag": {
"command": "python",
"args": ["path/to/mcp_server.py"],
"env": {"MISTRAL_API_KEY": "your-key"}
}
}
}
"""
import logging
import signal
import sys
from contextlib import asynccontextmanager
from pathlib import Path
from typing import Any, AsyncIterator, Dict
from mcp.server.fastmcp import FastMCP
from mcp_config import MCPConfig
from mcp_tools import (
ParsePdfInput,
parse_pdf_handler,
SearchChunksInput,
search_chunks_handler,
SearchSummariesInput,
search_summaries_handler,
GetDocumentInput,
get_document_handler,
ListDocumentsInput,
list_documents_handler,
GetChunksByDocumentInput,
get_chunks_by_document_handler,
FilterByAuthorInput,
filter_by_author_handler,
DeleteDocumentInput,
delete_document_handler,
# Logging utilities
setup_mcp_logging,
# Exception types for error handling
WeaviateConnectionError,
PDFProcessingError,
)
# =============================================================================
# Logging Configuration
# =============================================================================
# Note: We use setup_mcp_logging from mcp_tools.logging_config for structured
# JSON logging. The function is imported at the top of this file.
# =============================================================================
# Global State
# =============================================================================
# Configuration loaded at startup (populated by server_lifespan;
# remains None until the server has started).
config: MCPConfig | None = None
# Structured JSON logger, also initialized inside server_lifespan.
logger: logging.Logger | None = None
# =============================================================================
# Server Lifecycle
# =============================================================================
@asynccontextmanager
async def server_lifespan(server: FastMCP) -> AsyncIterator[None]:
    """
    Manage server lifecycle - startup and shutdown.

    On startup this loads and validates configuration from the environment,
    then installs structured JSON logging; on exit it logs a shutdown event.
    Both module globals ``config`` and ``logger`` are assigned here.

    Args:
        server: The FastMCP server instance (unused directly; required by
            the FastMCP lifespan signature).

    Yields:
        None during server runtime.

    Raises:
        ValueError: If configuration is invalid or missing required values.
    """
    global config, logger
    # Startup
    try:
        # Load and validate configuration
        config = MCPConfig.from_env()
        config.validate()
        # Setup structured JSON logging with configured level
        logger = setup_mcp_logging(
            log_level=config.log_level,
            log_dir=Path("logs"),
            json_format=True,
        )
        logger.info(
            "Library RAG MCP Server starting",
            extra={
                "event": "server_startup",
                "weaviate_url": config.weaviate_url,
                "output_dir": str(config.output_dir),
                "llm_provider": config.default_llm_provider,
                "log_level": config.log_level,
            },
        )
        yield
    except ValueError as e:
        # Configuration error - log and re-raise
        if logger:
            logger.error(
                "Configuration error",
                extra={
                    "event": "config_error",
                    "error_message": str(e),
                },
            )
        else:
            # Logging may not be set up yet if config loading failed early;
            # fall back to stderr (stdout carries the MCP stdio protocol).
            print(f"Configuration error: {e}", file=sys.stderr)
        raise
    finally:
        # Shutdown
        # NOTE(review): this finally also runs on startup failure, so a
        # shutdown event can be logged even if startup never completed.
        if logger:
            logger.info(
                "Library RAG MCP Server shutting down",
                extra={"event": "server_shutdown"},
            )
# =============================================================================
# MCP Server Initialization
# =============================================================================
# Create the MCP server with lifespan management
# (server_lifespan loads config and installs logging before serving;
# tools below register against this instance via @mcp.tool()).
mcp = FastMCP(
    name="library-rag",
    lifespan=server_lifespan,
)
# =============================================================================
# Tool Registration (placeholders - to be implemented in separate modules)
# =============================================================================
@mcp.tool()
async def ping() -> str:
    """
    Liveness probe for the MCP server.

    Returns:
        A fixed status string confirming the server is up.
    """
    return "Library RAG MCP Server is running!"
@mcp.tool()
async def parse_pdf(pdf_path: str) -> Dict[str, Any]:
    """
    Ingest a PDF into the Library RAG system with fixed optimal settings.

    The document is processed with Mistral OCR (annotations enabled for
    better TOC extraction), chunked semantically by LLM
    (mistral-medium-latest) into argumentative units, then vectorized and
    stored in Weaviate for semantic search.

    Args:
        pdf_path: Local file path or URL to the PDF document.

    Returns:
        Dictionary with: success flag, document_name, source_id, pages,
        chunks_count, cost_ocr / cost_llm / cost_total (EUR), output_dir,
        extracted metadata, and an error message if processing failed.
    """
    response = await parse_pdf_handler(ParsePdfInput(pdf_path=pdf_path))
    return response.model_dump(mode='json')
@mcp.tool()
async def search_chunks(
    query: str,
    limit: int = 10,
    min_similarity: float = 0.0,
    author_filter: str | None = None,
    work_filter: str | None = None,
    language_filter: str | None = None,
) -> Dict[str, Any]:
    """
    Semantic search over indexed text chunks.

    Runs a near_text query against the Weaviate Chunk collection and
    returns the passages most semantically similar to *query*, optionally
    restricted by author, work title, or language.

    Args:
        query: Search text (e.g., "la justice et la vertu").
        limit: Maximum number of results (1-100, default 10).
        min_similarity: Minimum similarity threshold 0-1 (default 0).
        author_filter: Restrict to one author (e.g., "Platon").
        work_filter: Restrict to one work (e.g., "La Republique").
        language_filter: Restrict to a language code (e.g., "fr", "en").

    Returns:
        Dictionary with ``results`` (matching chunks with text and
        metadata), ``total_count``, and the original ``query``.
    """
    payload = SearchChunksInput(
        query=query,
        limit=limit,
        min_similarity=min_similarity,
        author_filter=author_filter,
        work_filter=work_filter,
        language_filter=language_filter,
    )
    response = await search_chunks_handler(payload)
    return response.model_dump(mode='json')
@mcp.tool()
async def search_summaries(
    query: str,
    limit: int = 10,
    min_level: int | None = None,
    max_level: int | None = None,
) -> Dict[str, Any]:
    """
    Semantic search over chapter/section summaries.

    Runs a near_text query against the Weaviate Summary collection.
    Results can be restricted to a hierarchy-level range, where level 1
    is a chapter, level 2 a section, level 3 a subsection, and so on.

    Args:
        query: Search text (e.g., "la vertu et l'education").
        limit: Maximum number of results (1-100, default 10).
        min_level: Lowest hierarchy level to include (1=chapter, optional).
        max_level: Highest hierarchy level to include (optional).

    Returns:
        Dictionary with ``results`` (matching summaries with text and
        metadata), ``total_count``, and the original ``query``.

    Example:
        Chapter-level summaries about virtue only::

            search_summaries(query="la vertu", limit=5, min_level=1, max_level=1)
    """
    payload = SearchSummariesInput(
        query=query,
        limit=limit,
        min_level=min_level,
        max_level=max_level,
    )
    response = await search_summaries_handler(payload)
    return response.model_dump(mode='json')
@mcp.tool()
async def get_document(
    source_id: str,
    include_chunks: bool = False,
    chunk_limit: int = 50,
) -> Dict[str, Any]:
    """
    Fetch a document's metadata by source ID, optionally with its chunks.

    Retrieves the full document record from Weaviate and, when requested,
    the related text chunks.

    Args:
        source_id: Document source ID (e.g., "platon-menon").
        include_chunks: Also return the document's chunks (default False).
        chunk_limit: Max chunks returned when include_chunks=True
            (1-500, default 50).

    Returns:
        Dictionary with ``document`` (title, author, pages, TOC,
        hierarchy), ``chunks`` (when requested), ``chunks_total``, a
        ``found`` flag, and an ``error`` message if the document is absent.

    Example:
        Metadata only::

            get_document(source_id="platon-menon")

        With the first 20 chunks::

            get_document(source_id="platon-menon", include_chunks=True, chunk_limit=20)
    """
    payload = GetDocumentInput(
        source_id=source_id,
        include_chunks=include_chunks,
        chunk_limit=chunk_limit,
    )
    response = await get_document_handler(payload)
    return response.model_dump(mode='json')
@mcp.tool()
async def list_documents(
    author_filter: str | None = None,
    work_filter: str | None = None,
    language_filter: str | None = None,
    limit: int = 50,
    offset: int = 0,
) -> Dict[str, Any]:
    """
    List stored documents with optional filters and pagination.

    Enumerates documents in the Library RAG system, optionally narrowed
    by author, work title, or language, and paginated via limit/offset.

    Args:
        author_filter: Restrict to one author (e.g., "Platon").
        work_filter: Restrict to one work title (e.g., "La Republique").
        language_filter: Restrict to a language code (e.g., "fr", "en").
        limit: Maximum number of results (1-250, default 50).
        offset: Number of results to skip for pagination (default 0).

    Returns:
        Dictionary with ``documents`` (source_id, title, author, pages,
        chunks_count, language per entry), ``total_count`` of matches,
        and the applied ``limit`` and ``offset``.

    Example:
        All French documents::

            list_documents(language_filter="fr")

        Pagination::

            list_documents(limit=10, offset=0)   # First 10
            list_documents(limit=10, offset=10)  # Next 10
    """
    payload = ListDocumentsInput(
        author_filter=author_filter,
        work_filter=work_filter,
        language_filter=language_filter,
        limit=limit,
        offset=offset,
    )
    response = await list_documents_handler(payload)
    return response.model_dump(mode='json')
@mcp.tool()
async def get_chunks_by_document(
    source_id: str,
    limit: int = 50,
    offset: int = 0,
    section_filter: str | None = None,
) -> Dict[str, Any]:
    """
    Return a document's chunks in sequential (orderIndex) order.

    Fetches the text chunks of one document, ordered by their position in
    the document, with pagination and an optional section-path filter.

    Args:
        source_id: Document source ID (e.g., "platon-menon").
        limit: Maximum number of chunks (1-500, default 50).
        offset: Number of chunks to skip for pagination (default 0).
        section_filter: Section path prefix filter (e.g., "Chapter 1").

    Returns:
        Dictionary with ``chunks`` in document order, ``total_count`` of
        chunks in the document, ``document_source_id``, and the applied
        ``limit`` and ``offset``.

    Example:
        First 20 chunks::

            get_chunks_by_document(source_id="platon-menon", limit=20)

        One section only::

            get_chunks_by_document(source_id="platon-menon", section_filter="Chapter 3")

        Pagination::

            get_chunks_by_document(source_id="platon-menon", limit=50, offset=0)
            get_chunks_by_document(source_id="platon-menon", limit=50, offset=50)
    """
    payload = GetChunksByDocumentInput(
        source_id=source_id,
        limit=limit,
        offset=offset,
        section_filter=section_filter,
    )
    response = await get_chunks_by_document_handler(payload)
    return response.model_dump(mode='json')
@mcp.tool()
async def filter_by_author(
    author: str,
    include_chunk_counts: bool = True,
) -> Dict[str, Any]:
    """
    List every work and document attributed to an author.

    Retrieves the author's works with their related documents, and
    optionally the total chunk count per work.

    Args:
        author: Author name to look up (e.g., "Platon", "Aristotle").
        include_chunk_counts: Include chunk counts (default True).

    Returns:
        Dictionary with ``author``, ``works`` (work info plus documents),
        ``total_works``, ``total_documents``, and ``total_chunks`` when
        include_chunk_counts=True.

    Example:
        All works by Platon::

            filter_by_author(author="Platon")

        Skip chunk counts for a faster response::

            filter_by_author(author="Platon", include_chunk_counts=False)
    """
    payload = FilterByAuthorInput(
        author=author,
        include_chunk_counts=include_chunk_counts,
    )
    response = await filter_by_author_handler(payload)
    return response.model_dump(mode='json')
@mcp.tool()
async def delete_document(
    source_id: str,
    confirm: bool = False,
) -> Dict[str, Any]:
    """
    Permanently remove a document and all derived data from Weaviate.

    Deletes the Document object plus every associated Chunk and Summary.
    An explicit confirm=True is required as a guard against accidental
    deletion.

    IMPORTANT: This operation is irreversible. Use with caution.

    Args:
        source_id: Document source ID to delete (e.g., "platon-menon").
        confirm: Must be True for the deletion to proceed (default False).

    Returns:
        Dictionary with ``success`` flag, the deleted ``source_id``,
        ``chunks_deleted``, ``summaries_deleted``, and an ``error``
        message on failure.

    Example:
        Confirmed deletion::

            delete_document(source_id="platon-menon", confirm=True)

        Without confirm=True the call fails safely::

            delete_document(source_id="platon-menon")
            # Returns: {"success": false, "error": "Confirmation required..."}
    """
    payload = DeleteDocumentInput(
        source_id=source_id,
        confirm=confirm,
    )
    response = await delete_document_handler(payload)
    return response.model_dump(mode='json')
# =============================================================================
# Signal Handlers
# =============================================================================
def handle_shutdown(signum: int, frame: object) -> None:
    """
    Exit the process cleanly when a termination signal arrives.

    Args:
        signum: Signal number received.
        frame: Current stack frame (unused).
    """
    # Log only if the lifespan has already installed the logger.
    if logger is not None:
        logger.info(f"Received signal {signum}, initiating graceful shutdown...")
    sys.exit(0)
# =============================================================================
# Main Entry Point
# =============================================================================
def main() -> None:
    """
    Entry point: install signal handlers, then serve over stdio.
    """
    # SIGINT/SIGTERM trigger a clean, logged shutdown via handle_shutdown.
    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, handle_shutdown)
    # stdio is the default MCP transport, used by clients like Claude Desktop.
    mcp.run(transport="stdio")


if __name__ == "__main__":
    main()