Add Library RAG project and clean up root directory

- Add complete Library RAG application (Flask + MCP server)
  - PDF processing pipeline with OCR and LLM extraction
  - Weaviate vector database integration (BGE-M3 embeddings)
  - Flask web interface with search and document management
  - MCP server for Claude Desktop integration
  - Comprehensive test suite (134 tests)

- Clean up root directory
  - Remove obsolete documentation files
  - Remove backup and temporary files
  - Update autonomous agent configuration

- Update prompts
  - Enhance the "initializer bis" prompt with clearer instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-30 11:57:12 +01:00
parent 48470236da
commit d2f7165120
84 changed files with 26517 additions and 2 deletions

@@ -0,0 +1,335 @@
"""Parsing tools for Library RAG MCP Server.
This module implements the parse_pdf tool with optimal pre-configured parameters
for PDF ingestion into the Library RAG system.
The tool uses fixed optimal parameters:
- llm_provider: "mistral" (API-based, fast)
- llm_model: "mistral-medium-latest" (best quality/cost ratio)
- use_semantic_chunking: True (LLM-based intelligent chunking)
- use_ocr_annotations: True (3x cost but better TOC extraction)
- ingest_to_weaviate: True (automatic vectorization and storage)
Example:
The parse_pdf tool can be invoked via MCP with a simple path::
{
"tool": "parse_pdf",
"arguments": {
"pdf_path": "/path/to/document.pdf"
}
}
Or with a URL::
{
"tool": "parse_pdf",
"arguments": {
"pdf_path": "https://example.com/document.pdf"
}
}
"""

from __future__ import annotations

import logging
from pathlib import Path
from urllib.parse import urlparse

import httpx

from mcp_tools.schemas import ParsePdfInput, ParsePdfOutput

# Import pdf_pipeline for PDF processing
from utils.pdf_pipeline import process_pdf, process_pdf_bytes
from utils.types import LLMProvider

# Logger for this module
logger = logging.getLogger(__name__)

# =============================================================================
# Constants - Fixed Optimal Parameters
# =============================================================================

# LLM provider configuration (Mistral API for best results)
FIXED_LLM_PROVIDER: LLMProvider = "mistral"
FIXED_LLM_MODEL = "mistral-medium-latest"

# Processing options (optimal settings for quality)
FIXED_USE_SEMANTIC_CHUNKING = True
FIXED_USE_OCR_ANNOTATIONS = True
FIXED_INGEST_TO_WEAVIATE = True

# Additional processing flags
FIXED_USE_LLM = True

# Note: the following flags are not supported by process_pdf() and should not be used
# FIXED_CLEAN_CHUNKS = True
# FIXED_EXTRACT_CONCEPTS = True
# FIXED_VALIDATE_OUTPUT = True
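
# These constants are passed verbatim to process_pdf() / process_pdf_bytes()
# in parse_pdf_handler below, so changing them here changes every parse_pdf
# invocation.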

# =============================================================================
# Helper Functions
# =============================================================================


def is_url(path: str) -> bool:
    """Check if a path is a URL.

    Args:
        path: The path or URL string to check.

    Returns:
        True if the path is a valid HTTP/HTTPS URL, False otherwise.

    Example:
        >>> is_url("https://example.com/doc.pdf")
        True
        >>> is_url("/path/to/doc.pdf")
        False
    """
    try:
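        # urlparse can raise ValueError on malformed input (for example, an
        # invalid IPv6 address in brackets); such input is simply not a URL.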
        result = urlparse(path)
        return result.scheme in ("http", "https")
    except ValueError:
        return False


async def download_pdf(url: str, timeout: float = 60.0) -> bytes:
    """Download a PDF file from a URL.

    Args:
        url: The URL to download from. Must be HTTP or HTTPS.
        timeout: Maximum time in seconds to wait for the download.
            Defaults to 60 seconds.

    Returns:
        Raw bytes content of the downloaded PDF file.

    Raises:
        httpx.HTTPError: If the download fails (network error, HTTP error, etc.).
        ValueError: If the URL is invalid or not accessible.

    Example:
        >>> pdf_bytes = await download_pdf("https://example.com/document.pdf")
        >>> len(pdf_bytes) > 0
        True
    """
    logger.info(f"Downloading PDF from: {url}")
    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        response = await client.get(url)
        response.raise_for_status()
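        # Lenient content check: warn rather than fail when the response does
        # not look like a PDF, since some servers mislabel the Content-Type.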
        content_type = response.headers.get("content-type", "")
        if "application/pdf" not in content_type.lower() and not url.lower().endswith(
            ".pdf"
        ):
            logger.warning(
                f"URL may not be a PDF (Content-Type: {content_type}), proceeding anyway"
            )
        logger.info(f"Downloaded {len(response.content)} bytes from {url}")
        return response.content


def extract_filename_from_url(url: str) -> str:
    """Extract a filename from a URL.

    Args:
        url: The URL to extract a filename from.

    Returns:
        The last path component of the URL. A ".pdf" extension is appended
        when that component has no extension. Falls back to "downloaded.pdf"
        if the URL path contains no usable filename.

    Example:
        >>> extract_filename_from_url("https://example.com/documents/kant.pdf")
        'kant.pdf'
        >>> extract_filename_from_url("https://example.com/api/download")
        'download.pdf'
    """
    parsed = urlparse(url)
    path = parsed.path
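    # Note: parsed.path never includes the query string, so something like
    # "?id=3" cannot leak into the filename.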
    if path:
        # Get the last path component
        filename = path.split("/")[-1]
        if filename and "." in filename:
            return filename
        if filename:
            return f"{filename}.pdf"
    return "downloaded.pdf"


# =============================================================================
# Main Tool Implementation
# =============================================================================
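

def _failure_output(error: str) -> ParsePdfOutput:
    """Build a ParsePdfOutput describing a failed run, with all counts zeroed.

    Factored out of the three identical failure returns in parse_pdf_handler.
    """
    return ParsePdfOutput(
        success=False,
        document_name="",
        source_id="",
        pages=0,
        chunks_count=0,
        cost_ocr=0.0,
        cost_llm=0.0,
        cost_total=0.0,
        output_dir="",
        metadata={},
        error=error,
    )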


async def parse_pdf_handler(input_data: ParsePdfInput) -> ParsePdfOutput:
    """Process a PDF document with optimal pre-configured parameters.

    This is the main handler for the parse_pdf MCP tool. It processes PDFs
    through the Library RAG pipeline with the following fixed optimal settings:

    - LLM: Mistral API (mistral-medium-latest) for fast, high-quality processing
    - OCR: Mistral OCR with annotations (better TOC extraction, 3x cost)
    - Chunking: Semantic LLM-based chunking (argumentative units)
    - Ingestion: Automatic Weaviate vectorization and storage

    The tool accepts either a local file path or a URL. URLs are automatically
    downloaded before processing.

    Args:
        input_data: Validated input containing pdf_path (local path or URL).

    Returns:
        ParsePdfOutput containing processing results, including:

        - success: Whether processing completed successfully
        - document_name: Name of the processed document
        - source_id: Unique identifier for retrieval
        - pages: Number of pages processed
        - chunks_count: Number of chunks created
        - cost_ocr: OCR cost in EUR
        - cost_llm: LLM cost in EUR
        - cost_total: Total processing cost in EUR
        - output_dir: Directory containing output files
        - metadata: Extracted document metadata
        - error: Error message if processing failed

    Example:
        >>> input_data = ParsePdfInput(pdf_path="/docs/aristotle.pdf")
        >>> result = await parse_pdf_handler(input_data)
        >>> result.success
        True
        >>> result.chunks_count > 0
        True
    """
    pdf_path = input_data.pdf_path
    logger.info(f"parse_pdf called with: {pdf_path}")

    try:
        # Determine whether the input is a URL or a local path
        if is_url(pdf_path):
            # Download the PDF from the URL, then process it from bytes
            logger.info(f"Detected URL input, downloading: {pdf_path}")
            pdf_bytes = await download_pdf(pdf_path)
            filename = extract_filename_from_url(pdf_path)
            result = process_pdf_bytes(
                file_bytes=pdf_bytes,
                filename=filename,
                output_dir=Path("output"),
                llm_provider=FIXED_LLM_PROVIDER,
                use_llm=FIXED_USE_LLM,
                llm_model=FIXED_LLM_MODEL,
                use_semantic_chunking=FIXED_USE_SEMANTIC_CHUNKING,
                use_ocr_annotations=FIXED_USE_OCR_ANNOTATIONS,
                ingest_to_weaviate=FIXED_INGEST_TO_WEAVIATE,
            )
        else:
            # Process a local file; fail fast if it does not exist
            local_path = Path(pdf_path)
            if not local_path.exists():
                logger.error(f"PDF file not found: {pdf_path}")
                return _failure_output(f"PDF file not found: {pdf_path}")
            logger.info(f"Processing local file: {local_path}")
            result = process_pdf(
                pdf_path=local_path,
                output_dir=Path("output"),
                use_llm=FIXED_USE_LLM,
                llm_provider=FIXED_LLM_PROVIDER,
                llm_model=FIXED_LLM_MODEL,
                use_semantic_chunking=FIXED_USE_SEMANTIC_CHUNKING,
                use_ocr_annotations=FIXED_USE_OCR_ANNOTATIONS,
                ingest_to_weaviate=FIXED_INGEST_TO_WEAVIATE,
            )

        # Convert the pipeline result (a plain dict) to the output schema
        success = result.get("success", False)
        document_name = result.get("document_name", "")
        source_id = result.get("source_id", document_name)

        # Extract costs
        cost_ocr = result.get("cost_ocr", 0.0)
        cost_llm = result.get("cost_llm", 0.0)
        cost_total = result.get("cost_total", cost_ocr + cost_llm)

        # Extract metadata
        metadata_raw = result.get("metadata", {})
        if metadata_raw is None:
            metadata_raw = {}

        # Build output
        output = ParsePdfOutput(
            success=success,
            document_name=document_name,
            source_id=source_id,
            pages=result.get("pages", 0),
            chunks_count=result.get("chunks_count", 0),
            cost_ocr=cost_ocr,
            cost_llm=cost_llm,
            cost_total=cost_total,
            output_dir=str(result.get("output_dir", "")),
            metadata=metadata_raw,
            error=result.get("error"),
        )

        if success:
            logger.info(
                f"Successfully processed {document_name}: "
                f"{output.chunks_count} chunks, {output.cost_total:.4f} EUR"
            )
        else:
            logger.error(f"Failed to process {pdf_path}: {output.error}")
        return output
    except httpx.HTTPError as e:
        logger.error(f"HTTP error downloading PDF: {e}")
        return _failure_output(f"Failed to download PDF: {e}")
    except Exception as e:
        logger.error(f"Error processing PDF: {e}", exc_info=True)
        return _failure_output(f"Processing error: {e}")
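

# ---------------------------------------------------------------------------
# Usage sketch (not part of the MCP server wiring): a minimal way to exercise
# parse_pdf_handler from the command line, assuming ParsePdfInput accepts
# pdf_path as a keyword argument, as in the docstring example above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio
    import sys

    # Accept a local path or a URL as the first CLI argument.
    input_path = sys.argv[1] if len(sys.argv) > 1 else "/path/to/document.pdf"
    out = asyncio.run(parse_pdf_handler(ParsePdfInput(pdf_path=input_path)))
    print(
        f"success={out.success} chunks={out.chunks_count} "
        f"cost={out.cost_total:.4f} EUR error={out.error}"
    )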