- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
336 lines
11 KiB
Python
336 lines
11 KiB
Python
"""Parsing tools for Library RAG MCP Server.

This module implements the parse_pdf tool with optimal pre-configured parameters
for PDF ingestion into the Library RAG system.

The tool uses fixed optimal parameters:
    - llm_provider: "mistral" (API-based, fast)
    - llm_model: "mistral-medium-latest" (best quality/cost ratio)
    - use_semantic_chunking: True (LLM-based intelligent chunking)
    - use_ocr_annotations: True (3x cost but better TOC extraction)
    - ingest_to_weaviate: True (automatic vectorization and storage)

Example:
    The parse_pdf tool can be invoked via MCP with a simple path::

        {
            "tool": "parse_pdf",
            "arguments": {
                "pdf_path": "/path/to/document.pdf"
            }
        }

    Or with a URL::

        {
            "tool": "parse_pdf",
            "arguments": {
                "pdf_path": "https://example.com/document.pdf"
            }
        }
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import tempfile
|
|
from pathlib import Path
|
|
from typing import Any, Dict, Literal
|
|
from urllib.parse import urlparse
|
|
|
|
import httpx
|
|
|
|
from mcp_tools.schemas import ParsePdfInput, ParsePdfOutput
|
|
|
|
# Import pdf_pipeline for PDF processing
|
|
from utils.pdf_pipeline import process_pdf, process_pdf_bytes
|
|
from utils.types import LLMProvider
|
|
|
|
# Module-level logger, named after this module so log records can be filtered
# per module.
logger = logging.getLogger(__name__)

# =============================================================================
# Constants - Fixed Optimal Parameters
# =============================================================================
# parse_pdf_handler() passes these values unchanged to process_pdf() and
# process_pdf_bytes(); callers of the tool cannot override them.

# LLM provider configuration (Mistral API for best results)
FIXED_LLM_PROVIDER: LLMProvider = "mistral"
FIXED_LLM_MODEL: str = "mistral-medium-latest"

# Processing options (optimal settings for quality)
FIXED_USE_SEMANTIC_CHUNKING: bool = True  # LLM-based chunking
FIXED_USE_OCR_ANNOTATIONS: bool = True  # documented above as 3x OCR cost, better TOC extraction
FIXED_INGEST_TO_WEAVIATE: bool = True  # store the result in Weaviate after processing

# Additional processing flags
FIXED_USE_LLM: bool = True
# Note: The following flags are not supported by process_pdf() and should not be used.
# They are kept commented out only as a reminder not to reintroduce them.
# FIXED_CLEAN_CHUNKS = True
# FIXED_EXTRACT_CONCEPTS = True
# FIXED_VALIDATE_OUTPUT = True
|
|
|
|
|
|
# =============================================================================
|
|
# Helper Functions
|
|
# =============================================================================
|
|
|
|
|
|
def is_url(path: str) -> bool:
    """Check whether a string is an absolute HTTP or HTTPS URL.

    A string counts as a URL only when it has an ``http``/``https`` scheme
    AND a network location (host). Scheme-only strings such as ``"http://"``
    or ``"https:"`` are rejected, because there is nothing to download from.

    Args:
        path: The path or URL string to check.

    Returns:
        True if the string is an HTTP/HTTPS URL with a host, False otherwise
        (local paths, other schemes, malformed URLs).

    Example:
        >>> is_url("https://example.com/doc.pdf")
        True
        >>> is_url("/path/to/doc.pdf")
        False
        >>> is_url("http://")
        False
    """
    try:
        parsed = urlparse(path)
    except ValueError:
        # urlparse raises ValueError for some malformed inputs, e.g. an
        # unbalanced IPv6 bracket in the host part.
        return False

    # urlparse lower-cases the scheme, so "HTTPS://..." is accepted too.
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)
|
|
|
|
|
|
async def download_pdf(url: str, timeout: float = 60.0) -> bytes:
    """Fetch the body of a URL and return it as raw bytes.

    Redirects are followed. The response is not verified to be a PDF: when
    neither the Content-Type header mentions ``application/pdf`` nor the URL
    ends in ``.pdf``, a warning is logged and the bytes are returned anyway.

    Args:
        url: The HTTP or HTTPS URL to download from.
        timeout: Timeout in seconds handed to the HTTP client.
            Defaults to 60 seconds.

    Returns:
        Raw bytes of the response body.

    Raises:
        httpx.HTTPError: If the request fails or the server answers with an
            error status (raised by ``raise_for_status``).

    Example:
        >>> pdf_bytes = await download_pdf("https://example.com/document.pdf")
        >>> len(pdf_bytes) > 0
        True
    """
    logger.info(f"Downloading PDF from: {url}")

    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as http:
        reply = await http.get(url)
        reply.raise_for_status()

        # Best-effort sanity check only: a mismatch is logged, never fatal.
        content_type = reply.headers.get("content-type", "")
        announces_pdf = "application/pdf" in content_type.lower()
        if not (announces_pdf or url.lower().endswith(".pdf")):
            logger.warning(
                f"URL may not be a PDF (Content-Type: {content_type}), proceeding anyway"
            )

        payload = reply.content
        logger.info(f"Downloaded {len(payload)} bytes from {url}")
        return payload
|
|
|
|
|
|
def extract_filename_from_url(url: str) -> str:
    """Derive a safe local filename from the last path component of a URL.

    The last path segment is percent-decoded and reduced to a bare name, so
    the result never contains a directory separator. A name that already
    contains a dot is returned unchanged (its extension is kept as-is, even
    when it is not ``.pdf``); a name without a dot gets ``.pdf`` appended.

    Args:
        url: The URL to extract the filename from.

    Returns:
        The extracted filename, or ``"downloaded.pdf"`` when the URL path has
        no usable last component (empty path, trailing slash, ``.``/``..``).

    Example:
        >>> extract_filename_from_url("https://example.com/documents/kant.pdf")
        'kant.pdf'
        >>> extract_filename_from_url("https://example.com/api/download")
        'download.pdf'
        >>> extract_filename_from_url("https://example.com/my%20doc.pdf")
        'my doc.pdf'
        >>> extract_filename_from_url("https://example.com/")
        'downloaded.pdf'
    """
    # Local import: the module only imports urlparse at file level.
    from urllib.parse import unquote

    last_segment = urlparse(url).path.rsplit("/", 1)[-1]

    # Decode %XX escapes, then take the basename again: an encoded "/" (%2F)
    # or a backslash must not smuggle a directory component into the name,
    # since the URL is untrusted input and the name is used on disk.
    decoded = unquote(last_segment).replace("\\", "/")
    filename = decoded.rsplit("/", 1)[-1].strip()

    # "." and ".." are path navigation segments, not filenames.
    if not filename or filename in (".", ".."):
        return "downloaded.pdf"

    if "." in filename:
        return filename
    return f"{filename}.pdf"
|
|
|
|
|
|
# =============================================================================
|
|
# Main Tool Implementation
|
|
# =============================================================================
|
|
|
|
|
|
def _failed_output(error: str) -> ParsePdfOutput:
    """Build the zeroed ParsePdfOutput shared by every failure path."""
    return ParsePdfOutput(
        success=False,
        document_name="",
        source_id="",
        pages=0,
        chunks_count=0,
        cost_ocr=0.0,
        cost_llm=0.0,
        cost_total=0.0,
        output_dir="",
        metadata={},
        error=error,
    )


async def parse_pdf_handler(input_data: ParsePdfInput) -> ParsePdfOutput:
    """Process a PDF document with the module's fixed pipeline parameters.

    This is the main handler for the parse_pdf MCP tool. The input is either
    a local file path or an HTTP(S) URL; URLs are downloaded first and
    processed from memory. Both branches run the pipeline with the
    module-level ``FIXED_*`` constants (Mistral provider and model, semantic
    chunking, OCR annotations, Weaviate ingestion) and write to the relative
    ``output`` directory.

    The handler never raises: download errors, a missing local file and any
    other exception are reported through ``success=False`` and ``error``.

    Args:
        input_data: Validated input containing pdf_path (local path or URL).

    Returns:
        ParsePdfOutput containing processing results including:

        - success: Whether processing completed successfully
        - document_name: Name of the processed document
        - source_id: Identifier for retrieval (falls back to document_name)
        - pages: Number of pages processed
        - chunks_count: Number of chunks created
        - cost_ocr: OCR cost in EUR
        - cost_llm: LLM cost in EUR
        - cost_total: Total cost (falls back to cost_ocr + cost_llm)
        - output_dir: Directory containing output files
        - metadata: Extracted document metadata
        - error: Error message if processing failed

    Example:
        >>> input_data = ParsePdfInput(pdf_path="/docs/aristotle.pdf")
        >>> result = await parse_pdf_handler(input_data)
        >>> result.success
        True
        >>> result.chunks_count > 0
        True
    """
    pdf_path = input_data.pdf_path
    logger.info(f"parse_pdf called with: {pdf_path}")

    try:
        # NOTE(review): process_pdf / process_pdf_bytes are called without
        # await, so they run synchronously inside this coroutine. Consider
        # asyncio.to_thread once the pipeline is confirmed thread-safe.
        if is_url(pdf_path):
            logger.info(f"Detected URL input, downloading: {pdf_path}")
            pdf_bytes = await download_pdf(pdf_path)
            filename = extract_filename_from_url(pdf_path)

            result = process_pdf_bytes(
                file_bytes=pdf_bytes,
                filename=filename,
                output_dir=Path("output"),
                llm_provider=FIXED_LLM_PROVIDER,
                use_llm=FIXED_USE_LLM,
                llm_model=FIXED_LLM_MODEL,
                use_semantic_chunking=FIXED_USE_SEMANTIC_CHUNKING,
                use_ocr_annotations=FIXED_USE_OCR_ANNOTATIONS,
                ingest_to_weaviate=FIXED_INGEST_TO_WEAVIATE,
            )
        else:
            local_path = Path(pdf_path)
            if not local_path.exists():
                logger.error(f"PDF file not found: {pdf_path}")
                return _failed_output(f"PDF file not found: {pdf_path}")

            logger.info(f"Processing local file: {local_path}")
            result = process_pdf(
                pdf_path=local_path,
                output_dir=Path("output"),
                use_llm=FIXED_USE_LLM,
                llm_provider=FIXED_LLM_PROVIDER,
                llm_model=FIXED_LLM_MODEL,
                use_semantic_chunking=FIXED_USE_SEMANTIC_CHUNKING,
                use_ocr_annotations=FIXED_USE_OCR_ANNOTATIONS,
                ingest_to_weaviate=FIXED_INGEST_TO_WEAVIATE,
            )

        # Convert the pipeline result mapping to the output schema.
        success = result.get("success", False)
        document_name = result.get("document_name", "")
        source_id = result.get("source_id", document_name)

        # Treat a missing or None cost as zero. The fallback total is only
        # computed when needed, so a None component cost can no longer raise
        # TypeError and turn a successful run into a reported failure.
        cost_ocr = result.get("cost_ocr") or 0.0
        cost_llm = result.get("cost_llm") or 0.0
        cost_total = result.get("cost_total")
        if cost_total is None:
            cost_total = cost_ocr + cost_llm

        metadata_raw = result.get("metadata", {})
        if metadata_raw is None:
            metadata_raw = {}

        output = ParsePdfOutput(
            success=success,
            document_name=document_name,
            source_id=source_id,
            pages=result.get("pages", 0),
            chunks_count=result.get("chunks_count", 0),
            cost_ocr=cost_ocr,
            cost_llm=cost_llm,
            cost_total=cost_total,
            output_dir=str(result.get("output_dir", "")),
            metadata=metadata_raw,
            error=result.get("error"),
        )

        if success:
            logger.info(
                f"Successfully processed {document_name}: "
                f"{output.chunks_count} chunks, {output.cost_total:.4f} EUR"
            )
        else:
            logger.error(f"Failed to process {pdf_path}: {output.error}")

        return output

    except httpx.HTTPError as e:
        # Only the URL branch can raise this (from download_pdf).
        logger.error(f"HTTP error downloading PDF: {e}")
        return _failed_output(f"Failed to download PDF: {e}")
    except Exception as e:
        # Tool boundary: report any other failure to the MCP client instead
        # of letting it propagate; the traceback goes to the log.
        logger.error(f"Error processing PDF: {e}", exc_info=True)
        return _failed_output(f"Processing error: {str(e)}")
|