Add Library RAG project and cleanup root directory
- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
335
generations/library_rag/mcp_tools/parsing_tools.py
Normal file
335
generations/library_rag/mcp_tools/parsing_tools.py
Normal file
@@ -0,0 +1,335 @@
|
||||
"""Parsing tools for Library RAG MCP Server.
|
||||
|
||||
This module implements the parse_pdf tool with optimal pre-configured parameters
|
||||
for PDF ingestion into the Library RAG system.
|
||||
|
||||
The tool uses fixed optimal parameters:
|
||||
- llm_provider: "mistral" (API-based, fast)
|
||||
- llm_model: "mistral-medium-latest" (best quality/cost ratio)
|
||||
- use_semantic_chunking: True (LLM-based intelligent chunking)
|
||||
- use_ocr_annotations: True (3x cost but better TOC extraction)
|
||||
- ingest_to_weaviate: True (automatic vectorization and storage)
|
||||
|
||||
Example:
|
||||
The parse_pdf tool can be invoked via MCP with a simple path::
|
||||
|
||||
{
|
||||
"tool": "parse_pdf",
|
||||
"arguments": {
|
||||
"pdf_path": "/path/to/document.pdf"
|
||||
}
|
||||
}
|
||||
|
||||
Or with a URL::
|
||||
|
||||
{
|
||||
"tool": "parse_pdf",
|
||||
"arguments": {
|
||||
"pdf_path": "https://example.com/document.pdf"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Literal
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import httpx
|
||||
|
||||
from mcp_tools.schemas import ParsePdfInput, ParsePdfOutput
|
||||
|
||||
# Import pdf_pipeline for PDF processing
|
||||
from utils.pdf_pipeline import process_pdf, process_pdf_bytes
|
||||
from utils.types import LLMProvider
|
||||
|
||||
# Logger for this module
logger = logging.getLogger(__name__)

# =============================================================================
# Constants - Fixed Optimal Parameters
# =============================================================================

# LLM provider configuration (Mistral API for best results)
FIXED_LLM_PROVIDER: LLMProvider = "mistral"
FIXED_LLM_MODEL = "mistral-medium-latest"

# Processing options (optimal settings for quality)
# Semantic chunking: LLM-based splitting into argumentative units.
FIXED_USE_SEMANTIC_CHUNKING = True
# OCR annotations: roughly 3x OCR cost, but yields better TOC extraction.
FIXED_USE_OCR_ANNOTATIONS = True
# Automatically vectorize and store results in Weaviate after parsing.
FIXED_INGEST_TO_WEAVIATE = True

# Additional processing flags
FIXED_USE_LLM = True
# Note: The following flags are not supported by process_pdf() and should not be used
# FIXED_CLEAN_CHUNKS = True
# FIXED_EXTRACT_CONCEPTS = True
# FIXED_VALIDATE_OUTPUT = True
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def is_url(path: str) -> bool:
    """Determine whether *path* is an HTTP or HTTPS URL.

    Args:
        path: A filesystem path or URL string.

    Returns:
        True for http/https URLs; False for anything else, including
        local paths and strings that cannot be parsed at all.

    Example:
        >>> is_url("https://example.com/doc.pdf")
        True
        >>> is_url("/path/to/doc.pdf")
        False
    """
    try:
        scheme = urlparse(path).scheme
    except ValueError:
        # Unparseable input is treated as "not a URL" rather than an error.
        return False
    return scheme == "http" or scheme == "https"
|
||||
|
||||
|
||||
async def download_pdf(url: str, timeout: float = 60.0) -> bytes:
    """Fetch a PDF from *url* and return its raw bytes.

    Args:
        url: The URL to download from. Must be HTTP or HTTPS.
        timeout: Maximum time in seconds to wait for the download.
            Defaults to 60 seconds.

    Returns:
        Raw bytes content of the downloaded PDF file.

    Raises:
        httpx.HTTPError: If the download fails (network error, HTTP error, etc.).
        ValueError: If the URL is invalid or not accessible.

    Example:
        >>> pdf_bytes = await download_pdf("https://example.com/document.pdf")
        >>> len(pdf_bytes) > 0
        True
    """
    logger.info(f"Downloading PDF from: {url}")

    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        response = await client.get(url)
        response.raise_for_status()

        # Best-effort sanity check: warn (do not fail) when neither the
        # Content-Type header nor the URL suffix suggests a PDF.
        content_type = response.headers.get("content-type", "")
        looks_like_pdf = (
            "application/pdf" in content_type.lower()
            or url.lower().endswith(".pdf")
        )
        if not looks_like_pdf:
            logger.warning(
                f"URL may not be a PDF (Content-Type: {content_type}), proceeding anyway"
            )

        logger.info(f"Downloaded {len(response.content)} bytes from {url}")
        return response.content
|
||||
|
||||
|
||||
def extract_filename_from_url(url: str) -> str:
    """Derive a filename from a URL, suitable for saving a downloaded PDF.

    Args:
        url: The URL to extract a filename from.

    Returns:
        The last path component of the URL. If that component has no
        extension, ".pdf" is appended. Falls back to "downloaded.pdf"
        when the URL has no usable path component at all.

    Example:
        >>> extract_filename_from_url("https://example.com/documents/kant.pdf")
        'kant.pdf'
        >>> extract_filename_from_url("https://example.com/api/download")
        'download.pdf'
        >>> extract_filename_from_url("https://example.com/")
        'downloaded.pdf'
    """
    parsed = urlparse(url)
    path = parsed.path

    if path:
        # The last path component is the best candidate for a filename.
        filename = path.split("/")[-1]
        if filename and "." in filename:
            return filename
        if filename:
            # Bug fix: the original returned the literal f"(unknown).pdf"
            # (an f-string with no placeholder), discarding the extracted
            # name. Append ".pdf" to the extension-less component instead.
            return f"{filename}.pdf"

    return "downloaded.pdf"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Main Tool Implementation
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _error_output(error: str) -> ParsePdfOutput:
    """Build a zeroed ParsePdfOutput that carries only an error message.

    Factored out because the handler previously duplicated this
    eleven-field constructor call verbatim in three error paths.
    """
    return ParsePdfOutput(
        success=False,
        document_name="",
        source_id="",
        pages=0,
        chunks_count=0,
        cost_ocr=0.0,
        cost_llm=0.0,
        cost_total=0.0,
        output_dir="",
        metadata={},
        error=error,
    )


async def parse_pdf_handler(input_data: ParsePdfInput) -> ParsePdfOutput:
    """Process a PDF document with optimal pre-configured parameters.

    This is the main handler for the parse_pdf MCP tool. It processes PDFs
    through the Library RAG pipeline with the following fixed optimal settings:

    - LLM: Mistral API (mistral-medium-latest) for fast, high-quality processing
    - OCR: Mistral OCR with annotations (better TOC extraction, 3x cost)
    - Chunking: Semantic LLM-based chunking (argumentative units)
    - Ingestion: Automatic Weaviate vectorization and storage

    The tool accepts either a local file path or a URL. URLs are automatically
    downloaded before processing.

    Args:
        input_data: Validated input containing pdf_path (local path or URL).

    Returns:
        ParsePdfOutput containing processing results including:
        - success: Whether processing completed successfully
        - document_name: Name of the processed document
        - source_id: Unique identifier for retrieval
        - pages: Number of pages processed
        - chunks_count: Number of chunks created
        - cost_ocr: OCR cost in EUR
        - cost_llm: LLM cost in EUR
        - cost_total: Total processing cost
        - output_dir: Directory containing output files
        - metadata: Extracted document metadata
        - error: Error message if processing failed

    Example:
        >>> input_data = ParsePdfInput(pdf_path="/docs/aristotle.pdf")
        >>> result = await parse_pdf_handler(input_data)
        >>> result.success
        True
        >>> result.chunks_count > 0
        True
    """
    pdf_path = input_data.pdf_path
    logger.info(f"parse_pdf called with: {pdf_path}")

    try:
        if is_url(pdf_path):
            # URL input: download first, then process from bytes.
            logger.info(f"Detected URL input, downloading: {pdf_path}")
            pdf_bytes = await download_pdf(pdf_path)
            filename = extract_filename_from_url(pdf_path)

            result = process_pdf_bytes(
                file_bytes=pdf_bytes,
                filename=filename,
                output_dir=Path("output"),
                llm_provider=FIXED_LLM_PROVIDER,
                use_llm=FIXED_USE_LLM,
                llm_model=FIXED_LLM_MODEL,
                use_semantic_chunking=FIXED_USE_SEMANTIC_CHUNKING,
                use_ocr_annotations=FIXED_USE_OCR_ANNOTATIONS,
                ingest_to_weaviate=FIXED_INGEST_TO_WEAVIATE,
            )
        else:
            # Local file input: validate existence before processing.
            local_path = Path(pdf_path)
            if not local_path.exists():
                logger.error(f"PDF file not found: {pdf_path}")
                return _error_output(f"PDF file not found: {pdf_path}")

            logger.info(f"Processing local file: {local_path}")
            result = process_pdf(
                pdf_path=local_path,
                output_dir=Path("output"),
                use_llm=FIXED_USE_LLM,
                llm_provider=FIXED_LLM_PROVIDER,
                llm_model=FIXED_LLM_MODEL,
                use_semantic_chunking=FIXED_USE_SEMANTIC_CHUNKING,
                use_ocr_annotations=FIXED_USE_OCR_ANNOTATIONS,
                ingest_to_weaviate=FIXED_INGEST_TO_WEAVIATE,
            )

        # Convert pipeline result (a dict) to the output schema.
        success = result.get("success", False)
        document_name = result.get("document_name", "")
        # source_id defaults to the document name when the pipeline did
        # not produce a separate identifier.
        source_id = result.get("source_id", document_name)

        # Extract costs; cost_total falls back to the sum of components.
        cost_ocr = result.get("cost_ocr", 0.0)
        cost_llm = result.get("cost_llm", 0.0)
        cost_total = result.get("cost_total", cost_ocr + cost_llm)

        # Normalize metadata: an explicit None becomes an empty dict.
        metadata_raw = result.get("metadata", {})
        if metadata_raw is None:
            metadata_raw = {}

        output = ParsePdfOutput(
            success=success,
            document_name=document_name,
            source_id=source_id,
            pages=result.get("pages", 0),
            chunks_count=result.get("chunks_count", 0),
            cost_ocr=cost_ocr,
            cost_llm=cost_llm,
            cost_total=cost_total,
            output_dir=str(result.get("output_dir", "")),
            metadata=metadata_raw,
            error=result.get("error"),
        )

        if success:
            logger.info(
                f"Successfully processed {document_name}: "
                f"{output.chunks_count} chunks, {output.cost_total:.4f} EUR"
            )
        else:
            logger.error(f"Failed to process {pdf_path}: {output.error}")

        return output

    except httpx.HTTPError as e:
        # Download-specific failure (network error, bad status, timeout).
        logger.error(f"HTTP error downloading PDF: {e}")
        return _error_output(f"Failed to download PDF: {e}")
    except Exception as e:
        # Catch-all boundary: this is an MCP tool handler, so errors are
        # reported in the output schema rather than propagated.
        logger.error(f"Error processing PDF: {e}", exc_info=True)
        return _error_output(f"Processing error: {str(e)}")
|
||||
Reference in New Issue
Block a user