# linear-coding-agent/generations/library_rag/mcp_tools/parsing_tools.py

"""Parsing tools for Library RAG MCP Server.

This module implements the parse_pdf tool with optimal pre-configured parameters
for PDF ingestion into the Library RAG system.

The tool uses fixed optimal parameters:
    - llm_provider: "mistral" (API-based, fast)
    - llm_model: "mistral-medium-latest" (best quality/cost ratio)
    - use_semantic_chunking: True (LLM-based intelligent chunking)
    - use_ocr_annotations: True (3x cost but better TOC extraction)
    - ingest_to_weaviate: True (automatic vectorization and storage)

Example:
    The parse_pdf tool can be invoked via MCP with a simple path::

        {
            "tool": "parse_pdf",
            "arguments": {
                "pdf_path": "/path/to/document.pdf"
            }
        }

    Or with a URL::

        {
            "tool": "parse_pdf",
            "arguments": {
                "pdf_path": "https://example.com/document.pdf"
            }
        }
"""

from __future__ import annotations

import logging
import tempfile
from pathlib import Path
from typing import Any, Dict, Literal
from urllib.parse import urlparse

import httpx

from mcp_tools.schemas import ParsePdfInput, ParsePdfOutput

# Import pdf_pipeline for PDF processing
from utils.pdf_pipeline import process_pdf, process_pdf_bytes
from utils.types import LLMProvider

# Module-level logger, named after this module's import path
logger = logging.getLogger(__name__)

# =============================================================================
# Constants - Fixed Optimal Parameters
# =============================================================================

# LLM provider and model handed to the PDF pipeline on every call.
# The module docstring describes "mistral" as API-based and
# "mistral-medium-latest" as the best quality/cost ratio.
FIXED_LLM_PROVIDER: LLMProvider = "mistral"
FIXED_LLM_MODEL = "mistral-medium-latest"

# Processing options, all enabled: LLM-based semantic chunking, OCR
# annotations (documented above as ~3x cost for better TOC extraction),
# and automatic ingestion into Weaviate.
FIXED_USE_SEMANTIC_CHUNKING = True
FIXED_USE_OCR_ANNOTATIONS = True
FIXED_INGEST_TO_WEAVIATE = True

# Forwarded as the pipeline's `use_llm` argument
FIXED_USE_LLM = True
# NOTE: the flags below are intentionally left disabled because process_pdf()
# does not accept them; they are kept only as a record and must not be passed.
# FIXED_CLEAN_CHUNKS = True
# FIXED_EXTRACT_CONCEPTS = True
# FIXED_VALIDATE_OUTPUT = True


# =============================================================================
# Helper Functions
# =============================================================================


def is_url(path: str) -> bool:
    """Check whether a string is an absolute HTTP/HTTPS URL.

    A string qualifies only when it has an ``http`` or ``https`` scheme
    *and* a network location (host). Requiring the host keeps hostless
    strings such as ``"http:doc.pdf"`` or ``"https:///doc.pdf"`` from being
    treated as downloadable URLs.

    Args:
        path: The path or URL string to check.

    Returns:
        True if the string is an HTTP/HTTPS URL with a host, False otherwise
        (including when the string cannot be parsed at all).

    Example:
        >>> is_url("https://example.com/doc.pdf")
        True
        >>> is_url("/path/to/doc.pdf")
        False
        >>> is_url("http:doc.pdf")
        False
    """
    try:
        parsed = urlparse(path)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. unbalanced IPv6 brackets)
        return False

    # urlparse lower-cases the scheme, so "HTTP://..." is accepted as well
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)


async def download_pdf(url: str, timeout: float = 60.0) -> bytes:
    """Download a PDF file from a URL.

    Redirects are followed. The payload is not validated as a PDF: when
    neither the ``Content-Type`` header nor the URL path suggests a PDF, a
    warning is logged and the body is returned anyway.

    Args:
        url: The URL to download from. Must be HTTP or HTTPS.
        timeout: Timeout in seconds passed to the HTTP client.
            Defaults to 60 seconds.

    Returns:
        Raw bytes of the response body.

    Raises:
        httpx.HTTPError: If the request fails or the server answers with an
            error status (raised via ``raise_for_status``).

    Example:
        >>> pdf_bytes = await download_pdf("https://example.com/document.pdf")
        >>> len(pdf_bytes) > 0
        True
    """
    logger.info(f"Downloading PDF from: {url}")

    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        response = await client.get(url)
        response.raise_for_status()

        content_type = response.headers.get("content-type", "")

        # Inspect only the URL *path*: a query string or fragment after
        # "doc.pdf" must not hide the extension, and a ".pdf" that merely
        # appears inside a query parameter must not count as one.
        try:
            url_path = urlparse(url).path
        except ValueError:
            # Fall back to the raw string if urllib cannot parse the URL
            url_path = url

        looks_like_pdf = (
            "application/pdf" in content_type.lower()
            or url_path.lower().endswith(".pdf")
        )
        if not looks_like_pdf:
            logger.warning(
                f"URL may not be a PDF (Content-Type: {content_type}), proceeding anyway"
            )

        payload = response.content
        logger.info(f"Downloaded {len(payload)} bytes from {url}")
        return payload


def extract_filename_from_url(url: str) -> str:
    """Derive a safe local filename from the last path segment of a URL.

    The URL path is percent-decoded, then only the final segment is kept
    (backslashes are treated as separators too, so a decoded ``%2F`` or
    ``%5C`` cannot smuggle directory components into the result). The
    query string and fragment are ignored.

    Args:
        url: The URL to extract the filename from.

    Returns:
        The final path segment if it contains a dot (whatever its
        extension), the segment with ``.pdf`` appended if it has no dot, or
        ``"downloaded.pdf"`` when the path has no usable final segment
        (empty path, trailing slash, ``.`` or ``..``).

    Example:
        >>> extract_filename_from_url("https://example.com/documents/kant.pdf")
        'kant.pdf'
        >>> extract_filename_from_url("https://example.com/api/download")
        'download.pdf'
        >>> extract_filename_from_url("https://example.com/docs/")
        'downloaded.pdf'
    """
    # Local import: the file's top-level imports only bring in urlparse
    from urllib.parse import unquote

    decoded_path = unquote(urlparse(url).path)

    # Keep only the last component, whichever separator style was used
    filename = decoded_path.replace("\\", "/").rsplit("/", 1)[-1].strip()

    # "." and ".." contain a dot but are directory references, not filenames
    if not filename or filename in (".", ".."):
        return "downloaded.pdf"

    if "." in filename:
        return filename
    return f"{filename}.pdf"


# =============================================================================
# Main Tool Implementation
# =============================================================================


def _failed_output(error: str) -> ParsePdfOutput:
    """Build the ParsePdfOutput returned for any failure: zeroed fields plus *error*."""
    return ParsePdfOutput(
        success=False,
        document_name="",
        source_id="",
        pages=0,
        chunks_count=0,
        cost_ocr=0.0,
        cost_llm=0.0,
        cost_total=0.0,
        output_dir="",
        metadata={},
        error=error,
    )


async def parse_pdf_handler(input_data: ParsePdfInput) -> ParsePdfOutput:
    """Process a PDF document with optimal pre-configured parameters.

    This is the main handler for the parse_pdf MCP tool. It processes PDFs
    through the Library RAG pipeline with the following fixed optimal settings:

    - LLM: Mistral API (mistral-medium-latest) for fast, high-quality processing
    - OCR: Mistral OCR with annotations (better TOC extraction, 3x cost)
    - Chunking: Semantic LLM-based chunking (argumentative units)
    - Ingestion: Automatic Weaviate vectorization and storage

    The tool accepts either a local file path or a URL. URLs are downloaded
    first and processed from memory; local paths must point to an existing
    regular file. Output is written under the relative directory ``output``.

    This handler never raises: download failures, a missing file and any
    pipeline exception are all reported through the ``error`` field of the
    returned object with ``success=False``.

    NOTE(review): the pipeline functions are called synchronously from this
    coroutine; if they block for long, consider offloading them to a worker
    thread once their thread-safety is confirmed.

    Args:
        input_data: Validated input containing pdf_path (local path or URL).

    Returns:
        ParsePdfOutput containing processing results including:
        - success: Whether processing completed successfully
        - document_name: Name of the processed document
        - source_id: Unique identifier for retrieval (defaults to
          document_name when the pipeline does not provide one)
        - pages: Number of pages processed
        - chunks_count: Number of chunks created
        - cost_ocr: OCR cost in EUR
        - cost_llm: LLM cost in EUR
        - cost_total: Total processing cost (defaults to cost_ocr + cost_llm)
        - output_dir: Directory containing output files
        - metadata: Extracted document metadata
        - error: Error message if processing failed

    Example:
        >>> input_data = ParsePdfInput(pdf_path="/docs/aristotle.pdf")
        >>> result = await parse_pdf_handler(input_data)
        >>> result.success
        True
        >>> result.chunks_count > 0
        True
    """
    pdf_path = input_data.pdf_path
    logger.info(f"parse_pdf called with: {pdf_path}")

    try:
        # Determine if input is a URL or local path
        if is_url(pdf_path):
            # Download PDF from URL
            logger.info(f"Detected URL input, downloading: {pdf_path}")
            pdf_bytes = await download_pdf(pdf_path)
            filename = extract_filename_from_url(pdf_path)

            # Process from bytes
            result = process_pdf_bytes(
                file_bytes=pdf_bytes,
                filename=filename,
                output_dir=Path("output"),
                llm_provider=FIXED_LLM_PROVIDER,
                use_llm=FIXED_USE_LLM,
                llm_model=FIXED_LLM_MODEL,
                use_semantic_chunking=FIXED_USE_SEMANTIC_CHUNKING,
                use_ocr_annotations=FIXED_USE_OCR_ANNOTATIONS,
                ingest_to_weaviate=FIXED_INGEST_TO_WEAVIATE,
            )
        else:
            # Process local file
            local_path = Path(pdf_path)
            # is_file() rather than exists(): a directory is not a usable
            # input and should get the clear "not found" error, not a
            # pipeline crash.
            if not local_path.is_file():
                logger.error(f"PDF file not found: {pdf_path}")
                return _failed_output(f"PDF file not found: {pdf_path}")

            logger.info(f"Processing local file: {local_path}")
            result = process_pdf(
                pdf_path=local_path,
                output_dir=Path("output"),
                use_llm=FIXED_USE_LLM,
                llm_provider=FIXED_LLM_PROVIDER,
                llm_model=FIXED_LLM_MODEL,
                use_semantic_chunking=FIXED_USE_SEMANTIC_CHUNKING,
                use_ocr_annotations=FIXED_USE_OCR_ANNOTATIONS,
                ingest_to_weaviate=FIXED_INGEST_TO_WEAVIATE,
            )

        # Convert pipeline result to output schema
        success = result.get("success", False)
        document_name = result.get("document_name", "")
        source_id = result.get("source_id", document_name)

        # Extract costs. `or 0.0` also covers keys that are present but None,
        # which would otherwise make the fallback sum below raise TypeError
        # and turn a successful run into a reported failure.
        cost_ocr = result.get("cost_ocr") or 0.0
        cost_llm = result.get("cost_llm") or 0.0
        cost_total = result.get("cost_total")
        if cost_total is None:
            cost_total = cost_ocr + cost_llm

        # Build output
        output = ParsePdfOutput(
            success=success,
            document_name=document_name,
            source_id=source_id,
            pages=result.get("pages") or 0,
            chunks_count=result.get("chunks_count") or 0,
            cost_ocr=cost_ocr,
            cost_llm=cost_llm,
            cost_total=cost_total,
            # Avoid the literal string "None" when the pipeline reports no dir
            output_dir=str(result.get("output_dir") or ""),
            metadata=result.get("metadata") or {},
            error=result.get("error"),
        )

        if success:
            logger.info(
                f"Successfully processed {document_name}: "
                f"{output.chunks_count} chunks, {output.cost_total:.4f} EUR"
            )
        else:
            logger.error(f"Failed to process {pdf_path}: {output.error}")

        return output

    except httpx.HTTPError as e:
        logger.error(f"HTTP error downloading PDF: {e}")
        return _failed_output(f"Failed to download PDF: {e}")
    except Exception as e:
        # Tool boundary: report the failure to the MCP caller instead of raising
        logger.exception(f"Error processing PDF: {e}")
        return _failed_output(f"Processing error: {str(e)}")