Files
linear-coding-agent/generations/library_rag/tests/mcp/test_parsing_tools.py
David Blanc Brioir d2f7165120 Add Library RAG project and cleanup root directory
- Add complete Library RAG application (Flask + MCP server)
  - PDF processing pipeline with OCR and LLM extraction
  - Weaviate vector database integration (BGE-M3 embeddings)
  - Flask web interface with search and document management
  - MCP server for Claude Desktop integration
  - Comprehensive test suite (134 tests)

- Clean up root directory
  - Remove obsolete documentation files
  - Remove backup and temporary files
  - Update autonomous agent configuration

- Update prompts
  - Enhance initializer bis prompt with better instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-30 11:57:12 +01:00

674 lines
25 KiB
Python

"""
Unit tests for MCP parsing tools.
Tests the parse_pdf tool handler with mocked dependencies to ensure:
- Local file processing works correctly
- URL-based PDF downloads work correctly
- Error handling is comprehensive
- Fixed parameters are used correctly
- Cost tracking is accurate
Uses asyncio for async test support.
"""
import asyncio
from pathlib import Path
from typing import Any, Dict
from unittest.mock import AsyncMock, MagicMock, patch
import httpx
import pytest
from mcp_tools.parsing_tools import (
FIXED_LLM_MODEL,
FIXED_LLM_PROVIDER,
FIXED_USE_LLM,
FIXED_USE_OCR_ANNOTATIONS,
FIXED_USE_SEMANTIC_CHUNKING,
download_pdf,
extract_filename_from_url,
is_url,
parse_pdf_handler,
)
from mcp_tools.schemas import ParsePdfInput, ParsePdfOutput
# =============================================================================
# Test is_url Helper Function
# =============================================================================
class TestIsUrl:
    """Checks which input strings ``is_url`` classifies as URLs."""

    def test_https_url(self) -> None:
        """An ``https://`` address is classified as a URL."""
        recognized = is_url("https://example.com/document.pdf")
        assert recognized is True

    def test_http_url(self) -> None:
        """An ``http://`` address is classified as a URL."""
        recognized = is_url("http://example.com/document.pdf")
        assert recognized is True

    def test_local_path_unix(self) -> None:
        """An absolute Unix-style path is not a URL."""
        recognized = is_url("/path/to/document.pdf")
        assert recognized is False

    def test_local_path_windows(self) -> None:
        """A Windows drive-letter path is not a URL."""
        recognized = is_url("C:\\Documents\\document.pdf")
        assert recognized is False

    def test_relative_path(self) -> None:
        """A ``./``-relative path is not a URL."""
        recognized = is_url("./documents/document.pdf")
        assert recognized is False

    def test_ftp_url_not_supported(self) -> None:
        """An ``ftp://`` address is rejected; only HTTP and HTTPS are accepted."""
        recognized = is_url("ftp://example.com/document.pdf")
        assert recognized is False

    def test_empty_string(self) -> None:
        """The empty string is not a URL."""
        recognized = is_url("")
        assert recognized is False
# =============================================================================
# Test extract_filename_from_url Helper Function
# =============================================================================
class TestExtractFilenameFromUrl:
    """Checks the filename that ``extract_filename_from_url`` derives from a URL."""

    def test_url_with_pdf_filename(self) -> None:
        """A path ending in a ``.pdf`` name yields that name."""
        url = "https://example.com/docs/aristotle.pdf"
        assert extract_filename_from_url(url) == "aristotle.pdf"

    def test_url_with_filename_no_extension(self) -> None:
        """A last path segment without an extension comes back with ``.pdf`` added."""
        url = "https://example.com/docs/aristotle"
        assert extract_filename_from_url(url) == "aristotle.pdf"

    def test_url_without_path(self) -> None:
        """A URL whose path is just ``/`` yields ``downloaded.pdf``."""
        url = "https://example.com/"
        assert extract_filename_from_url(url) == "downloaded.pdf"

    def test_url_with_api_endpoint(self) -> None:
        """An endpoint-style URL uses its last segment plus ``.pdf``."""
        url = "https://api.example.com/download"
        assert extract_filename_from_url(url) == "download.pdf"

    def test_url_with_query_params(self) -> None:
        """Query parameters are not included in the extracted filename."""
        url = "https://example.com/docs/kant.pdf?token=abc"
        assert extract_filename_from_url(url) == "kant.pdf"
# =============================================================================
# Test download_pdf Function
# =============================================================================
class TestDownloadPdf:
    """Exercises the async ``download_pdf`` function with ``httpx.AsyncClient`` patched."""

    @staticmethod
    def _install_client(client_factory: MagicMock, client: AsyncMock) -> None:
        """Make ``async with client_factory(...)`` yield *client*."""
        context = client_factory.return_value
        context.__aenter__ = AsyncMock(return_value=client)
        context.__aexit__ = AsyncMock(return_value=None)

    def test_successful_download(self) -> None:
        """The response body is returned and the URL is requested exactly once."""
        url = "https://example.com/document.pdf"

        async def scenario() -> None:
            response = MagicMock()
            response.content = b"%PDF-1.4 test content"
            response.headers = {"content-type": "application/pdf"}
            response.raise_for_status = MagicMock()

            with patch(
                "mcp_tools.parsing_tools.httpx.AsyncClient"
            ) as client_factory:
                client = AsyncMock()
                client.get = AsyncMock(return_value=response)
                self._install_client(client_factory, client)

                downloaded = await download_pdf(url)

                assert downloaded == b"%PDF-1.4 test content"
                client.get.assert_called_once_with(url)

        asyncio.run(scenario())

    def test_download_with_non_pdf_content_type(self) -> None:
        """A non-PDF ``content-type`` header does not stop the body being returned."""

        async def scenario() -> None:
            response = MagicMock()
            response.content = b"%PDF-1.4 test content"
            response.headers = {"content-type": "application/octet-stream"}
            response.raise_for_status = MagicMock()

            with patch(
                "mcp_tools.parsing_tools.httpx.AsyncClient"
            ) as client_factory:
                client = AsyncMock()
                client.get = AsyncMock(return_value=response)
                self._install_client(client_factory, client)

                # The download is still expected to succeed for this header.
                downloaded = await download_pdf("https://example.com/document.pdf")

                assert downloaded == b"%PDF-1.4 test content"

        asyncio.run(scenario())

    def test_download_http_error(self) -> None:
        """An ``httpx.HTTPStatusError`` raised by the client reaches the caller."""

        async def scenario() -> None:
            with patch(
                "mcp_tools.parsing_tools.httpx.AsyncClient"
            ) as client_factory:
                not_found = httpx.HTTPStatusError(
                    "Not Found",
                    request=MagicMock(),
                    response=MagicMock(status_code=404),
                )
                client = AsyncMock()
                client.get = AsyncMock(side_effect=not_found)
                self._install_client(client_factory, client)

                with pytest.raises(httpx.HTTPStatusError):
                    await download_pdf("https://example.com/nonexistent.pdf")

        asyncio.run(scenario())
# =============================================================================
# Test parse_pdf_handler - Local Files
# =============================================================================
class TestParsePdfHandlerLocalFile:
    """Tests for parse_pdf_handler with local file paths.

    ``process_pdf`` is patched wherever the handler is expected to reach the
    pipeline, so the real pipeline function is never invoked. The
    ``temp_pdf_file``, ``successful_pipeline_result`` and
    ``failed_pipeline_result`` arguments are pytest fixtures that are not
    defined in the visible part of this file (presumably conftest — confirm).
    """

    def test_successful_local_file_processing(
        self,
        temp_pdf_file: Path,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test successful processing of a local PDF file."""

        async def run_test() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as mock_process_pdf:
                mock_process_pdf.return_value = successful_pipeline_result
                input_data = ParsePdfInput(pdf_path=str(temp_pdf_file))
                result = await parse_pdf_handler(input_data)

                assert result.success is True
                assert result.document_name == "test-document"
                assert result.pages == 10
                assert result.chunks_count == 25
                assert result.cost_ocr == 0.03
                assert result.cost_llm == 0.05
                assert result.cost_total == 0.08
                assert result.metadata["title"] == "Test Document Title"
                assert result.error is None

        asyncio.run(run_test())

    def test_local_file_uses_fixed_parameters(
        self,
        temp_pdf_file: Path,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test that local file processing uses the fixed optimal parameters."""

        async def run_test() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as mock_process_pdf:
                mock_process_pdf.return_value = successful_pipeline_result
                input_data = ParsePdfInput(pdf_path=str(temp_pdf_file))
                await parse_pdf_handler(input_data)

                # Verify fixed parameters are passed as keyword arguments.
                mock_process_pdf.assert_called_once()
                call_kwargs = mock_process_pdf.call_args.kwargs
                assert call_kwargs["use_llm"] == FIXED_USE_LLM
                assert call_kwargs["llm_provider"] == FIXED_LLM_PROVIDER
                assert call_kwargs["llm_model"] == FIXED_LLM_MODEL
                assert (
                    call_kwargs["use_semantic_chunking"]
                    == FIXED_USE_SEMANTIC_CHUNKING
                )
                assert call_kwargs["use_ocr_annotations"] == FIXED_USE_OCR_ANNOTATIONS

        asyncio.run(run_test())

    def test_file_not_found_error(self) -> None:
        """Test error handling when local file does not exist."""

        async def run_test() -> None:
            input_data = ParsePdfInput(pdf_path="/nonexistent/path/document.pdf")
            result = await parse_pdf_handler(input_data)

            assert result.success is False
            # ``error`` can be None, so check it first: a missing message then
            # fails as a plain assertion instead of an AttributeError on
            # ``.lower()``.
            assert result.error is not None
            assert "not found" in result.error.lower()
            assert result.pages == 0
            assert result.chunks_count == 0

        asyncio.run(run_test())

    def test_pipeline_failure(
        self,
        temp_pdf_file: Path,
        failed_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test handling when the pipeline returns a failure."""

        async def run_test() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as mock_process_pdf:
                mock_process_pdf.return_value = failed_pipeline_result
                input_data = ParsePdfInput(pdf_path=str(temp_pdf_file))
                result = await parse_pdf_handler(input_data)

                assert result.success is False
                # Guard against None so ``in`` cannot raise a TypeError.
                assert result.error is not None
                assert "OCR processing failed" in result.error
                assert result.pages == 0

        asyncio.run(run_test())

    def test_pipeline_exception(
        self,
        temp_pdf_file: Path,
    ) -> None:
        """Test handling when the pipeline raises an exception."""

        async def run_test() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as mock_process_pdf:
                mock_process_pdf.side_effect = RuntimeError("Unexpected error")
                input_data = ParsePdfInput(pdf_path=str(temp_pdf_file))
                result = await parse_pdf_handler(input_data)

                assert result.success is False
                # Guard against None so ``in`` cannot raise a TypeError.
                assert result.error is not None
                assert "Processing error" in result.error
                assert "Unexpected error" in result.error

        asyncio.run(run_test())
# =============================================================================
# Test parse_pdf_handler - URL Downloads
# =============================================================================
class TestParsePdfHandlerUrl:
    """Tests for parse_pdf_handler with URL inputs.

    ``download_pdf`` is replaced by an ``AsyncMock`` in every test, and
    ``process_pdf_bytes`` is patched wherever the handler is expected to
    reach the pipeline. ``sample_pdf_bytes`` and
    ``successful_pipeline_result`` are pytest fixtures that are not defined
    in the visible part of this file (presumably conftest — confirm).
    """

    def test_successful_url_processing(
        self,
        sample_pdf_bytes: bytes,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test successful processing of a PDF from URL."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download:
                with patch(
                    "mcp_tools.parsing_tools.process_pdf_bytes"
                ) as mock_process:
                    mock_download.return_value = sample_pdf_bytes
                    mock_process.return_value = successful_pipeline_result
                    input_data = ParsePdfInput(
                        pdf_path="https://example.com/philosophy/kant.pdf"
                    )
                    result = await parse_pdf_handler(input_data)

                    assert result.success is True
                    assert result.document_name == "test-document"
                    mock_download.assert_called_once_with(
                        "https://example.com/philosophy/kant.pdf"
                    )

        asyncio.run(run_test())

    def test_url_uses_extracted_filename(
        self,
        sample_pdf_bytes: bytes,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test that filename is extracted from URL for processing."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download:
                with patch(
                    "mcp_tools.parsing_tools.process_pdf_bytes"
                ) as mock_process:
                    mock_download.return_value = sample_pdf_bytes
                    mock_process.return_value = successful_pipeline_result
                    input_data = ParsePdfInput(
                        pdf_path="https://example.com/docs/aristotle-metaphysics.pdf"
                    )
                    await parse_pdf_handler(input_data)

                    # Verify filename was extracted and passed as a keyword.
                    mock_process.assert_called_once()
                    call_kwargs = mock_process.call_args.kwargs
                    assert call_kwargs["filename"] == "aristotle-metaphysics.pdf"

        asyncio.run(run_test())

    def test_url_uses_fixed_parameters(
        self,
        sample_pdf_bytes: bytes,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test that URL processing uses the fixed optimal parameters."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download:
                with patch(
                    "mcp_tools.parsing_tools.process_pdf_bytes"
                ) as mock_process:
                    mock_download.return_value = sample_pdf_bytes
                    mock_process.return_value = successful_pipeline_result
                    input_data = ParsePdfInput(
                        pdf_path="https://example.com/document.pdf"
                    )
                    await parse_pdf_handler(input_data)

                    # ``call_args`` is None when the mock was never called;
                    # assert the call first so that case fails clearly.
                    mock_process.assert_called_once()
                    call_kwargs = mock_process.call_args.kwargs
                    # NOTE(review): the local-file test also checks "use_llm";
                    # it is not asserted here — confirm whether
                    # process_pdf_bytes receives it.
                    assert call_kwargs["llm_provider"] == FIXED_LLM_PROVIDER
                    assert call_kwargs["llm_model"] == FIXED_LLM_MODEL
                    assert (
                        call_kwargs["use_semantic_chunking"]
                        == FIXED_USE_SEMANTIC_CHUNKING
                    )
                    assert (
                        call_kwargs["use_ocr_annotations"] == FIXED_USE_OCR_ANNOTATIONS
                    )

        asyncio.run(run_test())

    def test_url_download_http_error(self) -> None:
        """Test error handling when URL download fails with HTTP error."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download:
                mock_download.side_effect = httpx.HTTPStatusError(
                    "Not Found",
                    request=MagicMock(),
                    response=MagicMock(status_code=404),
                )
                input_data = ParsePdfInput(
                    pdf_path="https://example.com/nonexistent.pdf"
                )
                result = await parse_pdf_handler(input_data)

                assert result.success is False
                # Guard against None so ``in`` cannot raise a TypeError.
                assert result.error is not None
                assert "Failed to download PDF" in result.error

        asyncio.run(run_test())

    def test_url_download_network_error(self) -> None:
        """Test error handling when URL download fails with network error."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download:
                mock_download.side_effect = httpx.ConnectError("Connection refused")
                input_data = ParsePdfInput(
                    pdf_path="https://example.com/document.pdf"
                )
                result = await parse_pdf_handler(input_data)

                assert result.success is False
                # Guard against None so ``in`` cannot raise a TypeError.
                assert result.error is not None
                assert "Failed to download PDF" in result.error

        asyncio.run(run_test())
# =============================================================================
# Test Cost Tracking
# =============================================================================
class TestCostTracking:
    """Tests for cost tracking in parse_pdf output.

    Non-zero costs are compared with ``pytest.approx`` so the checks do not
    depend on exact binary floating-point results, in particular where the
    handler has to compute ``cost_total`` itself. Zero costs are compared
    exactly.
    """

    def test_costs_are_tracked_correctly(
        self,
        temp_pdf_file: Path,
    ) -> None:
        """Test that OCR and LLM costs are correctly tracked."""

        async def run_test() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as mock_process_pdf:
                mock_process_pdf.return_value = {
                    "success": True,
                    "document_name": "test-doc",
                    "source_id": "test-doc",
                    "pages": 50,
                    "chunks_count": 100,
                    "cost_ocr": 0.15,  # 50 pages * 0.003€
                    "cost_llm": 0.25,
                    "cost_total": 0.40,
                    "output_dir": Path("output/test-doc"),
                    "metadata": {},
                    "error": None,
                }
                input_data = ParsePdfInput(pdf_path=str(temp_pdf_file))
                result = await parse_pdf_handler(input_data)

                assert result.cost_ocr == pytest.approx(0.15)
                assert result.cost_llm == pytest.approx(0.25)
                assert result.cost_total == pytest.approx(0.40)

        asyncio.run(run_test())

    def test_cost_total_calculated_when_missing(
        self,
        temp_pdf_file: Path,
    ) -> None:
        """Test that cost_total is calculated if not provided."""

        async def run_test() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as mock_process_pdf:
                mock_process_pdf.return_value = {
                    "success": True,
                    "document_name": "test-doc",
                    "source_id": "test-doc",
                    "pages": 10,
                    "chunks_count": 20,
                    "cost_ocr": 0.03,
                    "cost_llm": 0.05,
                    # cost_total intentionally missing
                    "output_dir": Path("output/test-doc"),
                    "metadata": {},
                    "error": None,
                }
                input_data = ParsePdfInput(pdf_path=str(temp_pdf_file))
                result = await parse_pdf_handler(input_data)

                # Expected to be the sum of the two costs (0.03 + 0.05);
                # approx keeps the check independent of float rounding.
                assert result.cost_total == pytest.approx(0.08)

        asyncio.run(run_test())

    def test_zero_costs_on_failure(
        self,
        temp_pdf_file: Path,
    ) -> None:
        """Test that costs are zero when processing fails early."""

        async def run_test() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as mock_process_pdf:
                mock_process_pdf.side_effect = RuntimeError("Early failure")
                input_data = ParsePdfInput(pdf_path=str(temp_pdf_file))
                result = await parse_pdf_handler(input_data)

                assert result.success is False
                assert result.cost_ocr == 0.0
                assert result.cost_llm == 0.0
                assert result.cost_total == 0.0

        asyncio.run(run_test())
# =============================================================================
# Test Metadata Handling
# =============================================================================
class TestMetadataHandling:
    """Checks how the handler exposes the pipeline's ``metadata`` entry."""

    def test_metadata_extracted_correctly(
        self,
        temp_pdf_file: Path,
    ) -> None:
        """Each metadata value returned by the pipeline appears on the output."""
        expected_metadata: Dict[str, Any] = {
            "title": "Ménon",
            "author": "Platon",
            "language": "fr",
            "year": -380,
            "genre": "dialogue",
        }

        async def scenario() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = {
                    "success": True,
                    "document_name": "platon-menon",
                    "source_id": "platon-menon",
                    "pages": 80,
                    "chunks_count": 150,
                    "cost_ocr": 0.24,
                    "cost_llm": 0.30,
                    "cost_total": 0.54,
                    "output_dir": Path("output/platon-menon"),
                    # Hand the handler its own copy of the expected values.
                    "metadata": dict(expected_metadata),
                    "error": None,
                }
                request = ParsePdfInput(pdf_path=str(temp_pdf_file))
                outcome = await parse_pdf_handler(request)

                for key, value in expected_metadata.items():
                    assert outcome.metadata[key] == value

        asyncio.run(scenario())

    def test_empty_metadata_handled(
        self,
        temp_pdf_file: Path,
    ) -> None:
        """A pipeline ``metadata`` of ``None`` comes back as an empty dict."""

        async def scenario() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = {
                    "success": True,
                    "document_name": "test-doc",
                    "source_id": "test-doc",
                    "pages": 10,
                    "chunks_count": 20,
                    "cost_ocr": 0.03,
                    "cost_llm": 0.05,
                    "cost_total": 0.08,
                    "output_dir": Path("output/test-doc"),
                    "metadata": None,  # deliberately None rather than {}
                    "error": None,
                }
                request = ParsePdfInput(pdf_path=str(temp_pdf_file))
                outcome = await parse_pdf_handler(request)

                assert outcome.metadata == {}

        asyncio.run(scenario())
# =============================================================================
# Test Output Schema Validation
# =============================================================================
class TestOutputSchemaValidation:
    """Checks that the handler always returns a ``ParsePdfOutput`` instance."""

    def test_output_is_valid_schema(
        self,
        temp_pdf_file: Path,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """A successful run returns a ``ParsePdfOutput`` exposing every expected field."""
        expected_fields = (
            "success",
            "document_name",
            "source_id",
            "pages",
            "chunks_count",
            "cost_ocr",
            "cost_llm",
            "cost_total",
            "output_dir",
            "metadata",
            "error",
        )

        async def scenario() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = successful_pipeline_result
                request = ParsePdfInput(pdf_path=str(temp_pdf_file))
                outcome = await parse_pdf_handler(request)

                # The type check comes first, then attribute presence.
                assert isinstance(outcome, ParsePdfOutput)
                for field_name in expected_fields:
                    assert hasattr(outcome, field_name)

        asyncio.run(scenario())

    def test_error_output_is_valid_schema(self) -> None:
        """A failing run still returns a ``ParsePdfOutput`` with a string ``error``."""

        async def scenario() -> None:
            request = ParsePdfInput(pdf_path="/nonexistent/file.pdf")
            outcome = await parse_pdf_handler(request)

            assert isinstance(outcome, ParsePdfOutput)
            assert outcome.success is False
            assert outcome.error is not None
            assert isinstance(outcome.error, str)

        asyncio.run(scenario())