Add Library RAG project and cleanup root directory

- Add complete Library RAG application (Flask + MCP server)
  - PDF processing pipeline with OCR and LLM extraction
  - Weaviate vector database integration (BGE-M3 embeddings)
  - Flask web interface with search and document management
  - MCP server for Claude Desktop integration
  - Comprehensive test suite (134 tests)

- Clean up root directory
  - Remove obsolete documentation files
  - Remove backup and temporary files
  - Update autonomous agent configuration

- Update prompts
  - Enhance the secondary ("bis") initializer prompt with clearer instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-30 11:57:12 +01:00
parent 48470236da
commit d2f7165120
84 changed files with 26517 additions and 2 deletions

View File

@@ -0,0 +1 @@
"""MCP server unit tests."""

View File

@@ -0,0 +1,196 @@
"""
Pytest fixtures for MCP server tests.
Provides common fixtures for mocking dependencies and test data.
"""
import os
from pathlib import Path
from typing import Any, Dict, Generator
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from mcp_config import MCPConfig
@pytest.fixture
def mock_env_with_api_key() -> Generator[Dict[str, str], None, None]:
    """
    Provide an environment in which MISTRAL_API_KEY is defined.

    Yields:
        Dictionary of the environment variables that were installed.
    """
    test_env = {"MISTRAL_API_KEY": "test-api-key-12345"}
    # clear=True wipes the real environment so the test is hermetic.
    with patch.dict(os.environ, test_env, clear=True):
        yield test_env
@pytest.fixture
def valid_config() -> MCPConfig:
    """
    Provide a valid MCPConfig instance for testing.

    Returns:
        MCPConfig populated with known-good test values.
    """
    settings = dict(
        mistral_api_key="test-api-key",
        ollama_base_url="http://localhost:11434",
        structure_llm_model="test-model",
        structure_llm_temperature=0.2,
        default_llm_provider="ollama",
        weaviate_host="localhost",
        weaviate_port=8080,
        log_level="INFO",
        output_dir=Path("test_output"),
    )
    return MCPConfig(**settings)
@pytest.fixture
def mock_weaviate_client() -> Generator[MagicMock, None, None]:
    """
    Provide a mocked Weaviate client.

    Yields:
        MagicMock standing in for the object returned by
        weaviate.connect_to_local().
    """
    fake_client = MagicMock()
    with patch("weaviate.connect_to_local", return_value=fake_client):
        yield fake_client
# =============================================================================
# Parsing Tools Fixtures
# =============================================================================
@pytest.fixture
def sample_pdf_bytes() -> bytes:
    """
    Provide minimal valid PDF bytes for testing.

    The document describes a single empty page. NOTE(review): the xref
    byte offsets may not be byte-exact — the tests treat this value as
    opaque PDF-like bytes and never parse it with a real PDF reader.

    Returns:
        Bytes representing a minimal valid PDF file.
    """
    # Minimal valid PDF structure.  The literal's interior lines are part
    # of the bytes payload and therefore intentionally unindented.
    return b"""%PDF-1.4
1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj
2 0 obj << /Type /Pages /Kids [3 0 R] /Count 1 >> endobj
3 0 obj << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >> endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
trailer << /Size 4 /Root 1 0 R >>
startxref
193
%%EOF"""
@pytest.fixture
def successful_pipeline_result() -> Dict[str, Any]:
    """
    Provide a successful pipeline result for mocking.

    Returns:
        Dictionary mimicking a successful process_pdf result.
    """
    extracted_metadata = {
        "title": "Test Document Title",
        "author": "Test Author",
        "language": "en",
        "year": 2023,
    }
    return {
        "success": True,
        "document_name": "test-document",
        "source_id": "test-document",
        "pages": 10,
        "chunks_count": 25,
        "cost_ocr": 0.03,
        "cost_llm": 0.05,
        "cost_total": 0.08,
        "output_dir": Path("output/test-document"),
        "metadata": extracted_metadata,
        "error": None,
    }
@pytest.fixture
def failed_pipeline_result() -> Dict[str, Any]:
    """
    Provide a failed pipeline result for mocking.

    Returns:
        Dictionary mimicking a failed process_pdf result.
    """
    failure_reason = "OCR processing failed: Invalid PDF structure"
    result: Dict[str, Any] = {
        "success": False,
        "document_name": "failed-document",
        "source_id": "failed-document",
        "pages": 0,
        "chunks_count": 0,
        "cost_ocr": 0.0,
        "cost_llm": 0.0,
        "cost_total": 0.0,
        "output_dir": "",
        "metadata": {},
        "error": failure_reason,
    }
    return result
@pytest.fixture
def mock_process_pdf() -> Generator[MagicMock, None, None]:
    """
    Provide a mocked process_pdf function.

    Yields:
        MagicMock for utils.pdf_pipeline.process_pdf.
    """
    patcher = patch("mcp_tools.parsing_tools.process_pdf")
    mocked = patcher.start()
    try:
        yield mocked
    finally:
        patcher.stop()
@pytest.fixture
def mock_process_pdf_bytes() -> Generator[MagicMock, None, None]:
    """
    Provide a mocked process_pdf_bytes function.

    Yields:
        MagicMock for utils.pdf_pipeline.process_pdf_bytes.
    """
    target = "mcp_tools.parsing_tools.process_pdf_bytes"
    with patch(target) as patched:
        yield patched
@pytest.fixture
def mock_download_pdf() -> Generator[AsyncMock, None, None]:
    """
    Provide a mocked download_pdf function.

    Yields:
        AsyncMock for mcp_tools.parsing_tools.download_pdf.
    """
    target = "mcp_tools.parsing_tools.download_pdf"
    # AsyncMock so the handler can await the patched callable.
    with patch(target, new_callable=AsyncMock) as patched_download:
        yield patched_download
@pytest.fixture
def temp_pdf_file(tmp_path: Path, sample_pdf_bytes: bytes) -> Path:
    """
    Create a temporary PDF file for testing.

    Args:
        tmp_path: Pytest tmp_path fixture.
        sample_pdf_bytes: Sample PDF content.

    Returns:
        Path to the temporary PDF file.
    """
    destination = tmp_path.joinpath("test_document.pdf")
    destination.write_bytes(sample_pdf_bytes)
    return destination

View File

@@ -0,0 +1,133 @@
"""
Unit tests for MCP configuration management.
Tests the MCPConfig class for proper loading, validation, and defaults.
"""
import os
import pytest
from pathlib import Path
from unittest.mock import patch
from mcp_config import MCPConfig
class TestMCPConfigFromEnv:
    """Test MCPConfig.from_env() method."""

    def test_loads_with_required_key(self) -> None:
        """Config loads when MISTRAL_API_KEY is present."""
        env = {"MISTRAL_API_KEY": "test-key-123"}
        with patch.dict(os.environ, env, clear=True):
            loaded = MCPConfig.from_env()
        assert loaded.mistral_api_key == "test-key-123"

    def test_raises_without_api_key(self) -> None:
        """ValueError is raised when MISTRAL_API_KEY is missing."""
        # Patch load_dotenv so a developer's local .env cannot leak a key in.
        with patch("mcp_config.load_dotenv"):
            with patch.dict(os.environ, {}, clear=True):
                with pytest.raises(ValueError) as exc_info:
                    MCPConfig.from_env()
        assert "MISTRAL_API_KEY" in str(exc_info.value)

    def test_default_values_applied(self) -> None:
        """All default values are applied when only the key is set."""
        expected_defaults = {
            "ollama_base_url": "http://localhost:11434",
            "structure_llm_model": "deepseek-r1:14b",
            "structure_llm_temperature": 0.2,
            "default_llm_provider": "ollama",
            "weaviate_host": "localhost",
            "weaviate_port": 8080,
            "log_level": "INFO",
            "output_dir": Path("output"),
        }
        with patch.dict(os.environ, {"MISTRAL_API_KEY": "test-key"}, clear=True):
            config = MCPConfig.from_env()
        for attr, expected in expected_defaults.items():
            assert getattr(config, attr) == expected, attr

    def test_custom_values_loaded(self) -> None:
        """Custom environment values override every default."""
        custom_env = {
            "MISTRAL_API_KEY": "custom-key",
            "OLLAMA_BASE_URL": "http://custom:1234",
            "STRUCTURE_LLM_MODEL": "custom-model",
            "STRUCTURE_LLM_TEMPERATURE": "0.7",
            "DEFAULT_LLM_PROVIDER": "mistral",
            "WEAVIATE_HOST": "weaviate.example.com",
            "WEAVIATE_PORT": "9999",
            "LOG_LEVEL": "DEBUG",
            "OUTPUT_DIR": "/custom/output",
        }
        # Expected attribute values, including str -> float/int/Path coercions.
        expected_attrs = {
            "mistral_api_key": "custom-key",
            "ollama_base_url": "http://custom:1234",
            "structure_llm_model": "custom-model",
            "structure_llm_temperature": 0.7,
            "default_llm_provider": "mistral",
            "weaviate_host": "weaviate.example.com",
            "weaviate_port": 9999,
            "log_level": "DEBUG",
            "output_dir": Path("/custom/output"),
        }
        with patch.dict(os.environ, custom_env, clear=True):
            config = MCPConfig.from_env()
        for attr, expected in expected_attrs.items():
            assert getattr(config, attr) == expected, attr
class TestMCPConfigValidation:
    """Test MCPConfig.validate() method."""

    def test_valid_config_passes(self) -> None:
        """A fully valid configuration validates without raising."""
        MCPConfig(
            mistral_api_key="test-key",
            default_llm_provider="ollama",
            log_level="INFO",
            structure_llm_temperature=0.5,
        ).validate()

    def test_invalid_llm_provider_fails(self) -> None:
        """An unknown LLM provider is rejected with a descriptive message."""
        bad_config = MCPConfig(
            mistral_api_key="test-key",
            default_llm_provider="invalid",  # type: ignore
        )
        # match= performs a regex search over str(exception).
        with pytest.raises(ValueError, match="Invalid LLM provider"):
            bad_config.validate()

    def test_invalid_log_level_fails(self) -> None:
        """An unknown log level is rejected with a descriptive message."""
        bad_config = MCPConfig(
            mistral_api_key="test-key",
            log_level="INVALID",
        )
        with pytest.raises(ValueError, match="Invalid log level"):
            bad_config.validate()

    def test_invalid_temperature_fails(self) -> None:
        """A temperature outside the 0-2 range is rejected."""
        bad_config = MCPConfig(
            mistral_api_key="test-key",
            structure_llm_temperature=2.5,
        )
        with pytest.raises(ValueError, match="Invalid temperature"):
            bad_config.validate()
class TestMCPConfigProperties:
    """Test MCPConfig properties."""

    def test_weaviate_url_property(self) -> None:
        """weaviate_url combines host and port into an HTTP URL."""
        host, port = "my-host", 9090
        config = MCPConfig(
            mistral_api_key="test-key",
            weaviate_host=host,
            weaviate_port=port,
        )
        assert config.weaviate_url == f"http://{host}:{port}"
# Allow running this test module directly (without the pytest CLI),
# e.g. `python test_mcp_config.py` runs the suite verbosely.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])

View File

@@ -0,0 +1,673 @@
"""
Unit tests for MCP parsing tools.
Tests the parse_pdf tool handler with mocked dependencies to ensure:
- Local file processing works correctly
- URL-based PDF downloads work correctly
- Error handling is comprehensive
- Fixed parameters are used correctly
- Cost tracking is accurate
Uses asyncio for async test support.
"""
import asyncio
from pathlib import Path
from typing import Any, Dict
from unittest.mock import AsyncMock, MagicMock, patch
import httpx
import pytest
from mcp_tools.parsing_tools import (
FIXED_LLM_MODEL,
FIXED_LLM_PROVIDER,
FIXED_USE_LLM,
FIXED_USE_OCR_ANNOTATIONS,
FIXED_USE_SEMANTIC_CHUNKING,
download_pdf,
extract_filename_from_url,
is_url,
parse_pdf_handler,
)
from mcp_tools.schemas import ParsePdfInput, ParsePdfOutput
# =============================================================================
# Test is_url Helper Function
# =============================================================================
class TestIsUrl:
    """Tests for the is_url helper function."""

    def test_https_url(self) -> None:
        """HTTPS URLs are recognized."""
        recognized = is_url("https://example.com/document.pdf")
        assert recognized is True

    def test_http_url(self) -> None:
        """HTTP URLs are recognized."""
        recognized = is_url("http://example.com/document.pdf")
        assert recognized is True

    def test_local_path_unix(self) -> None:
        """Unix local paths are not recognized as URLs."""
        recognized = is_url("/path/to/document.pdf")
        assert recognized is False

    def test_local_path_windows(self) -> None:
        """Windows local paths are not recognized as URLs."""
        recognized = is_url("C:\\Documents\\document.pdf")
        assert recognized is False

    def test_relative_path(self) -> None:
        """Relative paths are not recognized as URLs."""
        recognized = is_url("./documents/document.pdf")
        assert recognized is False

    def test_ftp_url_not_supported(self) -> None:
        """FTP URLs are not recognized (only HTTP/HTTPS supported)."""
        recognized = is_url("ftp://example.com/document.pdf")
        assert recognized is False

    def test_empty_string(self) -> None:
        """Empty strings are not recognized as URLs."""
        recognized = is_url("")
        assert recognized is False
# =============================================================================
# Test extract_filename_from_url Helper Function
# =============================================================================
class TestExtractFilenameFromUrl:
    """Tests for the extract_filename_from_url helper function."""

    def test_url_with_pdf_filename(self) -> None:
        """A .pdf filename in the URL path is returned as-is."""
        filename = extract_filename_from_url(
            "https://example.com/docs/aristotle.pdf"
        )
        assert filename == "aristotle.pdf"

    def test_url_with_filename_no_extension(self) -> None:
        """A filename without extension gets .pdf appended."""
        filename = extract_filename_from_url("https://example.com/docs/aristotle")
        assert filename == "aristotle.pdf"

    def test_url_without_path(self) -> None:
        """A URL with no path falls back to the default name."""
        filename = extract_filename_from_url("https://example.com/")
        assert filename == "downloaded.pdf"

    def test_url_with_api_endpoint(self) -> None:
        """An extension-less API endpoint name gets .pdf appended."""
        filename = extract_filename_from_url("https://api.example.com/download")
        assert filename == "download.pdf"

    def test_url_with_query_params(self) -> None:
        """Query parameters are stripped before extracting the filename."""
        filename = extract_filename_from_url(
            "https://example.com/docs/kant.pdf?token=abc"
        )
        assert filename == "kant.pdf"
# =============================================================================
# Test download_pdf Function
# =============================================================================
class TestDownloadPdf:
    """Tests for the download_pdf async function.

    httpx.AsyncClient is patched at the module under test, so no real
    network traffic occurs.  The async context-manager wiring and the
    mock response construction that all three tests previously repeated
    verbatim are factored into the two private helpers below.
    """

    @staticmethod
    def _wire_async_client(
        mock_client_class: MagicMock, mock_client: AsyncMock
    ) -> None:
        """Make `async with AsyncClient() as c:` yield *mock_client*."""
        mock_client_class.return_value.__aenter__ = AsyncMock(
            return_value=mock_client
        )
        mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)

    @staticmethod
    def _make_response(content_type: str) -> MagicMock:
        """Build a mock HTTP response carrying minimal PDF bytes."""
        mock_response = MagicMock()
        mock_response.content = b"%PDF-1.4 test content"
        mock_response.headers = {"content-type": content_type}
        mock_response.raise_for_status = MagicMock()
        return mock_response

    def test_successful_download(self) -> None:
        """Test successful PDF download from URL."""

        async def run_test() -> None:
            mock_response = self._make_response("application/pdf")
            with patch(
                "mcp_tools.parsing_tools.httpx.AsyncClient"
            ) as mock_client_class:
                mock_client = AsyncMock()
                mock_client.get = AsyncMock(return_value=mock_response)
                self._wire_async_client(mock_client_class, mock_client)
                result = await download_pdf("https://example.com/document.pdf")
                assert result == b"%PDF-1.4 test content"
                # The exact URL passed in must be the one fetched.
                mock_client.get.assert_called_once_with(
                    "https://example.com/document.pdf"
                )

        asyncio.run(run_test())

    def test_download_with_non_pdf_content_type(self) -> None:
        """Test download proceeds with warning when content-type is not PDF."""

        async def run_test() -> None:
            mock_response = self._make_response("application/octet-stream")
            with patch(
                "mcp_tools.parsing_tools.httpx.AsyncClient"
            ) as mock_client_class:
                mock_client = AsyncMock()
                mock_client.get = AsyncMock(return_value=mock_response)
                self._wire_async_client(mock_client_class, mock_client)
                # Should still succeed, just logs a warning.
                result = await download_pdf("https://example.com/document.pdf")
                assert result == b"%PDF-1.4 test content"

        asyncio.run(run_test())

    def test_download_http_error(self) -> None:
        """Test that HTTP errors are propagated."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.httpx.AsyncClient"
            ) as mock_client_class:
                mock_client = AsyncMock()
                mock_client.get = AsyncMock(
                    side_effect=httpx.HTTPStatusError(
                        "Not Found",
                        request=MagicMock(),
                        response=MagicMock(status_code=404),
                    )
                )
                self._wire_async_client(mock_client_class, mock_client)
                with pytest.raises(httpx.HTTPStatusError):
                    await download_pdf("https://example.com/nonexistent.pdf")

        asyncio.run(run_test())
# =============================================================================
# Test parse_pdf_handler - Local Files
# =============================================================================
class TestParsePdfHandlerLocalFile:
    """Tests for parse_pdf_handler with local file paths."""

    def test_successful_local_file_processing(
        self,
        temp_pdf_file: Path,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test successful processing of a local PDF file."""

        async def scenario() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = successful_pipeline_result
                result = await parse_pdf_handler(
                    ParsePdfInput(pdf_path=str(temp_pdf_file))
                )
                assert result.success is True
                assert result.document_name == "test-document"
                assert result.pages == 10
                assert result.chunks_count == 25
                assert result.cost_ocr == 0.03
                assert result.cost_llm == 0.05
                assert result.cost_total == 0.08
                assert result.metadata["title"] == "Test Document Title"
                assert result.error is None

        asyncio.run(scenario())

    def test_local_file_uses_fixed_parameters(
        self,
        temp_pdf_file: Path,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test that local file processing uses the fixed optimal parameters."""

        async def scenario() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = successful_pipeline_result
                await parse_pdf_handler(ParsePdfInput(pdf_path=str(temp_pdf_file)))
                pipeline.assert_called_once()
                passed = pipeline.call_args.kwargs
                # Every fixed parameter must reach the pipeline unchanged.
                fixed = {
                    "use_llm": FIXED_USE_LLM,
                    "llm_provider": FIXED_LLM_PROVIDER,
                    "llm_model": FIXED_LLM_MODEL,
                    "use_semantic_chunking": FIXED_USE_SEMANTIC_CHUNKING,
                    "use_ocr_annotations": FIXED_USE_OCR_ANNOTATIONS,
                }
                for name, value in fixed.items():
                    assert passed[name] == value, name

        asyncio.run(scenario())

    def test_file_not_found_error(self) -> None:
        """Test error handling when local file does not exist."""

        async def scenario() -> None:
            outcome = await parse_pdf_handler(
                ParsePdfInput(pdf_path="/nonexistent/path/document.pdf")
            )
            assert outcome.success is False
            assert "not found" in outcome.error.lower()
            assert outcome.pages == 0
            assert outcome.chunks_count == 0

        asyncio.run(scenario())

    def test_pipeline_failure(
        self,
        temp_pdf_file: Path,
        failed_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test handling when the pipeline returns a failure."""

        async def scenario() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = failed_pipeline_result
                outcome = await parse_pdf_handler(
                    ParsePdfInput(pdf_path=str(temp_pdf_file))
                )
                assert outcome.success is False
                assert "OCR processing failed" in outcome.error
                assert outcome.pages == 0

        asyncio.run(scenario())

    def test_pipeline_exception(self, temp_pdf_file: Path) -> None:
        """Test handling when the pipeline raises an exception."""

        async def scenario() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.side_effect = RuntimeError("Unexpected error")
                outcome = await parse_pdf_handler(
                    ParsePdfInput(pdf_path=str(temp_pdf_file))
                )
                assert outcome.success is False
                assert "Processing error" in outcome.error
                assert "Unexpected error" in outcome.error

        asyncio.run(scenario())
# =============================================================================
# Test parse_pdf_handler - URL Downloads
# =============================================================================
class TestParsePdfHandlerUrl:
    """Tests for parse_pdf_handler with URL inputs.

    Each test patches both the network download (download_pdf) and the
    processing pipeline (process_pdf_bytes) at the module under test, so
    no real HTTP requests or PDF processing occur.
    """

    def test_successful_url_processing(
        self,
        sample_pdf_bytes: bytes,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test successful processing of a PDF from URL."""

        async def run_test() -> None:
            # download_pdf is async, hence AsyncMock.
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download:
                with patch(
                    "mcp_tools.parsing_tools.process_pdf_bytes"
                ) as mock_process:
                    mock_download.return_value = sample_pdf_bytes
                    mock_process.return_value = successful_pipeline_result
                    input_data = ParsePdfInput(
                        pdf_path="https://example.com/philosophy/kant.pdf"
                    )
                    result = await parse_pdf_handler(input_data)
                    assert result.success is True
                    assert result.document_name == "test-document"
                    # The handler must fetch exactly the URL it was given.
                    mock_download.assert_called_once_with(
                        "https://example.com/philosophy/kant.pdf"
                    )

        asyncio.run(run_test())

    def test_url_uses_extracted_filename(
        self,
        sample_pdf_bytes: bytes,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test that filename is extracted from URL for processing."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download:
                with patch(
                    "mcp_tools.parsing_tools.process_pdf_bytes"
                ) as mock_process:
                    mock_download.return_value = sample_pdf_bytes
                    mock_process.return_value = successful_pipeline_result
                    input_data = ParsePdfInput(
                        pdf_path="https://example.com/docs/aristotle-metaphysics.pdf"
                    )
                    await parse_pdf_handler(input_data)
                    # Verify filename was extracted and passed
                    mock_process.assert_called_once()
                    call_kwargs = mock_process.call_args.kwargs
                    assert call_kwargs["filename"] == "aristotle-metaphysics.pdf"

        asyncio.run(run_test())

    def test_url_uses_fixed_parameters(
        self,
        sample_pdf_bytes: bytes,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test that URL processing uses the fixed optimal parameters."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download:
                with patch(
                    "mcp_tools.parsing_tools.process_pdf_bytes"
                ) as mock_process:
                    mock_download.return_value = sample_pdf_bytes
                    mock_process.return_value = successful_pipeline_result
                    input_data = ParsePdfInput(
                        pdf_path="https://example.com/document.pdf"
                    )
                    await parse_pdf_handler(input_data)
                    # The module-level FIXED_* constants must reach the
                    # pipeline call unchanged.
                    call_kwargs = mock_process.call_args.kwargs
                    assert call_kwargs["llm_provider"] == FIXED_LLM_PROVIDER
                    assert call_kwargs["llm_model"] == FIXED_LLM_MODEL
                    assert (
                        call_kwargs["use_semantic_chunking"]
                        == FIXED_USE_SEMANTIC_CHUNKING
                    )
                    assert (
                        call_kwargs["use_ocr_annotations"] == FIXED_USE_OCR_ANNOTATIONS
                    )

        asyncio.run(run_test())

    def test_url_download_http_error(self) -> None:
        """Test error handling when URL download fails with HTTP error."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download:
                # Simulate a 404 raised by the download step.
                mock_download.side_effect = httpx.HTTPStatusError(
                    "Not Found",
                    request=MagicMock(),
                    response=MagicMock(status_code=404),
                )
                input_data = ParsePdfInput(
                    pdf_path="https://example.com/nonexistent.pdf"
                )
                result = await parse_pdf_handler(input_data)
                assert result.success is False
                assert "Failed to download PDF" in result.error

        asyncio.run(run_test())

    def test_url_download_network_error(self) -> None:
        """Test error handling when URL download fails with network error."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download:
                mock_download.side_effect = httpx.ConnectError("Connection refused")
                input_data = ParsePdfInput(
                    pdf_path="https://example.com/document.pdf"
                )
                result = await parse_pdf_handler(input_data)
                assert result.success is False
                assert "Failed to download PDF" in result.error

        asyncio.run(run_test())
# =============================================================================
# Test Cost Tracking
# =============================================================================
class TestCostTracking:
    """Tests for cost tracking in parse_pdf output."""

    def test_costs_are_tracked_correctly(self, temp_pdf_file: Path) -> None:
        """Test that OCR and LLM costs are correctly tracked."""

        async def scenario() -> None:
            pipeline_result: Dict[str, Any] = {
                "success": True,
                "document_name": "test-doc",
                "source_id": "test-doc",
                "pages": 50,
                "chunks_count": 100,
                "cost_ocr": 0.15,  # 50 pages * 0.003€
                "cost_llm": 0.25,
                "cost_total": 0.40,
                "output_dir": Path("output/test-doc"),
                "metadata": {},
                "error": None,
            }
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = pipeline_result
                result = await parse_pdf_handler(
                    ParsePdfInput(pdf_path=str(temp_pdf_file))
                )
                assert result.cost_ocr == 0.15
                assert result.cost_llm == 0.25
                assert result.cost_total == 0.40

        asyncio.run(scenario())

    def test_cost_total_calculated_when_missing(self, temp_pdf_file: Path) -> None:
        """Test that cost_total is calculated if not provided."""

        async def scenario() -> None:
            pipeline_result: Dict[str, Any] = {
                "success": True,
                "document_name": "test-doc",
                "source_id": "test-doc",
                "pages": 10,
                "chunks_count": 20,
                "cost_ocr": 0.03,
                "cost_llm": 0.05,
                # cost_total intentionally missing
                "output_dir": Path("output/test-doc"),
                "metadata": {},
                "error": None,
            }
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = pipeline_result
                result = await parse_pdf_handler(
                    ParsePdfInput(pdf_path=str(temp_pdf_file))
                )
                assert result.cost_total == 0.08  # 0.03 + 0.05

        asyncio.run(scenario())

    def test_zero_costs_on_failure(self, temp_pdf_file: Path) -> None:
        """Test that costs are zero when processing fails early."""

        async def scenario() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.side_effect = RuntimeError("Early failure")
                result = await parse_pdf_handler(
                    ParsePdfInput(pdf_path=str(temp_pdf_file))
                )
                assert result.success is False
                assert result.cost_ocr == 0.0
                assert result.cost_llm == 0.0
                assert result.cost_total == 0.0

        asyncio.run(scenario())
# =============================================================================
# Test Metadata Handling
# =============================================================================
class TestMetadataHandling:
    """Tests for metadata extraction and handling."""

    def test_metadata_extracted_correctly(self, temp_pdf_file: Path) -> None:
        """Test that metadata is correctly passed through."""

        async def scenario() -> None:
            document_metadata = {
                "title": "Ménon",
                "author": "Platon",
                "language": "fr",
                "year": -380,
                "genre": "dialogue",
            }
            pipeline_result: Dict[str, Any] = {
                "success": True,
                "document_name": "platon-menon",
                "source_id": "platon-menon",
                "pages": 80,
                "chunks_count": 150,
                "cost_ocr": 0.24,
                "cost_llm": 0.30,
                "cost_total": 0.54,
                "output_dir": Path("output/platon-menon"),
                "metadata": document_metadata,
                "error": None,
            }
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = pipeline_result
                result = await parse_pdf_handler(
                    ParsePdfInput(pdf_path=str(temp_pdf_file))
                )
                # Every metadata entry must survive the round trip intact.
                for key, value in document_metadata.items():
                    assert result.metadata[key] == value, key

        asyncio.run(scenario())

    def test_empty_metadata_handled(self, temp_pdf_file: Path) -> None:
        """Test that empty/None metadata is handled gracefully."""

        async def scenario() -> None:
            pipeline_result: Dict[str, Any] = {
                "success": True,
                "document_name": "test-doc",
                "source_id": "test-doc",
                "pages": 10,
                "chunks_count": 20,
                "cost_ocr": 0.03,
                "cost_llm": 0.05,
                "cost_total": 0.08,
                "output_dir": Path("output/test-doc"),
                "metadata": None,  # Explicitly None
                "error": None,
            }
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = pipeline_result
                result = await parse_pdf_handler(
                    ParsePdfInput(pdf_path=str(temp_pdf_file))
                )
                assert result.metadata == {}

        asyncio.run(scenario())
# =============================================================================
# Test Output Schema Validation
# =============================================================================
class TestOutputSchemaValidation:
    """Tests for ParsePdfOutput schema compliance."""

    def test_output_is_valid_schema(
        self,
        temp_pdf_file: Path,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test that output conforms to ParsePdfOutput schema."""

        async def scenario() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = successful_pipeline_result
                result = await parse_pdf_handler(
                    ParsePdfInput(pdf_path=str(temp_pdf_file))
                )
                # Verify it's the correct type.
                assert isinstance(result, ParsePdfOutput)
                # Verify all required fields are present.
                required_fields = (
                    "success",
                    "document_name",
                    "source_id",
                    "pages",
                    "chunks_count",
                    "cost_ocr",
                    "cost_llm",
                    "cost_total",
                    "output_dir",
                    "metadata",
                    "error",
                )
                for field_name in required_fields:
                    assert hasattr(result, field_name), field_name

        asyncio.run(scenario())

    def test_error_output_is_valid_schema(self) -> None:
        """Test that error output conforms to ParsePdfOutput schema."""

        async def scenario() -> None:
            result = await parse_pdf_handler(
                ParsePdfInput(pdf_path="/nonexistent/file.pdf")
            )
            assert isinstance(result, ParsePdfOutput)
            assert result.success is False
            assert result.error is not None
            assert isinstance(result.error, str)

        asyncio.run(scenario())

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,256 @@
"""
Unit tests for MCP Pydantic schemas.
Tests schema validation, field constraints, and JSON schema generation.
"""
import pytest
from pydantic import ValidationError
from mcp_tools.schemas import (
ParsePdfInput,
ParsePdfOutput,
SearchChunksInput,
SearchChunksOutput,
SearchSummariesInput,
GetDocumentInput,
ListDocumentsInput,
GetChunksByDocumentInput,
FilterByAuthorInput,
DeleteDocumentInput,
ChunkResult,
DocumentInfo,
)
class TestParsePdfInput:
    """Test ParsePdfInput schema validation."""

    def test_valid_path(self) -> None:
        """A filesystem path is accepted and stored verbatim."""
        path = "/path/to/document.pdf"
        assert ParsePdfInput(pdf_path=path).pdf_path == path

    def test_valid_url(self) -> None:
        """An HTTPS URL is accepted and stored verbatim."""
        url = "https://example.com/doc.pdf"
        assert ParsePdfInput(pdf_path=url).pdf_path == url

    def test_empty_path_rejected(self) -> None:
        """An empty path fails the minimum-length constraint."""
        with pytest.raises(ValidationError) as exc_info:
            ParsePdfInput(pdf_path="")
        assert "string_too_short" in str(exc_info.value).lower()
class TestParsePdfOutput:
    """Test ParsePdfOutput schema."""

    def test_full_output(self) -> None:
        """A complete, successful output stores its field values."""
        values = dict(
            success=True,
            document_name="test-doc",
            source_id="test-doc-v1",
            pages=10,
            chunks_count=25,
            cost_ocr=0.03,
            cost_llm=0.01,
            cost_total=0.04,
            output_dir="/output/test-doc",
            metadata={"title": "Test", "author": "Unknown"},
        )
        output = ParsePdfOutput(**values)
        assert output.success is True
        assert output.cost_total == 0.04
        assert output.metadata["title"] == "Test"

    def test_output_with_error(self) -> None:
        """A failed output carries the error message and zeroed counters."""
        failed = ParsePdfOutput(
            success=False,
            document_name="failed-doc",
            source_id="",
            pages=0,
            chunks_count=0,
            cost_ocr=0.0,
            cost_llm=0.0,
            cost_total=0.0,
            output_dir="",
            error="PDF processing failed: corrupted file",
        )
        assert failed.success is False
        assert "corrupted" in failed.error  # type: ignore
class TestSearchChunksInput:
    """Test SearchChunksInput schema validation."""

    def test_minimal_input(self) -> None:
        """Only the query is required; other fields take their defaults."""
        minimal = SearchChunksInput(query="test query")
        assert minimal.query == "test query"
        assert minimal.limit == 10  # default
        assert minimal.min_similarity == 0.0  # default

    def test_full_input(self) -> None:
        """All optional filter fields are accepted and stored."""
        full = SearchChunksInput(
            query="What is justice?",
            limit=20,
            min_similarity=0.5,
            author_filter="Platon",
            work_filter="Republic",
            language_filter="fr",
        )
        assert full.limit == 20
        assert full.author_filter == "Platon"

    def test_empty_query_rejected(self) -> None:
        """An empty query fails validation."""
        with pytest.raises(ValidationError):
            SearchChunksInput(query="")

    def test_query_too_long_rejected(self) -> None:
        """A query over 1000 characters fails validation."""
        with pytest.raises(ValidationError):
            SearchChunksInput(query="a" * 1001)

    def test_limit_bounds(self) -> None:
        """Limits just outside the accepted range are rejected."""
        for bad_limit in (0, 101):
            with pytest.raises(ValidationError):
                SearchChunksInput(query="test", limit=bad_limit)

    def test_similarity_bounds(self) -> None:
        """Similarities just outside the accepted range are rejected."""
        for bad_similarity in (-0.1, 1.1):
            with pytest.raises(ValidationError):
                SearchChunksInput(query="test", min_similarity=bad_similarity)
class TestSearchSummariesInput:
    """Test SearchSummariesInput schema validation."""

    def test_level_filters(self) -> None:
        """min_level and max_level are stored as given."""
        filtered = SearchSummariesInput(query="test", min_level=1, max_level=3)
        assert filtered.min_level == 1
        assert filtered.max_level == 3

    def test_level_bounds(self) -> None:
        """Levels outside the allowed bounds are rejected."""
        with pytest.raises(ValidationError):
            SearchSummariesInput(query="test", min_level=0)
        with pytest.raises(ValidationError):
            SearchSummariesInput(query="test", max_level=6)
class TestGetDocumentInput:
    """Test GetDocumentInput schema validation."""

    def test_defaults(self) -> None:
        """include_chunks defaults to False and chunk_limit to 50."""
        defaults = GetDocumentInput(source_id="doc-123")
        assert defaults.include_chunks is False
        assert defaults.chunk_limit == 50

    def test_with_chunks(self) -> None:
        """Chunks can be requested explicitly with a custom limit."""
        request = GetDocumentInput(
            source_id="doc-123",
            include_chunks=True,
            chunk_limit=100,
        )
        assert request.include_chunks is True
        assert request.chunk_limit == 100
class TestDeleteDocumentInput:
    """Test DeleteDocumentInput schema validation."""

    def test_requires_confirmation(self) -> None:
        """confirm defaults to False so deletion is opt-in."""
        request = DeleteDocumentInput(source_id="doc-to-delete")
        assert request.confirm is False

    def test_with_confirmation(self) -> None:
        """confirm=True is accepted and stored."""
        request = DeleteDocumentInput(source_id="doc-to-delete", confirm=True)
        assert request.confirm is True
class TestChunkResult:
    """Test ChunkResult model."""

    def test_full_chunk(self) -> None:
        """All chunk fields are accepted and stored as given."""
        values = dict(
            text="This is the chunk content.",
            similarity=0.85,
            section_path="Chapter 1 > Section 1",
            chapter_title="Introduction",
            work_title="The Republic",
            work_author="Platon",
            order_index=5,
        )
        chunk = ChunkResult(**values)
        assert chunk.similarity == 0.85
        assert chunk.order_index == 5
class TestDocumentInfo:
    """Test DocumentInfo model."""

    def test_with_optional_fields(self) -> None:
        """Optional toc and hierarchy fields are preserved when supplied."""
        values = dict(
            source_id="platon-republic",
            work_title="The Republic",
            work_author="Platon",
            edition="GF Flammarion",
            pages=500,
            language="fr",
            toc={"chapters": ["I", "II", "III"]},
            hierarchy={"level": 1},
        )
        document = DocumentInfo(**values)
        assert document.toc is not None
        assert document.hierarchy is not None
class TestJsonSchemaGeneration:
    """Test JSON schema generation from Pydantic models."""

    def test_schemas_have_descriptions(self) -> None:
        """Every documented field carries a description in the JSON schema."""
        properties = SearchChunksInput.model_json_schema()["properties"]
        for field_name in ("query", "limit", "min_similarity"):
            assert "description" in properties[field_name], field_name

    def test_schema_includes_constraints(self) -> None:
        """Validation constraints are exported into the JSON schema."""
        props = SearchChunksInput.model_json_schema()["properties"]
        # String length constraints on the query field.
        assert props["query"].get("minLength") == 1
        assert props["query"].get("maxLength") == 1000
        # Numeric bounds on the limit field.
        assert props["limit"].get("minimum") == 1
        assert props["limit"].get("maximum") == 100
# Allow running this test module directly (without the pytest CLI),
# e.g. `python test_schemas.py` runs the suite verbosely.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])