Add Library RAG project and cleanup root directory

- Add complete Library RAG application (Flask + MCP server)
  - PDF processing pipeline with OCR and LLM extraction
  - Weaviate vector database integration (BGE-M3 embeddings)
  - Flask web interface with search and document management
  - MCP server for Claude Desktop integration
  - Comprehensive test suite (134 tests)

- Clean up root directory
  - Remove obsolete documentation files
  - Remove backup and temporary files
  - Update autonomous agent configuration

- Update prompts
  - Enhance the secondary ("bis") initializer prompt with clearer instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-30 11:57:12 +01:00
parent 48470236da
commit d2f7165120
84 changed files with 26517 additions and 2 deletions

View File

@@ -0,0 +1 @@
"""MCP server unit tests."""

View File

@@ -0,0 +1,196 @@
"""
Pytest fixtures for MCP server tests.
Provides common fixtures for mocking dependencies and test data.
"""
import os
from pathlib import Path
from typing import Any, Dict, Generator
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from mcp_config import MCPConfig
@pytest.fixture
def mock_env_with_api_key() -> Generator[Dict[str, str], None, None]:
    """
    Provide an environment in which MISTRAL_API_KEY is defined.

    Yields:
        Dictionary of the environment variables that were installed.
    """
    test_env = {"MISTRAL_API_KEY": "test-api-key-12345"}
    # clear=True wipes the real environment so the test is hermetic.
    with patch.dict(os.environ, test_env, clear=True):
        yield test_env
@pytest.fixture
def valid_config() -> MCPConfig:
    """
    Provide a valid MCPConfig instance for testing.

    Returns:
        MCPConfig populated with known-good test values.
    """
    settings = dict(
        mistral_api_key="test-api-key",
        ollama_base_url="http://localhost:11434",
        structure_llm_model="test-model",
        structure_llm_temperature=0.2,
        default_llm_provider="ollama",
        weaviate_host="localhost",
        weaviate_port=8080,
        log_level="INFO",
        output_dir=Path("test_output"),
    )
    return MCPConfig(**settings)
@pytest.fixture
def mock_weaviate_client() -> Generator[MagicMock, None, None]:
    """
    Provide a mocked Weaviate client.

    Yields:
        MagicMock standing in for the object returned by
        weaviate.connect_to_local().
    """
    fake_client = MagicMock()
    with patch("weaviate.connect_to_local", return_value=fake_client):
        yield fake_client
# =============================================================================
# Parsing Tools Fixtures
# =============================================================================
@pytest.fixture
def sample_pdf_bytes() -> bytes:
    """
    Provide minimal valid PDF bytes for testing.

    The document describes a single empty page. NOTE(review): the xref
    byte offsets may not be byte-exact — the tests treat this value as
    opaque PDF-like bytes and never parse it with a real PDF reader.

    Returns:
        Bytes representing a minimal valid PDF file.
    """
    # Minimal valid PDF structure.  The literal's interior lines are part
    # of the bytes payload and therefore intentionally unindented.
    return b"""%PDF-1.4
1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj
2 0 obj << /Type /Pages /Kids [3 0 R] /Count 1 >> endobj
3 0 obj << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >> endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
trailer << /Size 4 /Root 1 0 R >>
startxref
193
%%EOF"""
@pytest.fixture
def successful_pipeline_result() -> Dict[str, Any]:
    """
    Provide a successful pipeline result for mocking.

    Returns:
        Dictionary mimicking a successful process_pdf result.
    """
    extracted_metadata = {
        "title": "Test Document Title",
        "author": "Test Author",
        "language": "en",
        "year": 2023,
    }
    return {
        "success": True,
        "document_name": "test-document",
        "source_id": "test-document",
        "pages": 10,
        "chunks_count": 25,
        "cost_ocr": 0.03,
        "cost_llm": 0.05,
        "cost_total": 0.08,
        "output_dir": Path("output/test-document"),
        "metadata": extracted_metadata,
        "error": None,
    }
@pytest.fixture
def failed_pipeline_result() -> Dict[str, Any]:
    """
    Provide a failed pipeline result for mocking.

    Returns:
        Dictionary mimicking a failed process_pdf result.
    """
    failure_reason = "OCR processing failed: Invalid PDF structure"
    result: Dict[str, Any] = {
        "success": False,
        "document_name": "failed-document",
        "source_id": "failed-document",
        "pages": 0,
        "chunks_count": 0,
        "cost_ocr": 0.0,
        "cost_llm": 0.0,
        "cost_total": 0.0,
        "output_dir": "",
        "metadata": {},
        "error": failure_reason,
    }
    return result
@pytest.fixture
def mock_process_pdf() -> Generator[MagicMock, None, None]:
    """
    Provide a mocked process_pdf function.

    Yields:
        MagicMock for utils.pdf_pipeline.process_pdf.
    """
    patcher = patch("mcp_tools.parsing_tools.process_pdf")
    mocked = patcher.start()
    try:
        yield mocked
    finally:
        patcher.stop()
@pytest.fixture
def mock_process_pdf_bytes() -> Generator[MagicMock, None, None]:
    """
    Provide a mocked process_pdf_bytes function.

    Yields:
        MagicMock for utils.pdf_pipeline.process_pdf_bytes.
    """
    target = "mcp_tools.parsing_tools.process_pdf_bytes"
    with patch(target) as patched:
        yield patched
@pytest.fixture
def mock_download_pdf() -> Generator[AsyncMock, None, None]:
    """
    Provide a mocked download_pdf function.

    Yields:
        AsyncMock for mcp_tools.parsing_tools.download_pdf.
    """
    target = "mcp_tools.parsing_tools.download_pdf"
    # AsyncMock so the handler can await the patched callable.
    with patch(target, new_callable=AsyncMock) as patched_download:
        yield patched_download
@pytest.fixture
def temp_pdf_file(tmp_path: Path, sample_pdf_bytes: bytes) -> Path:
    """
    Create a temporary PDF file for testing.

    Args:
        tmp_path: Pytest tmp_path fixture.
        sample_pdf_bytes: Sample PDF content.

    Returns:
        Path to the temporary PDF file.
    """
    destination = tmp_path.joinpath("test_document.pdf")
    destination.write_bytes(sample_pdf_bytes)
    return destination

View File

@@ -0,0 +1,133 @@
"""
Unit tests for MCP configuration management.
Tests the MCPConfig class for proper loading, validation, and defaults.
"""
import os
import pytest
from pathlib import Path
from unittest.mock import patch
from mcp_config import MCPConfig
class TestMCPConfigFromEnv:
    """Test MCPConfig.from_env() method."""

    def test_loads_with_required_key(self) -> None:
        """Config loads when MISTRAL_API_KEY is present."""
        env = {"MISTRAL_API_KEY": "test-key-123"}
        with patch.dict(os.environ, env, clear=True):
            loaded = MCPConfig.from_env()
        assert loaded.mistral_api_key == "test-key-123"

    def test_raises_without_api_key(self) -> None:
        """ValueError is raised when MISTRAL_API_KEY is missing."""
        # Patch load_dotenv so a developer's local .env cannot leak a key in.
        with patch("mcp_config.load_dotenv"):
            with patch.dict(os.environ, {}, clear=True):
                with pytest.raises(ValueError) as exc_info:
                    MCPConfig.from_env()
        assert "MISTRAL_API_KEY" in str(exc_info.value)

    def test_default_values_applied(self) -> None:
        """All default values are applied when only the key is set."""
        expected_defaults = {
            "ollama_base_url": "http://localhost:11434",
            "structure_llm_model": "deepseek-r1:14b",
            "structure_llm_temperature": 0.2,
            "default_llm_provider": "ollama",
            "weaviate_host": "localhost",
            "weaviate_port": 8080,
            "log_level": "INFO",
            "output_dir": Path("output"),
        }
        with patch.dict(os.environ, {"MISTRAL_API_KEY": "test-key"}, clear=True):
            config = MCPConfig.from_env()
        for attr, expected in expected_defaults.items():
            assert getattr(config, attr) == expected, attr

    def test_custom_values_loaded(self) -> None:
        """Custom environment values override every default."""
        custom_env = {
            "MISTRAL_API_KEY": "custom-key",
            "OLLAMA_BASE_URL": "http://custom:1234",
            "STRUCTURE_LLM_MODEL": "custom-model",
            "STRUCTURE_LLM_TEMPERATURE": "0.7",
            "DEFAULT_LLM_PROVIDER": "mistral",
            "WEAVIATE_HOST": "weaviate.example.com",
            "WEAVIATE_PORT": "9999",
            "LOG_LEVEL": "DEBUG",
            "OUTPUT_DIR": "/custom/output",
        }
        # Expected attribute values, including str -> float/int/Path coercions.
        expected_attrs = {
            "mistral_api_key": "custom-key",
            "ollama_base_url": "http://custom:1234",
            "structure_llm_model": "custom-model",
            "structure_llm_temperature": 0.7,
            "default_llm_provider": "mistral",
            "weaviate_host": "weaviate.example.com",
            "weaviate_port": 9999,
            "log_level": "DEBUG",
            "output_dir": Path("/custom/output"),
        }
        with patch.dict(os.environ, custom_env, clear=True):
            config = MCPConfig.from_env()
        for attr, expected in expected_attrs.items():
            assert getattr(config, attr) == expected, attr
class TestMCPConfigValidation:
    """Test MCPConfig.validate() method."""

    def test_valid_config_passes(self) -> None:
        """A fully valid configuration validates without raising."""
        MCPConfig(
            mistral_api_key="test-key",
            default_llm_provider="ollama",
            log_level="INFO",
            structure_llm_temperature=0.5,
        ).validate()

    def test_invalid_llm_provider_fails(self) -> None:
        """An unknown LLM provider is rejected with a descriptive message."""
        bad_config = MCPConfig(
            mistral_api_key="test-key",
            default_llm_provider="invalid",  # type: ignore
        )
        # match= performs a regex search over str(exception).
        with pytest.raises(ValueError, match="Invalid LLM provider"):
            bad_config.validate()

    def test_invalid_log_level_fails(self) -> None:
        """An unknown log level is rejected with a descriptive message."""
        bad_config = MCPConfig(
            mistral_api_key="test-key",
            log_level="INVALID",
        )
        with pytest.raises(ValueError, match="Invalid log level"):
            bad_config.validate()

    def test_invalid_temperature_fails(self) -> None:
        """A temperature outside the 0-2 range is rejected."""
        bad_config = MCPConfig(
            mistral_api_key="test-key",
            structure_llm_temperature=2.5,
        )
        with pytest.raises(ValueError, match="Invalid temperature"):
            bad_config.validate()
class TestMCPConfigProperties:
    """Test MCPConfig properties."""

    def test_weaviate_url_property(self) -> None:
        """weaviate_url combines host and port into an HTTP URL."""
        host, port = "my-host", 9090
        config = MCPConfig(
            mistral_api_key="test-key",
            weaviate_host=host,
            weaviate_port=port,
        )
        assert config.weaviate_url == f"http://{host}:{port}"
# Allow running this test module directly (without the pytest CLI),
# e.g. `python test_mcp_config.py` runs the suite verbosely.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])

View File

@@ -0,0 +1,673 @@
"""
Unit tests for MCP parsing tools.
Tests the parse_pdf tool handler with mocked dependencies to ensure:
- Local file processing works correctly
- URL-based PDF downloads work correctly
- Error handling is comprehensive
- Fixed parameters are used correctly
- Cost tracking is accurate
Uses asyncio for async test support.
"""
import asyncio
from pathlib import Path
from typing import Any, Dict
from unittest.mock import AsyncMock, MagicMock, patch
import httpx
import pytest
from mcp_tools.parsing_tools import (
FIXED_LLM_MODEL,
FIXED_LLM_PROVIDER,
FIXED_USE_LLM,
FIXED_USE_OCR_ANNOTATIONS,
FIXED_USE_SEMANTIC_CHUNKING,
download_pdf,
extract_filename_from_url,
is_url,
parse_pdf_handler,
)
from mcp_tools.schemas import ParsePdfInput, ParsePdfOutput
# =============================================================================
# Test is_url Helper Function
# =============================================================================
class TestIsUrl:
    """Tests for the is_url helper function."""

    def test_https_url(self) -> None:
        """HTTPS URLs are recognized."""
        recognized = is_url("https://example.com/document.pdf")
        assert recognized is True

    def test_http_url(self) -> None:
        """HTTP URLs are recognized."""
        recognized = is_url("http://example.com/document.pdf")
        assert recognized is True

    def test_local_path_unix(self) -> None:
        """Unix local paths are not recognized as URLs."""
        recognized = is_url("/path/to/document.pdf")
        assert recognized is False

    def test_local_path_windows(self) -> None:
        """Windows local paths are not recognized as URLs."""
        recognized = is_url("C:\\Documents\\document.pdf")
        assert recognized is False

    def test_relative_path(self) -> None:
        """Relative paths are not recognized as URLs."""
        recognized = is_url("./documents/document.pdf")
        assert recognized is False

    def test_ftp_url_not_supported(self) -> None:
        """FTP URLs are not recognized (only HTTP/HTTPS supported)."""
        recognized = is_url("ftp://example.com/document.pdf")
        assert recognized is False

    def test_empty_string(self) -> None:
        """Empty strings are not recognized as URLs."""
        recognized = is_url("")
        assert recognized is False
# =============================================================================
# Test extract_filename_from_url Helper Function
# =============================================================================
class TestExtractFilenameFromUrl:
    """Tests for the extract_filename_from_url helper function."""

    def test_url_with_pdf_filename(self) -> None:
        """A .pdf filename in the URL path is returned as-is."""
        filename = extract_filename_from_url(
            "https://example.com/docs/aristotle.pdf"
        )
        assert filename == "aristotle.pdf"

    def test_url_with_filename_no_extension(self) -> None:
        """A filename without extension gets .pdf appended."""
        filename = extract_filename_from_url("https://example.com/docs/aristotle")
        assert filename == "aristotle.pdf"

    def test_url_without_path(self) -> None:
        """A URL with no path falls back to the default name."""
        filename = extract_filename_from_url("https://example.com/")
        assert filename == "downloaded.pdf"

    def test_url_with_api_endpoint(self) -> None:
        """An extension-less API endpoint name gets .pdf appended."""
        filename = extract_filename_from_url("https://api.example.com/download")
        assert filename == "download.pdf"

    def test_url_with_query_params(self) -> None:
        """Query parameters are stripped before extracting the filename."""
        filename = extract_filename_from_url(
            "https://example.com/docs/kant.pdf?token=abc"
        )
        assert filename == "kant.pdf"
# =============================================================================
# Test download_pdf Function
# =============================================================================
class TestDownloadPdf:
    """Tests for the download_pdf async function.

    httpx.AsyncClient is patched at the module under test, so no real
    network traffic occurs.  The async context-manager wiring and the
    mock response construction that all three tests previously repeated
    verbatim are factored into the two private helpers below.
    """

    @staticmethod
    def _wire_async_client(
        mock_client_class: MagicMock, mock_client: AsyncMock
    ) -> None:
        """Make `async with AsyncClient() as c:` yield *mock_client*."""
        mock_client_class.return_value.__aenter__ = AsyncMock(
            return_value=mock_client
        )
        mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)

    @staticmethod
    def _make_response(content_type: str) -> MagicMock:
        """Build a mock HTTP response carrying minimal PDF bytes."""
        mock_response = MagicMock()
        mock_response.content = b"%PDF-1.4 test content"
        mock_response.headers = {"content-type": content_type}
        mock_response.raise_for_status = MagicMock()
        return mock_response

    def test_successful_download(self) -> None:
        """Test successful PDF download from URL."""

        async def run_test() -> None:
            mock_response = self._make_response("application/pdf")
            with patch(
                "mcp_tools.parsing_tools.httpx.AsyncClient"
            ) as mock_client_class:
                mock_client = AsyncMock()
                mock_client.get = AsyncMock(return_value=mock_response)
                self._wire_async_client(mock_client_class, mock_client)
                result = await download_pdf("https://example.com/document.pdf")
                assert result == b"%PDF-1.4 test content"
                # The exact URL passed in must be the one fetched.
                mock_client.get.assert_called_once_with(
                    "https://example.com/document.pdf"
                )

        asyncio.run(run_test())

    def test_download_with_non_pdf_content_type(self) -> None:
        """Test download proceeds with warning when content-type is not PDF."""

        async def run_test() -> None:
            mock_response = self._make_response("application/octet-stream")
            with patch(
                "mcp_tools.parsing_tools.httpx.AsyncClient"
            ) as mock_client_class:
                mock_client = AsyncMock()
                mock_client.get = AsyncMock(return_value=mock_response)
                self._wire_async_client(mock_client_class, mock_client)
                # Should still succeed, just logs a warning.
                result = await download_pdf("https://example.com/document.pdf")
                assert result == b"%PDF-1.4 test content"

        asyncio.run(run_test())

    def test_download_http_error(self) -> None:
        """Test that HTTP errors are propagated."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.httpx.AsyncClient"
            ) as mock_client_class:
                mock_client = AsyncMock()
                mock_client.get = AsyncMock(
                    side_effect=httpx.HTTPStatusError(
                        "Not Found",
                        request=MagicMock(),
                        response=MagicMock(status_code=404),
                    )
                )
                self._wire_async_client(mock_client_class, mock_client)
                with pytest.raises(httpx.HTTPStatusError):
                    await download_pdf("https://example.com/nonexistent.pdf")

        asyncio.run(run_test())
# =============================================================================
# Test parse_pdf_handler - Local Files
# =============================================================================
class TestParsePdfHandlerLocalFile:
    """Tests for parse_pdf_handler with local file paths."""

    def test_successful_local_file_processing(
        self,
        temp_pdf_file: Path,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test successful processing of a local PDF file."""

        async def scenario() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = successful_pipeline_result
                result = await parse_pdf_handler(
                    ParsePdfInput(pdf_path=str(temp_pdf_file))
                )
                assert result.success is True
                assert result.document_name == "test-document"
                assert result.pages == 10
                assert result.chunks_count == 25
                assert result.cost_ocr == 0.03
                assert result.cost_llm == 0.05
                assert result.cost_total == 0.08
                assert result.metadata["title"] == "Test Document Title"
                assert result.error is None

        asyncio.run(scenario())

    def test_local_file_uses_fixed_parameters(
        self,
        temp_pdf_file: Path,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test that local file processing uses the fixed optimal parameters."""

        async def scenario() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = successful_pipeline_result
                await parse_pdf_handler(ParsePdfInput(pdf_path=str(temp_pdf_file)))
                pipeline.assert_called_once()
                passed = pipeline.call_args.kwargs
                # Every fixed parameter must reach the pipeline unchanged.
                fixed = {
                    "use_llm": FIXED_USE_LLM,
                    "llm_provider": FIXED_LLM_PROVIDER,
                    "llm_model": FIXED_LLM_MODEL,
                    "use_semantic_chunking": FIXED_USE_SEMANTIC_CHUNKING,
                    "use_ocr_annotations": FIXED_USE_OCR_ANNOTATIONS,
                }
                for name, value in fixed.items():
                    assert passed[name] == value, name

        asyncio.run(scenario())

    def test_file_not_found_error(self) -> None:
        """Test error handling when local file does not exist."""

        async def scenario() -> None:
            outcome = await parse_pdf_handler(
                ParsePdfInput(pdf_path="/nonexistent/path/document.pdf")
            )
            assert outcome.success is False
            assert "not found" in outcome.error.lower()
            assert outcome.pages == 0
            assert outcome.chunks_count == 0

        asyncio.run(scenario())

    def test_pipeline_failure(
        self,
        temp_pdf_file: Path,
        failed_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test handling when the pipeline returns a failure."""

        async def scenario() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = failed_pipeline_result
                outcome = await parse_pdf_handler(
                    ParsePdfInput(pdf_path=str(temp_pdf_file))
                )
                assert outcome.success is False
                assert "OCR processing failed" in outcome.error
                assert outcome.pages == 0

        asyncio.run(scenario())

    def test_pipeline_exception(self, temp_pdf_file: Path) -> None:
        """Test handling when the pipeline raises an exception."""

        async def scenario() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.side_effect = RuntimeError("Unexpected error")
                outcome = await parse_pdf_handler(
                    ParsePdfInput(pdf_path=str(temp_pdf_file))
                )
                assert outcome.success is False
                assert "Processing error" in outcome.error
                assert "Unexpected error" in outcome.error

        asyncio.run(scenario())
# =============================================================================
# Test parse_pdf_handler - URL Downloads
# =============================================================================
class TestParsePdfHandlerUrl:
    """Tests for parse_pdf_handler with URL inputs.

    Each test patches both the network download (download_pdf) and the
    processing pipeline (process_pdf_bytes) at the module under test, so
    no real HTTP requests or PDF processing occur.
    """

    def test_successful_url_processing(
        self,
        sample_pdf_bytes: bytes,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test successful processing of a PDF from URL."""

        async def run_test() -> None:
            # download_pdf is async, hence AsyncMock.
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download:
                with patch(
                    "mcp_tools.parsing_tools.process_pdf_bytes"
                ) as mock_process:
                    mock_download.return_value = sample_pdf_bytes
                    mock_process.return_value = successful_pipeline_result
                    input_data = ParsePdfInput(
                        pdf_path="https://example.com/philosophy/kant.pdf"
                    )
                    result = await parse_pdf_handler(input_data)
                    assert result.success is True
                    assert result.document_name == "test-document"
                    # The handler must fetch exactly the URL it was given.
                    mock_download.assert_called_once_with(
                        "https://example.com/philosophy/kant.pdf"
                    )

        asyncio.run(run_test())

    def test_url_uses_extracted_filename(
        self,
        sample_pdf_bytes: bytes,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test that filename is extracted from URL for processing."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download:
                with patch(
                    "mcp_tools.parsing_tools.process_pdf_bytes"
                ) as mock_process:
                    mock_download.return_value = sample_pdf_bytes
                    mock_process.return_value = successful_pipeline_result
                    input_data = ParsePdfInput(
                        pdf_path="https://example.com/docs/aristotle-metaphysics.pdf"
                    )
                    await parse_pdf_handler(input_data)
                    # Verify filename was extracted and passed
                    mock_process.assert_called_once()
                    call_kwargs = mock_process.call_args.kwargs
                    assert call_kwargs["filename"] == "aristotle-metaphysics.pdf"

        asyncio.run(run_test())

    def test_url_uses_fixed_parameters(
        self,
        sample_pdf_bytes: bytes,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test that URL processing uses the fixed optimal parameters."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download:
                with patch(
                    "mcp_tools.parsing_tools.process_pdf_bytes"
                ) as mock_process:
                    mock_download.return_value = sample_pdf_bytes
                    mock_process.return_value = successful_pipeline_result
                    input_data = ParsePdfInput(
                        pdf_path="https://example.com/document.pdf"
                    )
                    await parse_pdf_handler(input_data)
                    # The module-level FIXED_* constants must reach the
                    # pipeline call unchanged.
                    call_kwargs = mock_process.call_args.kwargs
                    assert call_kwargs["llm_provider"] == FIXED_LLM_PROVIDER
                    assert call_kwargs["llm_model"] == FIXED_LLM_MODEL
                    assert (
                        call_kwargs["use_semantic_chunking"]
                        == FIXED_USE_SEMANTIC_CHUNKING
                    )
                    assert (
                        call_kwargs["use_ocr_annotations"] == FIXED_USE_OCR_ANNOTATIONS
                    )

        asyncio.run(run_test())

    def test_url_download_http_error(self) -> None:
        """Test error handling when URL download fails with HTTP error."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download:
                # Simulate a 404 raised by the download step.
                mock_download.side_effect = httpx.HTTPStatusError(
                    "Not Found",
                    request=MagicMock(),
                    response=MagicMock(status_code=404),
                )
                input_data = ParsePdfInput(
                    pdf_path="https://example.com/nonexistent.pdf"
                )
                result = await parse_pdf_handler(input_data)
                assert result.success is False
                assert "Failed to download PDF" in result.error

        asyncio.run(run_test())

    def test_url_download_network_error(self) -> None:
        """Test error handling when URL download fails with network error."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download:
                mock_download.side_effect = httpx.ConnectError("Connection refused")
                input_data = ParsePdfInput(
                    pdf_path="https://example.com/document.pdf"
                )
                result = await parse_pdf_handler(input_data)
                assert result.success is False
                assert "Failed to download PDF" in result.error

        asyncio.run(run_test())
# =============================================================================
# Test Cost Tracking
# =============================================================================
class TestCostTracking:
    """Tests for cost tracking in parse_pdf output."""

    def test_costs_are_tracked_correctly(self, temp_pdf_file: Path) -> None:
        """Test that OCR and LLM costs are correctly tracked."""

        async def scenario() -> None:
            pipeline_result: Dict[str, Any] = {
                "success": True,
                "document_name": "test-doc",
                "source_id": "test-doc",
                "pages": 50,
                "chunks_count": 100,
                "cost_ocr": 0.15,  # 50 pages * 0.003€
                "cost_llm": 0.25,
                "cost_total": 0.40,
                "output_dir": Path("output/test-doc"),
                "metadata": {},
                "error": None,
            }
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = pipeline_result
                result = await parse_pdf_handler(
                    ParsePdfInput(pdf_path=str(temp_pdf_file))
                )
                assert result.cost_ocr == 0.15
                assert result.cost_llm == 0.25
                assert result.cost_total == 0.40

        asyncio.run(scenario())

    def test_cost_total_calculated_when_missing(self, temp_pdf_file: Path) -> None:
        """Test that cost_total is calculated if not provided."""

        async def scenario() -> None:
            pipeline_result: Dict[str, Any] = {
                "success": True,
                "document_name": "test-doc",
                "source_id": "test-doc",
                "pages": 10,
                "chunks_count": 20,
                "cost_ocr": 0.03,
                "cost_llm": 0.05,
                # cost_total intentionally missing
                "output_dir": Path("output/test-doc"),
                "metadata": {},
                "error": None,
            }
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = pipeline_result
                result = await parse_pdf_handler(
                    ParsePdfInput(pdf_path=str(temp_pdf_file))
                )
                assert result.cost_total == 0.08  # 0.03 + 0.05

        asyncio.run(scenario())

    def test_zero_costs_on_failure(self, temp_pdf_file: Path) -> None:
        """Test that costs are zero when processing fails early."""

        async def scenario() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.side_effect = RuntimeError("Early failure")
                result = await parse_pdf_handler(
                    ParsePdfInput(pdf_path=str(temp_pdf_file))
                )
                assert result.success is False
                assert result.cost_ocr == 0.0
                assert result.cost_llm == 0.0
                assert result.cost_total == 0.0

        asyncio.run(scenario())
# =============================================================================
# Test Metadata Handling
# =============================================================================
class TestMetadataHandling:
    """Tests for metadata extraction and handling."""

    def test_metadata_extracted_correctly(self, temp_pdf_file: Path) -> None:
        """Test that metadata is correctly passed through."""

        async def scenario() -> None:
            document_metadata = {
                "title": "Ménon",
                "author": "Platon",
                "language": "fr",
                "year": -380,
                "genre": "dialogue",
            }
            pipeline_result: Dict[str, Any] = {
                "success": True,
                "document_name": "platon-menon",
                "source_id": "platon-menon",
                "pages": 80,
                "chunks_count": 150,
                "cost_ocr": 0.24,
                "cost_llm": 0.30,
                "cost_total": 0.54,
                "output_dir": Path("output/platon-menon"),
                "metadata": document_metadata,
                "error": None,
            }
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = pipeline_result
                result = await parse_pdf_handler(
                    ParsePdfInput(pdf_path=str(temp_pdf_file))
                )
                # Every metadata entry must survive the round trip intact.
                for key, value in document_metadata.items():
                    assert result.metadata[key] == value, key

        asyncio.run(scenario())

    def test_empty_metadata_handled(self, temp_pdf_file: Path) -> None:
        """Test that empty/None metadata is handled gracefully."""

        async def scenario() -> None:
            pipeline_result: Dict[str, Any] = {
                "success": True,
                "document_name": "test-doc",
                "source_id": "test-doc",
                "pages": 10,
                "chunks_count": 20,
                "cost_ocr": 0.03,
                "cost_llm": 0.05,
                "cost_total": 0.08,
                "output_dir": Path("output/test-doc"),
                "metadata": None,  # Explicitly None
                "error": None,
            }
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = pipeline_result
                result = await parse_pdf_handler(
                    ParsePdfInput(pdf_path=str(temp_pdf_file))
                )
                assert result.metadata == {}

        asyncio.run(scenario())
# =============================================================================
# Test Output Schema Validation
# =============================================================================
class TestOutputSchemaValidation:
    """Tests for ParsePdfOutput schema compliance."""

    def test_output_is_valid_schema(
        self,
        temp_pdf_file: Path,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test that output conforms to ParsePdfOutput schema."""

        async def scenario() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = successful_pipeline_result
                result = await parse_pdf_handler(
                    ParsePdfInput(pdf_path=str(temp_pdf_file))
                )
                # Verify it's the correct type.
                assert isinstance(result, ParsePdfOutput)
                # Verify all required fields are present.
                required_fields = (
                    "success",
                    "document_name",
                    "source_id",
                    "pages",
                    "chunks_count",
                    "cost_ocr",
                    "cost_llm",
                    "cost_total",
                    "output_dir",
                    "metadata",
                    "error",
                )
                for field_name in required_fields:
                    assert hasattr(result, field_name), field_name

        asyncio.run(scenario())

    def test_error_output_is_valid_schema(self) -> None:
        """Test that error output conforms to ParsePdfOutput schema."""

        async def scenario() -> None:
            result = await parse_pdf_handler(
                ParsePdfInput(pdf_path="/nonexistent/file.pdf")
            )
            assert isinstance(result, ParsePdfOutput)
            assert result.success is False
            assert result.error is not None
            assert isinstance(result.error, str)

        asyncio.run(scenario())

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,256 @@
"""
Unit tests for MCP Pydantic schemas.
Tests schema validation, field constraints, and JSON schema generation.
"""
import pytest
from pydantic import ValidationError
from mcp_tools.schemas import (
ParsePdfInput,
ParsePdfOutput,
SearchChunksInput,
SearchChunksOutput,
SearchSummariesInput,
GetDocumentInput,
ListDocumentsInput,
GetChunksByDocumentInput,
FilterByAuthorInput,
DeleteDocumentInput,
ChunkResult,
DocumentInfo,
)
class TestParsePdfInput:
    """Test ParsePdfInput schema validation."""

    def test_valid_path(self) -> None:
        """A filesystem path is accepted and stored verbatim."""
        path = "/path/to/document.pdf"
        assert ParsePdfInput(pdf_path=path).pdf_path == path

    def test_valid_url(self) -> None:
        """An HTTPS URL is accepted and stored verbatim."""
        url = "https://example.com/doc.pdf"
        assert ParsePdfInput(pdf_path=url).pdf_path == url

    def test_empty_path_rejected(self) -> None:
        """An empty path fails the minimum-length constraint."""
        with pytest.raises(ValidationError) as exc_info:
            ParsePdfInput(pdf_path="")
        assert "string_too_short" in str(exc_info.value).lower()
class TestParsePdfOutput:
    """Test ParsePdfOutput schema."""

    def test_full_output(self) -> None:
        """A complete, successful output stores its field values."""
        values = dict(
            success=True,
            document_name="test-doc",
            source_id="test-doc-v1",
            pages=10,
            chunks_count=25,
            cost_ocr=0.03,
            cost_llm=0.01,
            cost_total=0.04,
            output_dir="/output/test-doc",
            metadata={"title": "Test", "author": "Unknown"},
        )
        output = ParsePdfOutput(**values)
        assert output.success is True
        assert output.cost_total == 0.04
        assert output.metadata["title"] == "Test"

    def test_output_with_error(self) -> None:
        """A failed output carries the error message and zeroed counters."""
        failed = ParsePdfOutput(
            success=False,
            document_name="failed-doc",
            source_id="",
            pages=0,
            chunks_count=0,
            cost_ocr=0.0,
            cost_llm=0.0,
            cost_total=0.0,
            output_dir="",
            error="PDF processing failed: corrupted file",
        )
        assert failed.success is False
        assert "corrupted" in failed.error  # type: ignore
class TestSearchChunksInput:
    """Test SearchChunksInput schema validation."""

    def test_minimal_input(self) -> None:
        """Only the query is required; other fields take their defaults."""
        minimal = SearchChunksInput(query="test query")
        assert minimal.query == "test query"
        assert minimal.limit == 10  # default
        assert minimal.min_similarity == 0.0  # default

    def test_full_input(self) -> None:
        """All optional filter fields are accepted and stored."""
        full = SearchChunksInput(
            query="What is justice?",
            limit=20,
            min_similarity=0.5,
            author_filter="Platon",
            work_filter="Republic",
            language_filter="fr",
        )
        assert full.limit == 20
        assert full.author_filter == "Platon"

    def test_empty_query_rejected(self) -> None:
        """An empty query fails validation."""
        with pytest.raises(ValidationError):
            SearchChunksInput(query="")

    def test_query_too_long_rejected(self) -> None:
        """A query over 1000 characters fails validation."""
        with pytest.raises(ValidationError):
            SearchChunksInput(query="a" * 1001)

    def test_limit_bounds(self) -> None:
        """Limits just outside the accepted range are rejected."""
        for bad_limit in (0, 101):
            with pytest.raises(ValidationError):
                SearchChunksInput(query="test", limit=bad_limit)

    def test_similarity_bounds(self) -> None:
        """Similarities just outside the accepted range are rejected."""
        for bad_similarity in (-0.1, 1.1):
            with pytest.raises(ValidationError):
                SearchChunksInput(query="test", min_similarity=bad_similarity)
class TestSearchSummariesInput:
    """Test SearchSummariesInput schema validation."""

    def test_level_filters(self) -> None:
        """min_level and max_level are stored as given."""
        filtered = SearchSummariesInput(query="test", min_level=1, max_level=3)
        assert filtered.min_level == 1
        assert filtered.max_level == 3

    def test_level_bounds(self) -> None:
        """Levels outside the allowed bounds are rejected."""
        with pytest.raises(ValidationError):
            SearchSummariesInput(query="test", min_level=0)
        with pytest.raises(ValidationError):
            SearchSummariesInput(query="test", max_level=6)
class TestGetDocumentInput:
    """Test GetDocumentInput schema validation."""

    def test_defaults(self) -> None:
        """include_chunks defaults to False and chunk_limit to 50."""
        defaults = GetDocumentInput(source_id="doc-123")
        assert defaults.include_chunks is False
        assert defaults.chunk_limit == 50

    def test_with_chunks(self) -> None:
        """Chunks can be requested explicitly with a custom limit."""
        request = GetDocumentInput(
            source_id="doc-123",
            include_chunks=True,
            chunk_limit=100,
        )
        assert request.include_chunks is True
        assert request.chunk_limit == 100
class TestDeleteDocumentInput:
    """Test DeleteDocumentInput schema validation."""

    def test_requires_confirmation(self) -> None:
        """confirm defaults to False so deletion is opt-in."""
        request = DeleteDocumentInput(source_id="doc-to-delete")
        assert request.confirm is False

    def test_with_confirmation(self) -> None:
        """confirm=True is accepted and stored."""
        request = DeleteDocumentInput(source_id="doc-to-delete", confirm=True)
        assert request.confirm is True
class TestChunkResult:
    """Test ChunkResult model."""

    def test_full_chunk(self) -> None:
        """All chunk fields are accepted and stored as given."""
        values = dict(
            text="This is the chunk content.",
            similarity=0.85,
            section_path="Chapter 1 > Section 1",
            chapter_title="Introduction",
            work_title="The Republic",
            work_author="Platon",
            order_index=5,
        )
        chunk = ChunkResult(**values)
        assert chunk.similarity == 0.85
        assert chunk.order_index == 5
class TestDocumentInfo:
    """Test DocumentInfo model."""

    def test_with_optional_fields(self) -> None:
        """Optional toc and hierarchy fields are preserved when supplied."""
        values = dict(
            source_id="platon-republic",
            work_title="The Republic",
            work_author="Platon",
            edition="GF Flammarion",
            pages=500,
            language="fr",
            toc={"chapters": ["I", "II", "III"]},
            hierarchy={"level": 1},
        )
        document = DocumentInfo(**values)
        assert document.toc is not None
        assert document.hierarchy is not None
class TestJsonSchemaGeneration:
    """Test JSON schema generation from Pydantic models."""

    def test_schemas_have_descriptions(self) -> None:
        """Every documented field carries a description in the JSON schema."""
        properties = SearchChunksInput.model_json_schema()["properties"]
        for field_name in ("query", "limit", "min_similarity"):
            assert "description" in properties[field_name], field_name

    def test_schema_includes_constraints(self) -> None:
        """Validation constraints are exported into the JSON schema."""
        props = SearchChunksInput.model_json_schema()["properties"]
        # String length constraints on the query field.
        assert props["query"].get("minLength") == 1
        assert props["query"].get("maxLength") == 1000
        # Numeric bounds on the limit field.
        assert props["limit"].get("minimum") == 1
        assert props["limit"].get("maximum") == 100
# Allow running this test module directly (without the pytest CLI),
# e.g. `python test_schemas.py` runs the suite verbosely.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])