Add Library RAG project and cleanup root directory
- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
673
generations/library_rag/tests/mcp/test_parsing_tools.py
Normal file
673
generations/library_rag/tests/mcp/test_parsing_tools.py
Normal file
@@ -0,0 +1,673 @@
|
||||
"""
|
||||
Unit tests for MCP parsing tools.
|
||||
|
||||
Tests the parse_pdf tool handler with mocked dependencies to ensure:
|
||||
- Local file processing works correctly
|
||||
- URL-based PDF downloads work correctly
|
||||
- Error handling is comprehensive
|
||||
- Fixed parameters are used correctly
|
||||
- Cost tracking is accurate
|
||||
|
||||
Uses asyncio for async test support.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
from mcp_tools.parsing_tools import (
|
||||
FIXED_LLM_MODEL,
|
||||
FIXED_LLM_PROVIDER,
|
||||
FIXED_USE_LLM,
|
||||
FIXED_USE_OCR_ANNOTATIONS,
|
||||
FIXED_USE_SEMANTIC_CHUNKING,
|
||||
download_pdf,
|
||||
extract_filename_from_url,
|
||||
is_url,
|
||||
parse_pdf_handler,
|
||||
)
|
||||
from mcp_tools.schemas import ParsePdfInput, ParsePdfOutput
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Test is_url Helper Function
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TestIsUrl:
    """Checks which strings the is_url helper classifies as URLs."""

    def test_https_url(self) -> None:
        """An https:// address is reported as a URL."""
        verdict = is_url("https://example.com/document.pdf")
        assert verdict is True

    def test_http_url(self) -> None:
        """A plain http:// address is reported as a URL."""
        verdict = is_url("http://example.com/document.pdf")
        assert verdict is True

    def test_local_path_unix(self) -> None:
        """An absolute Unix path is not reported as a URL."""
        verdict = is_url("/path/to/document.pdf")
        assert verdict is False

    def test_local_path_windows(self) -> None:
        """A Windows drive path is not reported as a URL."""
        verdict = is_url("C:\\Documents\\document.pdf")
        assert verdict is False

    def test_relative_path(self) -> None:
        """A dot-relative path is not reported as a URL."""
        verdict = is_url("./documents/document.pdf")
        assert verdict is False

    def test_ftp_url_not_supported(self) -> None:
        """An ftp:// address is rejected; only HTTP/HTTPS count as URLs."""
        verdict = is_url("ftp://example.com/document.pdf")
        assert verdict is False

    def test_empty_string(self) -> None:
        """The empty string is not reported as a URL."""
        verdict = is_url("")
        assert verdict is False
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Test extract_filename_from_url Helper Function
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TestExtractFilenameFromUrl:
    """Checks the filename extract_filename_from_url derives from a URL."""

    def test_url_with_pdf_filename(self) -> None:
        """A path ending in a .pdf file yields that file name unchanged."""
        url = "https://example.com/docs/aristotle.pdf"
        assert extract_filename_from_url(url) == "aristotle.pdf"

    def test_url_with_filename_no_extension(self) -> None:
        """A last path segment without extension gets ".pdf" appended."""
        url = "https://example.com/docs/aristotle"
        assert extract_filename_from_url(url) == "aristotle.pdf"

    def test_url_without_path(self) -> None:
        """A URL with only a root path falls back to "downloaded.pdf"."""
        url = "https://example.com/"
        assert extract_filename_from_url(url) == "downloaded.pdf"

    def test_url_with_api_endpoint(self) -> None:
        """An API-style endpoint name is reused with ".pdf" appended."""
        url = "https://api.example.com/download"
        assert extract_filename_from_url(url) == "download.pdf"

    def test_url_with_query_params(self) -> None:
        """The query string is not part of the extracted file name."""
        url = "https://example.com/docs/kant.pdf?token=abc"
        assert extract_filename_from_url(url) == "kant.pdf"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Test download_pdf Function
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TestDownloadPdf:
    """Exercises the download_pdf coroutine against a mocked httpx client."""

    # Dotted path of the client class that every test replaces with a mock.
    _CLIENT_TARGET = "mcp_tools.parsing_tools.httpx.AsyncClient"

    @staticmethod
    def _enter_returns(client_cls: MagicMock, client: AsyncMock) -> None:
        """Make ``async with client_cls(...)`` hand back *client*."""
        context = client_cls.return_value
        context.__aenter__ = AsyncMock(return_value=client)
        context.__aexit__ = AsyncMock(return_value=None)

    def test_successful_download(self) -> None:
        """The response body is returned and the given URL is requested once."""
        url = "https://example.com/document.pdf"
        payload = b"%PDF-1.4 test content"

        async def scenario() -> None:
            response = MagicMock(
                content=payload,
                headers={"content-type": "application/pdf"},
                raise_for_status=MagicMock(),
            )
            client = AsyncMock()
            client.get = AsyncMock(return_value=response)

            with patch(self._CLIENT_TARGET) as client_cls:
                self._enter_returns(client_cls, client)
                fetched = await download_pdf(url)

            assert fetched == payload
            client.get.assert_called_once_with(url)

        asyncio.run(scenario())

    def test_download_with_non_pdf_content_type(self) -> None:
        """A non-PDF content-type header does not stop the download."""
        payload = b"%PDF-1.4 test content"

        async def scenario() -> None:
            response = MagicMock(
                content=payload,
                headers={"content-type": "application/octet-stream"},
                raise_for_status=MagicMock(),
            )
            client = AsyncMock()
            client.get = AsyncMock(return_value=response)

            with patch(self._CLIENT_TARGET) as client_cls:
                self._enter_returns(client_cls, client)
                # The body must still come back despite the unexpected header.
                fetched = await download_pdf("https://example.com/document.pdf")

            assert fetched == payload

        asyncio.run(scenario())

    def test_download_http_error(self) -> None:
        """An HTTPStatusError raised by the client reaches the caller."""

        async def scenario() -> None:
            not_found = httpx.HTTPStatusError(
                "Not Found",
                request=MagicMock(),
                response=MagicMock(status_code=404),
            )
            client = AsyncMock()
            client.get = AsyncMock(side_effect=not_found)

            with patch(self._CLIENT_TARGET) as client_cls:
                self._enter_returns(client_cls, client)

                with pytest.raises(httpx.HTTPStatusError):
                    await download_pdf("https://example.com/nonexistent.pdf")

        asyncio.run(scenario())
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Test parse_pdf_handler - Local Files
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TestParsePdfHandlerLocalFile:
    """Tests for parse_pdf_handler when pdf_path is a local file path.

    ``mcp_tools.parsing_tools.process_pdf`` is patched in every test except
    the missing-file case, which calls the handler with nothing mocked.
    """

    def test_successful_local_file_processing(
        self,
        temp_pdf_file: Path,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test successful processing of a local PDF file."""

        async def run_test() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as mock_process_pdf:
                mock_process_pdf.return_value = successful_pipeline_result

                input_data = ParsePdfInput(pdf_path=str(temp_pdf_file))
                result = await parse_pdf_handler(input_data)

                # Expected values presumably match the
                # successful_pipeline_result fixture - verify against conftest.
                assert result.success is True
                assert result.document_name == "test-document"
                assert result.pages == 10
                assert result.chunks_count == 25
                assert result.cost_ocr == 0.03
                assert result.cost_llm == 0.05
                assert result.cost_total == 0.08
                assert result.metadata["title"] == "Test Document Title"
                assert result.error is None

        asyncio.run(run_test())

    def test_local_file_uses_fixed_parameters(
        self,
        temp_pdf_file: Path,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test that local file processing uses the fixed optimal parameters."""

        async def run_test() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as mock_process_pdf:
                mock_process_pdf.return_value = successful_pipeline_result

                input_data = ParsePdfInput(pdf_path=str(temp_pdf_file))
                await parse_pdf_handler(input_data)

                # Verify the pipeline was invoked once with the FIXED_* values.
                mock_process_pdf.assert_called_once()
                call_kwargs = mock_process_pdf.call_args.kwargs

                assert call_kwargs["use_llm"] == FIXED_USE_LLM
                assert call_kwargs["llm_provider"] == FIXED_LLM_PROVIDER
                assert call_kwargs["llm_model"] == FIXED_LLM_MODEL
                assert call_kwargs["use_semantic_chunking"] == FIXED_USE_SEMANTIC_CHUNKING
                assert call_kwargs["use_ocr_annotations"] == FIXED_USE_OCR_ANNOTATIONS

        asyncio.run(run_test())

    def test_file_not_found_error(self) -> None:
        """Test error handling when local file does not exist."""

        async def run_test() -> None:
            input_data = ParsePdfInput(pdf_path="/nonexistent/path/document.pdf")
            result = await parse_pdf_handler(input_data)

            assert result.success is False
            # Guard first: a missing message should fail as an assertion,
            # not as an AttributeError raised by calling .lower() on None.
            assert result.error is not None
            assert "not found" in result.error.lower()
            assert result.pages == 0
            assert result.chunks_count == 0

        asyncio.run(run_test())

    def test_pipeline_failure(
        self,
        temp_pdf_file: Path,
        failed_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test handling when the pipeline returns a failure."""

        async def run_test() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as mock_process_pdf:
                mock_process_pdf.return_value = failed_pipeline_result

                input_data = ParsePdfInput(pdf_path=str(temp_pdf_file))
                result = await parse_pdf_handler(input_data)

                assert result.success is False
                # Guard before the substring check (``in None`` is a TypeError).
                assert result.error is not None
                assert "OCR processing failed" in result.error
                assert result.pages == 0

        asyncio.run(run_test())

    def test_pipeline_exception(
        self,
        temp_pdf_file: Path,
    ) -> None:
        """Test handling when the pipeline raises an exception."""

        async def run_test() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as mock_process_pdf:
                mock_process_pdf.side_effect = RuntimeError("Unexpected error")

                input_data = ParsePdfInput(pdf_path=str(temp_pdf_file))
                result = await parse_pdf_handler(input_data)

                assert result.success is False
                # Guard before the substring checks (``in None`` is a TypeError).
                assert result.error is not None
                assert "Processing error" in result.error
                assert "Unexpected error" in result.error

        asyncio.run(run_test())
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Test parse_pdf_handler - URL Downloads
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TestParsePdfHandlerUrl:
    """Tests for parse_pdf_handler when pdf_path is an HTTP(S) URL.

    ``mcp_tools.parsing_tools.download_pdf`` is always replaced by an
    AsyncMock; the success-path tests also patch ``process_pdf_bytes``.
    """

    def test_successful_url_processing(
        self,
        sample_pdf_bytes: bytes,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test successful processing of a PDF from URL."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download, patch(
                "mcp_tools.parsing_tools.process_pdf_bytes"
            ) as mock_process:
                mock_download.return_value = sample_pdf_bytes
                mock_process.return_value = successful_pipeline_result

                input_data = ParsePdfInput(
                    pdf_path="https://example.com/philosophy/kant.pdf"
                )
                result = await parse_pdf_handler(input_data)

                assert result.success is True
                assert result.document_name == "test-document"
                mock_download.assert_called_once_with(
                    "https://example.com/philosophy/kant.pdf"
                )

        asyncio.run(run_test())

    def test_url_uses_extracted_filename(
        self,
        sample_pdf_bytes: bytes,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test that filename is extracted from URL for processing."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download, patch(
                "mcp_tools.parsing_tools.process_pdf_bytes"
            ) as mock_process:
                mock_download.return_value = sample_pdf_bytes
                mock_process.return_value = successful_pipeline_result

                input_data = ParsePdfInput(
                    pdf_path="https://example.com/docs/aristotle-metaphysics.pdf"
                )
                await parse_pdf_handler(input_data)

                # Verify filename was extracted and passed
                mock_process.assert_called_once()
                call_kwargs = mock_process.call_args.kwargs
                assert call_kwargs["filename"] == "aristotle-metaphysics.pdf"

        asyncio.run(run_test())

    def test_url_uses_fixed_parameters(
        self,
        sample_pdf_bytes: bytes,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """Test that URL processing uses the fixed optimal parameters."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download, patch(
                "mcp_tools.parsing_tools.process_pdf_bytes"
            ) as mock_process:
                mock_download.return_value = sample_pdf_bytes
                mock_process.return_value = successful_pipeline_result

                input_data = ParsePdfInput(
                    pdf_path="https://example.com/document.pdf"
                )
                await parse_pdf_handler(input_data)

                # Without this, an uncalled mock has call_args of None and
                # the next line fails with an opaque AttributeError.
                mock_process.assert_called_once()
                call_kwargs = mock_process.call_args.kwargs

                assert call_kwargs["llm_provider"] == FIXED_LLM_PROVIDER
                assert call_kwargs["llm_model"] == FIXED_LLM_MODEL
                assert (
                    call_kwargs["use_semantic_chunking"]
                    == FIXED_USE_SEMANTIC_CHUNKING
                )
                assert (
                    call_kwargs["use_ocr_annotations"] == FIXED_USE_OCR_ANNOTATIONS
                )

        asyncio.run(run_test())

    def test_url_download_http_error(self) -> None:
        """Test error handling when URL download fails with HTTP error."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download:
                mock_download.side_effect = httpx.HTTPStatusError(
                    "Not Found",
                    request=MagicMock(),
                    response=MagicMock(status_code=404),
                )

                input_data = ParsePdfInput(
                    pdf_path="https://example.com/nonexistent.pdf"
                )
                result = await parse_pdf_handler(input_data)

                assert result.success is False
                # Guard before the substring check (``in None`` is a TypeError).
                assert result.error is not None
                assert "Failed to download PDF" in result.error

        asyncio.run(run_test())

    def test_url_download_network_error(self) -> None:
        """Test error handling when URL download fails with network error."""

        async def run_test() -> None:
            with patch(
                "mcp_tools.parsing_tools.download_pdf", new_callable=AsyncMock
            ) as mock_download:
                mock_download.side_effect = httpx.ConnectError("Connection refused")

                input_data = ParsePdfInput(
                    pdf_path="https://example.com/document.pdf"
                )
                result = await parse_pdf_handler(input_data)

                assert result.success is False
                # Guard before the substring check (``in None`` is a TypeError).
                assert result.error is not None
                assert "Failed to download PDF" in result.error

        asyncio.run(run_test())
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Test Cost Tracking
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TestCostTracking:
    """Tests for the cost fields of the parse_pdf output.

    Every test patches ``mcp_tools.parsing_tools.process_pdf`` and inspects
    cost_ocr, cost_llm and cost_total on the handler's result.
    """

    def test_costs_are_tracked_correctly(
        self,
        temp_pdf_file: Path,
    ) -> None:
        """Test that OCR and LLM costs are correctly tracked."""

        async def run_test() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as mock_process_pdf:
                mock_process_pdf.return_value = {
                    "success": True,
                    "document_name": "test-doc",
                    "source_id": "test-doc",
                    "pages": 50,
                    "chunks_count": 100,
                    "cost_ocr": 0.15,  # 50 pages * 0.003€
                    "cost_llm": 0.25,
                    "cost_total": 0.40,
                    "output_dir": Path("output/test-doc"),
                    "metadata": {},
                    "error": None,
                }

                input_data = ParsePdfInput(pdf_path=str(temp_pdf_file))
                result = await parse_pdf_handler(input_data)

                # Values are supplied by the mock above, so exact equality
                # is appropriate here.
                assert result.cost_ocr == 0.15
                assert result.cost_llm == 0.25
                assert result.cost_total == 0.40

        asyncio.run(run_test())

    def test_cost_total_calculated_when_missing(
        self,
        temp_pdf_file: Path,
    ) -> None:
        """Test that cost_total is calculated if not provided."""

        async def run_test() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as mock_process_pdf:
                mock_process_pdf.return_value = {
                    "success": True,
                    "document_name": "test-doc",
                    "source_id": "test-doc",
                    "pages": 10,
                    "chunks_count": 20,
                    "cost_ocr": 0.03,
                    "cost_llm": 0.05,
                    # cost_total intentionally missing
                    "output_dir": Path("output/test-doc"),
                    "metadata": {},
                    "error": None,
                }

                input_data = ParsePdfInput(pdf_path=str(temp_pdf_file))
                result = await parse_pdf_handler(input_data)

                # The total is expected to be derived by float arithmetic
                # (0.03 + 0.05); compare with a tolerance so the test does
                # not depend on how that sum happens to round.
                assert result.cost_total == pytest.approx(0.08)

        asyncio.run(run_test())

    def test_zero_costs_on_failure(
        self,
        temp_pdf_file: Path,
    ) -> None:
        """Test that costs are zero when processing fails early."""

        async def run_test() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as mock_process_pdf:
                mock_process_pdf.side_effect = RuntimeError("Early failure")

                input_data = ParsePdfInput(pdf_path=str(temp_pdf_file))
                result = await parse_pdf_handler(input_data)

                assert result.success is False
                assert result.cost_ocr == 0.0
                assert result.cost_llm == 0.0
                assert result.cost_total == 0.0

        asyncio.run(run_test())
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Test Metadata Handling
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TestMetadataHandling:
    """Checks how parse_pdf_handler exposes the pipeline's metadata field."""

    def test_metadata_extracted_correctly(
        self,
        temp_pdf_file: Path,
    ) -> None:
        """Each metadata entry returned by the pipeline appears in the result."""
        expected_metadata = {
            "title": "Ménon",
            "author": "Platon",
            "language": "fr",
            "year": -380,
            "genre": "dialogue",
        }

        async def scenario() -> None:
            pipeline_result = {
                "success": True,
                "document_name": "platon-menon",
                "source_id": "platon-menon",
                "pages": 80,
                "chunks_count": 150,
                "cost_ocr": 0.24,
                "cost_llm": 0.30,
                "cost_total": 0.54,
                "output_dir": Path("output/platon-menon"),
                # A copy, so the expectations cannot be altered by the handler.
                "metadata": dict(expected_metadata),
                "error": None,
            }

            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = pipeline_result
                request = ParsePdfInput(pdf_path=str(temp_pdf_file))
                result = await parse_pdf_handler(request)

            for key in ("title", "author", "language", "year", "genre"):
                assert result.metadata[key] == expected_metadata[key]

        asyncio.run(scenario())

    def test_empty_metadata_handled(
        self,
        temp_pdf_file: Path,
    ) -> None:
        """A pipeline metadata value of None surfaces as an empty dict."""

        async def scenario() -> None:
            pipeline_result = {
                "success": True,
                "document_name": "test-doc",
                "source_id": "test-doc",
                "pages": 10,
                "chunks_count": 20,
                "cost_ocr": 0.03,
                "cost_llm": 0.05,
                "cost_total": 0.08,
                "output_dir": Path("output/test-doc"),
                "metadata": None,  # deliberately None rather than {}
                "error": None,
            }

            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = pipeline_result
                request = ParsePdfInput(pdf_path=str(temp_pdf_file))
                result = await parse_pdf_handler(request)

            assert result.metadata == {}

        asyncio.run(scenario())
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Test Output Schema Validation
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TestOutputSchemaValidation:
    """Checks that parse_pdf_handler always returns a ParsePdfOutput."""

    def test_output_is_valid_schema(
        self,
        temp_pdf_file: Path,
        successful_pipeline_result: Dict[str, Any],
    ) -> None:
        """A successful run yields a ParsePdfOutput exposing every field."""
        expected_fields = (
            "success",
            "document_name",
            "source_id",
            "pages",
            "chunks_count",
            "cost_ocr",
            "cost_llm",
            "cost_total",
            "output_dir",
            "metadata",
            "error",
        )

        async def scenario() -> None:
            with patch("mcp_tools.parsing_tools.process_pdf") as pipeline:
                pipeline.return_value = successful_pipeline_result
                request = ParsePdfInput(pdf_path=str(temp_pdf_file))
                result = await parse_pdf_handler(request)

            # The handler must return the schema type itself.
            assert isinstance(result, ParsePdfOutput)

            # Every listed attribute must be present on the result.
            for field_name in expected_fields:
                assert hasattr(result, field_name)

        asyncio.run(scenario())

    def test_error_output_is_valid_schema(self) -> None:
        """A missing input file still yields a ParsePdfOutput with an error."""

        async def scenario() -> None:
            request = ParsePdfInput(pdf_path="/nonexistent/file.pdf")
            result = await parse_pdf_handler(request)

            assert isinstance(result, ParsePdfOutput)
            assert result.success is False
            assert result.error is not None
            assert isinstance(result.error, str)

        asyncio.run(scenario())
|
||||
Reference in New Issue
Block a user