Files
linear-coding-agent/generations/library_rag/tests/utils/test_toc_enricher.py
David Blanc Brioir d2f7165120 Add Library RAG project and cleanup root directory
- Add complete Library RAG application (Flask + MCP server)
  - PDF processing pipeline with OCR and LLM extraction
  - Weaviate vector database integration (BGE-M3 embeddings)
  - Flask web interface with search and document management
  - MCP server for Claude Desktop integration
  - Comprehensive test suite (134 tests)

- Clean up root directory
  - Remove obsolete documentation files
  - Remove backup and temporary files
  - Update autonomous agent configuration

- Update prompts
  - Enhance initializer bis prompt with better instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-30 11:57:12 +01:00

430 lines
15 KiB
Python

"""Unit tests for TOC enrichment module.
Tests the enrichment of chunk metadata with hierarchical information
from the table of contents (TOC).
"""
from typing import Any, Dict, List
import pytest
from utils.toc_enricher import (
enrich_chunks_with_toc,
extract_paragraph_number,
find_matching_toc_entry,
flatten_toc_with_paths,
)
from utils.types import FlatTOCEntryEnriched
class TestFlattenTocWithPaths:
"""Tests for flatten_toc_with_paths function."""
def test_flatten_simple_toc(self) -> None:
"""Test flattening a simple two-level TOC."""
toc: List[Dict[str, Any]] = [
{
"title": "Chapter 1",
"level": 1,
"children": [
{"title": "Section 1.1", "level": 2, "children": []},
{"title": "Section 1.2", "level": 2, "children": []},
],
},
]
flat_toc = flatten_toc_with_paths(toc, {})
assert len(flat_toc) == 3
assert flat_toc[0]["title"] == "Chapter 1"
assert flat_toc[0]["level"] == 1
assert flat_toc[0]["full_path"] == "Chapter 1"
assert flat_toc[1]["title"] == "Section 1.1"
assert flat_toc[1]["full_path"] == "Chapter 1 > Section 1.1"
assert flat_toc[1]["chapter_title"] == "Chapter 1"
def test_flatten_peirce_toc_with_cp_references(self) -> None:
"""Test flattening Peirce TOC with CP references."""
toc: List[Dict[str, Any]] = [
{
"title": "Peirce: CP 1.628",
"level": 1,
"children": [
{
"title": "628. It is the instincts...",
"level": 2,
"children": [],
},
],
},
]
flat_toc = flatten_toc_with_paths(toc, {})
assert len(flat_toc) == 2
# Level 1 entry should extract CP reference
assert flat_toc[0]["canonical_ref"] == "CP 1.628"
# Level 2 entry should inherit CP reference
assert flat_toc[1]["canonical_ref"] == "CP 1.628"
assert flat_toc[1]["full_path"] == "Peirce: CP 1.628 > 628. It is the instincts..."
assert flat_toc[1]["chapter_title"] == "Peirce: CP 1.628"
def test_flatten_empty_toc(self) -> None:
"""Test flattening an empty TOC."""
flat_toc = flatten_toc_with_paths([], {})
assert flat_toc == []
def test_flatten_nested_hierarchy(self) -> None:
"""Test flattening a deeply nested hierarchy."""
toc: List[Dict[str, Any]] = [
{
"title": "Part I",
"level": 1,
"children": [
{
"title": "Chapter 1",
"level": 2,
"children": [
{
"title": "Section 1.1",
"level": 3,
"children": [],
},
],
},
],
},
]
flat_toc = flatten_toc_with_paths(toc, {})
assert len(flat_toc) == 3
assert flat_toc[2]["full_path"] == "Part I > Chapter 1 > Section 1.1"
assert flat_toc[2]["parent_titles"] == ["Part I", "Chapter 1"]
assert flat_toc[2]["chapter_title"] == "Part I"
def test_flatten_stephanus_pagination(self) -> None:
"""Test flattening TOC with Stephanus pagination (e.g., Plato)."""
toc: List[Dict[str, Any]] = [
{
"title": "Ménon 80a",
"level": 1,
"children": [
{
"title": "80a. MÉNON : Socrate...",
"level": 2,
"children": [],
},
],
},
]
flat_toc = flatten_toc_with_paths(toc, {})
assert flat_toc[0]["canonical_ref"] == "Ménon 80a"
assert flat_toc[1]["canonical_ref"] == "Ménon 80a"
class TestExtractParagraphNumber:
"""Tests for extract_paragraph_number function."""
def test_extract_standard_paragraph(self) -> None:
"""Test extracting standard paragraph number."""
assert extract_paragraph_number("628. It is the instincts...") == "628"
assert extract_paragraph_number("42. On the nature of...") == "42"
def test_extract_stephanus_paragraph(self) -> None:
"""Test extracting Stephanus-style paragraph (with letter)."""
assert extract_paragraph_number("80a. SOCRATE: Sais-tu...") == "80a"
assert extract_paragraph_number("215c. Text here") == "215c"
def test_extract_section_symbol(self) -> None:
"""Test extracting paragraph with section symbol."""
assert extract_paragraph_number("§42 On the nature of...") == "42"
assert extract_paragraph_number("§ 628 Text") == "628"
def test_extract_cp_reference(self) -> None:
"""Test extracting paragraph from CP reference."""
assert extract_paragraph_number("CP 5.628. Text") == "628"
assert extract_paragraph_number("CP 1.42. My philosophy") == "42"
def test_extract_no_paragraph(self) -> None:
"""Test extraction when no paragraph number present."""
assert extract_paragraph_number("Introduction") is None
assert extract_paragraph_number("") is None
assert extract_paragraph_number("Chapter One") is None
class TestFindMatchingTocEntry:
"""Tests for find_matching_toc_entry function."""
def setup_method(self) -> None:
"""Set up test fixtures."""
self.flat_toc: List[FlatTOCEntryEnriched] = [
{
"title": "Peirce: CP 1.628",
"level": 1,
"full_path": "Peirce: CP 1.628",
"chapter_title": "Peirce: CP 1.628",
"canonical_ref": "CP 1.628",
"parent_titles": [],
"index_in_flat_list": 0,
},
{
"title": "628. It is the instincts...",
"level": 2,
"full_path": "Peirce: CP 1.628 > 628. It is the instincts...",
"chapter_title": "Peirce: CP 1.628",
"canonical_ref": "CP 1.628",
"parent_titles": ["Peirce: CP 1.628"],
"index_in_flat_list": 1,
},
{
"title": "Peirce: CP 1.42",
"level": 1,
"full_path": "Peirce: CP 1.42",
"chapter_title": "Peirce: CP 1.42",
"canonical_ref": "CP 1.42",
"parent_titles": [],
"index_in_flat_list": 2,
},
{
"title": "42. My philosophy resuscitates Hegel",
"level": 2,
"full_path": "Peirce: CP 1.42 > 42. My philosophy resuscitates Hegel",
"chapter_title": "Peirce: CP 1.42",
"canonical_ref": "CP 1.42",
"parent_titles": ["Peirce: CP 1.42"],
"index_in_flat_list": 3,
},
]
def test_exact_title_match(self) -> None:
"""Test exact title matching."""
chunk: Dict[str, Any] = {
"section": "628. It is the instincts...",
"order_index": 0,
}
entry = find_matching_toc_entry(chunk, self.flat_toc)
assert entry is not None
assert entry["title"] == "628. It is the instincts..."
assert entry["canonical_ref"] == "CP 1.628"
def test_paragraph_number_match(self) -> None:
"""Test paragraph number matching with text similarity."""
chunk: Dict[str, Any] = {
"section": "42. My philosophy resuscitates Hegel",
"order_index": 1,
}
entry = find_matching_toc_entry(chunk, self.flat_toc)
assert entry is not None
assert entry["canonical_ref"] == "CP 1.42"
def test_no_match_empty_toc(self) -> None:
"""Test behavior with empty TOC."""
chunk: Dict[str, Any] = {"section": "Some section", "order_index": 0}
entry = find_matching_toc_entry(chunk, [])
assert entry is None
def test_no_match_empty_section(self) -> None:
"""Test behavior with chunk having no section."""
chunk: Dict[str, Any] = {"text": "Some text", "order_index": 0}
entry = find_matching_toc_entry(chunk, self.flat_toc)
# Without section field, function returns None (doesn't guess)
# This is correct behavior - we don't want to match without text
assert entry is None
def test_proximity_match_fallback(self) -> None:
"""Test proximity matching when no text match found."""
chunk: Dict[str, Any] = {
"section": "Unknown section",
"order_index": 1,
}
entry = find_matching_toc_entry(chunk, self.flat_toc)
# Should return entry with closest index_in_flat_list to order_index=1
assert entry is not None
assert entry["index_in_flat_list"] == 1
class TestEnrichChunksWithToc:
"""Tests for enrich_chunks_with_toc function."""
def test_enrich_chunks_no_toc(self) -> None:
"""Test graceful fallback when TOC is absent."""
chunks: List[Dict[str, Any]] = [
{"text": "test", "section": "Intro"},
]
enriched = enrich_chunks_with_toc(chunks, [], {})
assert enriched == chunks # Unchanged
def test_enrich_chunks_with_match(self) -> None:
"""Test enrichment with successful TOC matching."""
chunks: List[Dict[str, Any]] = [
{"text": "test", "section": "628. It is the instincts..."},
]
toc: List[Dict[str, Any]] = [
{
"title": "Peirce: CP 1.628",
"level": 1,
"children": [
{
"title": "628. It is the instincts...",
"level": 2,
"children": [],
},
],
},
]
enriched = enrich_chunks_with_toc(chunks, toc, {})
assert len(enriched) == 1
assert "Peirce: CP 1.628" in enriched[0]["sectionPath"]
assert enriched[0]["chapterTitle"] == "Peirce: CP 1.628"
assert enriched[0]["canonical_reference"] == "CP 1.628"
def test_enrich_chunks_partial_match(self) -> None:
"""Test enrichment when only some chunks match."""
chunks: List[Dict[str, Any]] = [
{"text": "test1", "section": "628. It is the instincts...", "order_index": 0},
{"text": "test2", "section": "Unknown section", "order_index": 1},
]
toc: List[Dict[str, Any]] = [
{
"title": "Peirce: CP 1.628",
"level": 1,
"children": [
{
"title": "628. It is the instincts...",
"level": 2,
"children": [],
},
],
},
]
enriched = enrich_chunks_with_toc(chunks, toc, {})
# First chunk should be enriched
assert "Peirce: CP 1.628" in enriched[0]["sectionPath"]
assert enriched[0]["canonical_reference"] == "CP 1.628"
# Second chunk doesn't match, so uses proximity fallback
# Proximity matching will assign it to the closest TOC entry
assert "sectionPath" in enriched[1] # Should get proximity match
def test_enrich_chunks_preserves_original_fields(self) -> None:
"""Test that enrichment preserves other chunk fields."""
chunks: List[Dict[str, Any]] = [
{
"text": "test",
"section": "628. It is the instincts...",
"order_index": 42,
"keywords": ["test"],
},
]
toc: List[Dict[str, Any]] = [
{
"title": "Peirce: CP 1.628",
"level": 1,
"children": [
{
"title": "628. It is the instincts...",
"level": 2,
"children": [],
},
],
},
]
enriched = enrich_chunks_with_toc(chunks, toc, {})
# Original fields should be preserved
assert enriched[0]["text"] == "test"
assert enriched[0]["order_index"] == 42
assert enriched[0]["keywords"] == ["test"]
# New fields should be added
assert "canonical_reference" in enriched[0]
def test_enrich_chunks_empty_chunks_list(self) -> None:
"""Test behavior with empty chunks list."""
toc: List[Dict[str, Any]] = [
{"title": "Chapter 1", "level": 1, "children": []},
]
enriched = enrich_chunks_with_toc([], toc, {})
assert enriched == []
# Integration test combining multiple functions
class TestTocEnricherIntegration:
"""Integration tests for the complete enrichment pipeline."""
def test_full_peirce_enrichment_pipeline(self) -> None:
"""Test complete enrichment pipeline with Peirce data."""
# Realistic Peirce TOC structure
toc: List[Dict[str, Any]] = [
{
"title": "Peirce: CP 6.628",
"level": 1,
"children": [
{
"title": "628. I think we need to reflect...",
"level": 2,
"children": [],
},
{
"title": "629. The next point is...",
"level": 2,
"children": [],
},
],
},
]
# Realistic chunks from pdf_pipeline
chunks: List[Dict[str, Any]] = [
{
"text": "I think we need to reflect on the nature of signs...",
"section": "628. I think we need to reflect...",
"order_index": 0,
},
{
"text": "The next point is about interpretation...",
"section": "629. The next point is...",
"order_index": 1,
},
]
# Run enrichment
enriched = enrich_chunks_with_toc(chunks, toc, {})
# Verify results
assert len(enriched) == 2
# First chunk
assert enriched[0]["sectionPath"] == "Peirce: CP 6.628 > 628. I think we need to reflect..."
assert enriched[0]["chapterTitle"] == "Peirce: CP 6.628"
assert enriched[0]["canonical_reference"] == "CP 6.628"
# Second chunk
assert enriched[1]["sectionPath"] == "Peirce: CP 6.628 > 629. The next point is..."
assert enriched[1]["chapterTitle"] == "Peirce: CP 6.628"
assert enriched[1]["canonical_reference"] == "CP 6.628"