Add Library RAG project and cleanup root directory

- Add complete Library RAG application (Flask + MCP server)
- PDF processing pipeline with OCR and LLM extraction
- Weaviate vector database integration (BGE-M3 embeddings)
- Flask web interface with search and document management
- MCP server for Claude Desktop integration
- Comprehensive test suite (134 tests)
- Clean up root directory
- Remove obsolete documentation files
- Remove backup and temporary files
- Update autonomous agent configuration
- Update prompts
- Enhance initializer bis prompt with better instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-30 11:57:12 +01:00
parent 48470236da
commit d2f7165120
84 changed files with 26517 additions and 2 deletions
--- a/generations/library_rag/tests/utils/test_toc_enricher.py
+++ b/generations/library_rag/tests/utils/test_toc_enricher.py
@@ -0,0 +1,429 @@
+"""Unit tests for TOC enrichment module.
+
+Tests the enrichment of chunk metadata with hierarchical information
+from the table of contents (TOC).
+"""
+
+from typing import Any, Dict, List
+
+import pytest
+
+from utils.toc_enricher import (
+    enrich_chunks_with_toc,
+    extract_paragraph_number,
+    find_matching_toc_entry,
+    flatten_toc_with_paths,
+)
+from utils.types import FlatTOCEntryEnriched
+
+
+class TestFlattenTocWithPaths:
+    """Tests for flatten_toc_with_paths function."""
+
+    def test_flatten_simple_toc(self) -> None:
+        """Test flattening a simple two-level TOC."""
+        toc: List[Dict[str, Any]] = [
+            {
+                "title": "Chapter 1",
+                "level": 1,
+                "children": [
+                    {"title": "Section 1.1", "level": 2, "children": []},
+                    {"title": "Section 1.2", "level": 2, "children": []},
+                ],
+            },
+        ]
+
+        flat_toc = flatten_toc_with_paths(toc, {})
+
+        assert len(flat_toc) == 3
+        assert flat_toc[0]["title"] == "Chapter 1"
+        assert flat_toc[0]["level"] == 1
+        assert flat_toc[0]["full_path"] == "Chapter 1"
+        assert flat_toc[1]["title"] == "Section 1.1"
+        assert flat_toc[1]["full_path"] == "Chapter 1 > Section 1.1"
+        assert flat_toc[1]["chapter_title"] == "Chapter 1"
+
+    def test_flatten_peirce_toc_with_cp_references(self) -> None:
+        """Test flattening Peirce TOC with CP references."""
+        toc: List[Dict[str, Any]] = [
+            {
+                "title": "Peirce: CP 1.628",
+                "level": 1,
+                "children": [
+                    {
+                        "title": "628. It is the instincts...",
+                        "level": 2,
+                        "children": [],
+                    },
+                ],
+            },
+        ]
+
+        flat_toc = flatten_toc_with_paths(toc, {})
+
+        assert len(flat_toc) == 2
+        # Level 1 entry should extract CP reference
+        assert flat_toc[0]["canonical_ref"] == "CP 1.628"
+        # Level 2 entry should inherit CP reference
+        assert flat_toc[1]["canonical_ref"] == "CP 1.628"
+        assert flat_toc[1]["full_path"] == "Peirce: CP 1.628 > 628. It is the instincts..."
+        assert flat_toc[1]["chapter_title"] == "Peirce: CP 1.628"
+
+    def test_flatten_empty_toc(self) -> None:
+        """Test flattening an empty TOC."""
+        flat_toc = flatten_toc_with_paths([], {})
+        assert flat_toc == []
+
+    def test_flatten_nested_hierarchy(self) -> None:
+        """Test flattening a deeply nested hierarchy."""
+        toc: List[Dict[str, Any]] = [
+            {
+                "title": "Part I",
+                "level": 1,
+                "children": [
+                    {
+                        "title": "Chapter 1",
+                        "level": 2,
+                        "children": [
+                            {
+                                "title": "Section 1.1",
+                                "level": 3,
+                                "children": [],
+                            },
+                        ],
+                    },
+                ],
+            },
+        ]
+
+        flat_toc = flatten_toc_with_paths(toc, {})
+
+        assert len(flat_toc) == 3
+        assert flat_toc[2]["full_path"] == "Part I > Chapter 1 > Section 1.1"
+        assert flat_toc[2]["parent_titles"] == ["Part I", "Chapter 1"]
+        assert flat_toc[2]["chapter_title"] == "Part I"
+
+    def test_flatten_stephanus_pagination(self) -> None:
+        """Test flattening TOC with Stephanus pagination (e.g., Plato)."""
+        toc: List[Dict[str, Any]] = [
+            {
+                "title": "Ménon 80a",
+                "level": 1,
+                "children": [
+                    {
+                        "title": "80a. MÉNON : Socrate...",
+                        "level": 2,
+                        "children": [],
+                    },
+                ],
+            },
+        ]
+
+        flat_toc = flatten_toc_with_paths(toc, {})
+
+        assert flat_toc[0]["canonical_ref"] == "Ménon 80a"
+        assert flat_toc[1]["canonical_ref"] == "Ménon 80a"
+
+
+class TestExtractParagraphNumber:
+    """Tests for extract_paragraph_number function."""
+
+    def test_extract_standard_paragraph(self) -> None:
+        """Test extracting standard paragraph number."""
+        assert extract_paragraph_number("628. It is the instincts...") == "628"
+        assert extract_paragraph_number("42. On the nature of...") == "42"
+
+    def test_extract_stephanus_paragraph(self) -> None:
+        """Test extracting Stephanus-style paragraph (with letter)."""
+        assert extract_paragraph_number("80a. SOCRATE: Sais-tu...") == "80a"
+        assert extract_paragraph_number("215c. Text here") == "215c"
+
+    def test_extract_section_symbol(self) -> None:
+        """Test extracting paragraph with section symbol."""
+        assert extract_paragraph_number("§42 On the nature of...") == "42"
+        assert extract_paragraph_number("§ 628 Text") == "628"
+
+    def test_extract_cp_reference(self) -> None:
+        """Test extracting paragraph from CP reference."""
+        assert extract_paragraph_number("CP 5.628. Text") == "628"
+        assert extract_paragraph_number("CP 1.42. My philosophy") == "42"
+
+    def test_extract_no_paragraph(self) -> None:
+        """Test extraction when no paragraph number present."""
+        assert extract_paragraph_number("Introduction") is None
+        assert extract_paragraph_number("") is None
+        assert extract_paragraph_number("Chapter One") is None
+
+
+class TestFindMatchingTocEntry:
+    """Tests for find_matching_toc_entry function."""
+
+    def setup_method(self) -> None:
+        """Set up test fixtures."""
+        self.flat_toc: List[FlatTOCEntryEnriched] = [
+            {
+                "title": "Peirce: CP 1.628",
+                "level": 1,
+                "full_path": "Peirce: CP 1.628",
+                "chapter_title": "Peirce: CP 1.628",
+                "canonical_ref": "CP 1.628",
+                "parent_titles": [],
+                "index_in_flat_list": 0,
+            },
+            {
+                "title": "628. It is the instincts...",
+                "level": 2,
+                "full_path": "Peirce: CP 1.628 > 628. It is the instincts...",
+                "chapter_title": "Peirce: CP 1.628",
+                "canonical_ref": "CP 1.628",
+                "parent_titles": ["Peirce: CP 1.628"],
+                "index_in_flat_list": 1,
+            },
+            {
+                "title": "Peirce: CP 1.42",
+                "level": 1,
+                "full_path": "Peirce: CP 1.42",
+                "chapter_title": "Peirce: CP 1.42",
+                "canonical_ref": "CP 1.42",
+                "parent_titles": [],
+                "index_in_flat_list": 2,
+            },
+            {
+                "title": "42. My philosophy resuscitates Hegel",
+                "level": 2,
+                "full_path": "Peirce: CP 1.42 > 42. My philosophy resuscitates Hegel",
+                "chapter_title": "Peirce: CP 1.42",
+                "canonical_ref": "CP 1.42",
+                "parent_titles": ["Peirce: CP 1.42"],
+                "index_in_flat_list": 3,
+            },
+        ]
+
+    def test_exact_title_match(self) -> None:
+        """Test exact title matching."""
+        chunk: Dict[str, Any] = {
+            "section": "628. It is the instincts...",
+            "order_index": 0,
+        }
+
+        entry = find_matching_toc_entry(chunk, self.flat_toc)
+
+        assert entry is not None
+        assert entry["title"] == "628. It is the instincts..."
+        assert entry["canonical_ref"] == "CP 1.628"
+
+    def test_paragraph_number_match(self) -> None:
+        """Test paragraph number matching with text similarity."""
+        chunk: Dict[str, Any] = {
+            "section": "42. My philosophy resuscitates Hegel",
+            "order_index": 1,
+        }
+
+        entry = find_matching_toc_entry(chunk, self.flat_toc)
+
+        assert entry is not None
+        assert entry["canonical_ref"] == "CP 1.42"
+
+    def test_no_match_empty_toc(self) -> None:
+        """Test behavior with empty TOC."""
+        chunk: Dict[str, Any] = {"section": "Some section", "order_index": 0}
+
+        entry = find_matching_toc_entry(chunk, [])
+
+        assert entry is None
+
+    def test_no_match_empty_section(self) -> None:
+        """Test behavior with chunk having no section."""
+        chunk: Dict[str, Any] = {"text": "Some text", "order_index": 0}
+
+        entry = find_matching_toc_entry(chunk, self.flat_toc)
+
+        # Without section field, function returns None (doesn't guess)
+        # This is correct behavior - we don't want to match without text
+        assert entry is None
+
+    def test_proximity_match_fallback(self) -> None:
+        """Test proximity matching when no text match found."""
+        chunk: Dict[str, Any] = {
+            "section": "Unknown section",
+            "order_index": 1,
+        }
+
+        entry = find_matching_toc_entry(chunk, self.flat_toc)
+
+        # Should return entry with closest index_in_flat_list to order_index=1
+        assert entry is not None
+        assert entry["index_in_flat_list"] == 1
+
+
+class TestEnrichChunksWithToc:
+    """Tests for enrich_chunks_with_toc function."""
+
+    def test_enrich_chunks_no_toc(self) -> None:
+        """Test graceful fallback when TOC is absent."""
+        chunks: List[Dict[str, Any]] = [
+            {"text": "test", "section": "Intro"},
+        ]
+
+        enriched = enrich_chunks_with_toc(chunks, [], {})
+
+        assert enriched == chunks  # Unchanged
+
+    def test_enrich_chunks_with_match(self) -> None:
+        """Test enrichment with successful TOC matching."""
+        chunks: List[Dict[str, Any]] = [
+            {"text": "test", "section": "628. It is the instincts..."},
+        ]
+
+        toc: List[Dict[str, Any]] = [
+            {
+                "title": "Peirce: CP 1.628",
+                "level": 1,
+                "children": [
+                    {
+                        "title": "628. It is the instincts...",
+                        "level": 2,
+                        "children": [],
+                    },
+                ],
+            },
+        ]
+
+        enriched = enrich_chunks_with_toc(chunks, toc, {})
+
+        assert len(enriched) == 1
+        assert "Peirce: CP 1.628" in enriched[0]["sectionPath"]
+        assert enriched[0]["chapterTitle"] == "Peirce: CP 1.628"
+        assert enriched[0]["canonical_reference"] == "CP 1.628"
+
+    def test_enrich_chunks_partial_match(self) -> None:
+        """Test enrichment when only some chunks match."""
+        chunks: List[Dict[str, Any]] = [
+            {"text": "test1", "section": "628. It is the instincts...", "order_index": 0},
+            {"text": "test2", "section": "Unknown section", "order_index": 1},
+        ]
+
+        toc: List[Dict[str, Any]] = [
+            {
+                "title": "Peirce: CP 1.628",
+                "level": 1,
+                "children": [
+                    {
+                        "title": "628. It is the instincts...",
+                        "level": 2,
+                        "children": [],
+                    },
+                ],
+            },
+        ]
+
+        enriched = enrich_chunks_with_toc(chunks, toc, {})
+
+        # First chunk should be enriched
+        assert "Peirce: CP 1.628" in enriched[0]["sectionPath"]
+        assert enriched[0]["canonical_reference"] == "CP 1.628"
+
+        # Second chunk doesn't match, so uses proximity fallback
+        # Proximity matching will assign it to the closest TOC entry
+        assert "sectionPath" in enriched[1]  # Should get proximity match
+
+    def test_enrich_chunks_preserves_original_fields(self) -> None:
+        """Test that enrichment preserves other chunk fields."""
+        chunks: List[Dict[str, Any]] = [
+            {
+                "text": "test",
+                "section": "628. It is the instincts...",
+                "order_index": 42,
+                "keywords": ["test"],
+            },
+        ]
+
+        toc: List[Dict[str, Any]] = [
+            {
+                "title": "Peirce: CP 1.628",
+                "level": 1,
+                "children": [
+                    {
+                        "title": "628. It is the instincts...",
+                        "level": 2,
+                        "children": [],
+                    },
+                ],
+            },
+        ]
+
+        enriched = enrich_chunks_with_toc(chunks, toc, {})
+
+        # Original fields should be preserved
+        assert enriched[0]["text"] == "test"
+        assert enriched[0]["order_index"] == 42
+        assert enriched[0]["keywords"] == ["test"]
+        # New fields should be added
+        assert "canonical_reference" in enriched[0]
+
+    def test_enrich_chunks_empty_chunks_list(self) -> None:
+        """Test behavior with empty chunks list."""
+        toc: List[Dict[str, Any]] = [
+            {"title": "Chapter 1", "level": 1, "children": []},
+        ]
+
+        enriched = enrich_chunks_with_toc([], toc, {})
+
+        assert enriched == []
+
+
+# Integration test combining multiple functions
+class TestTocEnricherIntegration:
+    """Integration tests for the complete enrichment pipeline."""
+
+    def test_full_peirce_enrichment_pipeline(self) -> None:
+        """Test complete enrichment pipeline with Peirce data."""
+        # Realistic Peirce TOC structure
+        toc: List[Dict[str, Any]] = [
+            {
+                "title": "Peirce: CP 6.628",
+                "level": 1,
+                "children": [
+                    {
+                        "title": "628. I think we need to reflect...",
+                        "level": 2,
+                        "children": [],
+                    },
+                    {
+                        "title": "629. The next point is...",
+                        "level": 2,
+                        "children": [],
+                    },
+                ],
+            },
+        ]
+
+        # Realistic chunks from pdf_pipeline
+        chunks: List[Dict[str, Any]] = [
+            {
+                "text": "I think we need to reflect on the nature of signs...",
+                "section": "628. I think we need to reflect...",
+                "order_index": 0,
+            },
+            {
+                "text": "The next point is about interpretation...",
+                "section": "629. The next point is...",
+                "order_index": 1,
+            },
+        ]
+
+        # Run enrichment
+        enriched = enrich_chunks_with_toc(chunks, toc, {})
+
+        # Verify results
+        assert len(enriched) == 2
+
+        # First chunk
+        assert enriched[0]["sectionPath"] == "Peirce: CP 6.628 > 628. I think we need to reflect..."
+        assert enriched[0]["chapterTitle"] == "Peirce: CP 6.628"
+        assert enriched[0]["canonical_reference"] == "CP 6.628"
+
+        # Second chunk
+        assert enriched[1]["sectionPath"] == "Peirce: CP 6.628 > 629. The next point is..."
+        assert enriched[1]["chapterTitle"] == "Peirce: CP 6.628"
+        assert enriched[1]["canonical_reference"] == "CP 6.628"