# Commit notes (misplaced into this source file during extraction):
# - Add complete Library RAG application (Flask + MCP server)
# - PDF processing pipeline with OCR and LLM extraction
# - Weaviate vector database integration (BGE-M3 embeddings)
# - Flask web interface with search and document management
# - MCP server for Claude Desktop integration
# - Comprehensive test suite (134 tests)
# - Clean up root directory; remove obsolete documentation, backup and
#   temporary files; update autonomous agent configuration and prompts
"""TOC Enrichment Module for Chunk Metadata Enhancement.
|
|
|
|
This module provides functions to enrich chunk metadata with hierarchical
|
|
information from the table of contents (TOC). It matches chunks to their
|
|
corresponding TOC entries and extracts:
|
|
- Full hierarchical paths (e.g., "Peirce: CP 1.628 > 628. It is...")
|
|
- Chapter titles
|
|
- Canonical academic references (e.g., "CP 1.628", "Ménon 80a")
|
|
|
|
The enrichment happens before Weaviate ingestion to ensure chunks have
|
|
complete metadata for rigorous academic citation.
|
|
|
|
Usage:
|
|
>>> from utils.toc_enricher import enrich_chunks_with_toc
|
|
>>> enriched_chunks = enrich_chunks_with_toc(chunks, toc, hierarchy)
|
|
|
|
See Also:
|
|
- utils.types: FlatTOCEntryEnriched type definition
|
|
- utils.weaviate_ingest: Integration point for enrichment
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from .types import FlatTOCEntryEnriched
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# Canonical-reference patterns, compiled once at import time.
# CP references look like "CP 1.628"; Stephanus-style pagination like
# "Ménon 80a" (word, space, digits, trailing lowercase letter).
_CP_REF_RE = re.compile(r'CP\s+(\d+\.\d+)')
_STEPHANUS_RE = re.compile(r'(\w+\s+\d+[a-z])')


def _extract_canonical_from_title(title: str) -> Optional[str]:
    """Return the canonical academic reference found in *title*, or None.

    CP references ("CP 1.628") take precedence over Stephanus-style
    matches ("Ménon 80a") when both patterns occur in the same title.
    """
    cp_match = _CP_REF_RE.search(title)
    if cp_match:
        return f"CP {cp_match.group(1)}"
    stephanus_match = _STEPHANUS_RE.search(title)
    if stephanus_match:
        return stephanus_match.group(1)
    return None


def flatten_toc_with_paths(
    toc: List[Dict[str, Any]],
    hierarchy: Dict[str, Any],
) -> List[FlatTOCEntryEnriched]:
    """Flatten hierarchical or flat TOC and build full paths with metadata.

    Handles both hierarchical TOCs (with 'children' keys) and flat TOCs
    (where parent-child relationships are inferred from 'level' field).

    Traverses the TOC structure and creates enriched flat entries with:
    - Full hierarchical path (e.g., "Peirce: CP 1.628 > 628. It is...")
    - Canonical reference extraction (e.g., "CP 1.628")
    - Chapter title tracking (first level 1 ancestor)
    - Parent title list for context

    Args:
        toc: TOC structure with 'title' and 'level' fields, optionally 'children'
        hierarchy: Document hierarchy (currently unused, reserved for future)

    Returns:
        List of enriched flat TOC entries with full metadata.

    Example:
        >>> toc = [
        ...     {"title": "Peirce: CP 1.628", "level": 1},
        ...     {"title": "628. It is the instincts...", "level": 2}
        ... ]
        >>> flat = flatten_toc_with_paths(toc, {})
        >>> flat[1]["full_path"]
        'Peirce: CP 1.628 > 628. It is the instincts...'
        >>> flat[1]["canonical_ref"]
        'CP 1.628'
    """
    flat_toc: List[FlatTOCEntryEnriched] = []

    # A TOC is hierarchical when at least one entry carries a 'children' key;
    # otherwise parent/child relations are inferred from each entry's level.
    is_hierarchical = any("children" in entry for entry in toc if entry)

    if is_hierarchical:
        # Recursive approach for hierarchical TOCs.
        def traverse(
            entries: List[Dict[str, Any]],
            parent_titles: List[str],
            current_chapter: str,
            current_canonical: Optional[str],
        ) -> None:
            """Recursively traverse TOC entries and build the flat list."""
            for entry in entries:
                title = entry.get("title", "")
                level = entry.get("level", 0)
                children = entry.get("children", [])

                # Full path = ancestor titles plus the current title.
                full_path = " > ".join(parent_titles + [title])

                # A reference found in this title overrides the inherited one.
                canonical_ref = _extract_canonical_from_title(title) or current_canonical

                # Level-1 entries start a new chapter.
                chapter_title = title if level == 1 else current_chapter

                enriched_entry: FlatTOCEntryEnriched = {
                    "title": title,
                    "level": level,
                    "full_path": full_path,
                    "chapter_title": chapter_title,
                    "canonical_ref": canonical_ref,
                    "parent_titles": parent_titles.copy(),
                    "index_in_flat_list": len(flat_toc),
                }
                flat_toc.append(enriched_entry)

                # Descend, carrying the updated chapter and reference context.
                if children:
                    traverse(
                        children,
                        parent_titles + [title],
                        chapter_title,
                        canonical_ref,
                    )

        traverse(toc, [], "", None)
    else:
        # Iterative approach for flat TOCs: an explicit stack of ancestors,
        # maintained from each entry's 'level' field.
        parent_stack: List[Dict[str, Any]] = []  # entries: level/title/canonical_ref
        current_chapter = ""
        current_canonical: Optional[str] = None

        for entry in toc:
            title = entry.get("title", "")
            level = entry.get("level", 1)

            # Pop ancestors at the same or deeper level than this entry.
            while parent_stack and parent_stack[-1]["level"] >= level:
                parent_stack.pop()

            parent_titles = [p["title"] for p in parent_stack]
            full_path = " > ".join(parent_titles + [title])

            found_ref = _extract_canonical_from_title(title)
            if found_ref:
                current_canonical = found_ref
            elif level == 1:
                # A new level-1 section without its own reference invalidates
                # any reference carried over from the previous section.
                current_canonical = None

            # Inherit the nearest ancestor's reference when none is active.
            if not current_canonical and parent_stack:
                current_canonical = parent_stack[-1].get("canonical_ref")

            # Level-1 entries start a new chapter.
            if level == 1:
                current_chapter = title

            enriched_entry = {
                "title": title,
                "level": level,
                "full_path": full_path,
                "chapter_title": current_chapter,
                "canonical_ref": current_canonical,
                "parent_titles": parent_titles.copy(),
                "index_in_flat_list": len(flat_toc),
            }
            flat_toc.append(enriched_entry)

            # This entry becomes a potential parent for subsequent entries.
            parent_stack.append({
                "level": level,
                "title": title,
                "canonical_ref": current_canonical,
            })

    return flat_toc

# Paragraph-number patterns, tried in order. Each anchors at the start of
# the text and accepts either trailing whitespace or end-of-string, so a
# bare "628." or "§42" (no following text) is also recognized.
_PARA_STANDARD_RE = re.compile(r'^(\d+[a-z]?)\.(?:\s|$)')   # "628. Text" / "80a. Text"
_PARA_SECTION_RE = re.compile(r'^§\s*(\d+[a-z]?)(?:\s|$)')  # "§42 Text"
_PARA_CP_RE = re.compile(r'^CP\s+\d+\.(\d+)\.(?:\s|$)')     # "CP 5.628. Text"


def extract_paragraph_number(section_text: str) -> Optional[str]:
    """Extract paragraph number from section text.

    Handles various academic paragraph numbering formats:
    - "628. Text..." → "628"
    - "§42 Text..." → "42"
    - "80a. Text..." → "80a" (Stephanus pagination)
    - "CP 5.628. Text..." → "628"

    Args:
        section_text: Section title or path text

    Returns:
        Extracted paragraph number or None if not found.

    Example:
        >>> extract_paragraph_number("628. It is the instincts...")
        '628'
        >>> extract_paragraph_number("§42 On the nature of...")
        '42'
        >>> extract_paragraph_number("80a. SOCRATE: Sais-tu...")
        '80a'
    """
    if not section_text:
        return None

    # Patterns are ordered from most to least common format.
    for pattern in (_PARA_STANDARD_RE, _PARA_SECTION_RE, _PARA_CP_RE):
        match = pattern.match(section_text)
        if match:
            return match.group(1)

    return None

def find_matching_toc_entry(
    chunk: Dict[str, Any],
    flat_toc: List[FlatTOCEntryEnriched],
) -> Optional[FlatTOCEntryEnriched]:
    """Find matching TOC entry for a chunk using multi-strategy matching.

    Matching strategies (in priority order):
    1. **Exact text match**: chunk.section == toc.title
    2. **Paragraph number match**: Extract paragraph number from both and compare
    3. **Proximity match**: Use order_index to find nearest TOC entry

    Args:
        chunk: Chunk dict with 'section', 'sectionPath', 'order_index' fields
        flat_toc: Flattened TOC with enriched metadata

    Returns:
        Best matching TOC entry or None if no match found.

    Example:
        >>> chunk = {"section": "628. It is the instincts...", "order_index": 42}
        >>> toc_entry = find_matching_toc_entry(chunk, flat_toc)
        >>> toc_entry["canonical_ref"]
        'CP 1.628'
    """
    if not flat_toc:
        return None

    # Prefer 'section'; fall back to 'sectionPath' when absent.
    chunk_section = chunk.get("section", chunk.get("sectionPath", ""))
    if not chunk_section:
        return None

    # Strategy 1: exact title match.
    for entry in flat_toc:
        if entry["title"] == chunk_section:
            return entry

    # Strategy 2: paragraph-number match, restricted to level-2 entries
    # (level 2 holds the actual content sections).
    chunk_para = extract_paragraph_number(chunk_section)
    if chunk_para:
        for entry in flat_toc:
            if entry["level"] != 2:
                continue
            if extract_paragraph_number(entry["title"]) != chunk_para:
                continue
            # Disambiguate duplicated paragraph numbers by requiring the
            # chunk's first significant word (>3 chars) in the TOC title.
            chunk_words = [w for w in chunk_section.split() if len(w) > 3]
            entry_words = [w for w in entry["title"].split() if len(w) > 3]
            if chunk_words and entry_words:
                if chunk_words[0].lower() in entry["title"].lower():
                    return entry
            else:
                # Not enough text to compare; accept the number match.
                return entry

    # Strategy 3: proximity fallback. Heuristic: chunk order roughly follows
    # TOC order, so pick the entry with the nearest flat-list index.
    chunk_order = chunk.get("order_index")
    if chunk_order is not None:
        return min(
            flat_toc,
            key=lambda e: abs(e["index_in_flat_list"] - chunk_order),
        )

    return None

def enrich_chunks_with_toc(
    chunks: List[Dict[str, Any]],
    toc: List[Dict[str, Any]],
    hierarchy: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """Enrich chunks with hierarchical metadata from TOC.

    Main orchestration function that:
    1. Checks if TOC is available (guard clause)
    2. Flattens TOC once for efficiency
    3. Matches each chunk to its TOC entry
    4. Updates chunk metadata: sectionPath, chapterTitle, canonical_reference

    Args:
        chunks: List of chunk dicts from pdf_pipeline
        toc: Hierarchical TOC structure (may be empty)
        hierarchy: Document hierarchy dict (may be empty)

    Returns:
        List of chunks with enriched metadata (same objects, modified in place).
        If TOC is empty, returns chunks unchanged (no regression).

    Example:
        >>> chunks = [{"text": "...", "section": "628. It is..."}]
        >>> toc = [
        ...     {"title": "Peirce: CP 1.628", "level": 1, "children": [
        ...         {"title": "628. It is...", "level": 2, "children": []}
        ...     ]}
        ... ]
        >>> enriched = enrich_chunks_with_toc(chunks, toc, {})
        >>> enriched[0]["sectionPath"]
        'Peirce: CP 1.628 > 628. It is the instincts...'
        >>> enriched[0]["chapterTitle"]
        'Peirce: CP 1.628'
        >>> enriched[0]["canonical_reference"]
        'CP 1.628'
    """
    # Guard: without a TOC there is nothing to enrich (graceful fallback).
    if not toc:
        logger.info("No TOC available, skipping chunk enrichment")
        return chunks

    logger.info("Enriching %d chunks with TOC metadata...", len(chunks))

    # Flatten the TOC once so per-chunk matching stays cheap.
    try:
        flat_toc = flatten_toc_with_paths(toc, hierarchy)
        logger.info("Flattened TOC: %d entries", len(flat_toc))
    except Exception as e:
        # Deliberate best-effort: enrichment failure must not block ingestion.
        logger.error("Failed to flatten TOC: %s", e)
        return chunks

    # Match each chunk to a TOC entry and copy its metadata over.
    enriched_count = 0
    for chunk in chunks:
        matching_entry = find_matching_toc_entry(chunk, flat_toc)
        if matching_entry is None:
            continue

        # Replace the flat section with the full hierarchical path.
        chunk["sectionPath"] = matching_entry["full_path"]
        chunk["chapterTitle"] = matching_entry["chapter_title"]
        # Only set canonicalReference when the TOC actually provided one.
        if matching_entry["canonical_ref"]:
            chunk["canonicalReference"] = matching_entry["canonical_ref"]
        enriched_count += 1

    if chunks:
        logger.info(
            "Enriched %d/%d chunks (%.1f%%)",
            enriched_count,
            len(chunks),
            100 * enriched_count / len(chunks),
        )
    else:
        logger.info("No chunks to enrich")

    return chunks
