fix: Use prefix matching for sectionPath to find chunks in sections

Problem:
- Summary.sectionPath: "Peirce: CP 2.504"
- Chunk.sectionPath: "Peirce: CP 2.504 > 504. Text..."
- Filter.by_property("sectionPath").equal() found 0 matches (the chunk path extends the summary path, so no exact match exists)

Solution:
- Single semantic query (limit × number of sections) to fetch candidate chunks for all sections
- Distribute chunks to sections using Python startswith()
- This correctly matches chunks to their parent sections

Performance improvement:
- 1 query instead of N queries (one per section)
- Python-side filtering is fast for small result sets

Result: Chunks should now appear in their corresponding sections
This commit is contained in:
2026-01-01 15:45:37 +01:00
parent 474edf75e5
commit 47cf21867f

View File

@@ -421,37 +421,33 @@ def hierarchical_search(
} }
# ═══════════════════════════════════════════════════════════════ # ═══════════════════════════════════════════════════════════════
# STAGE 2: Search Chunk collection filtered by sections # STAGE 2: Search Chunk collection and distribute to sections
# ═══════════════════════════════════════════════════════════════ # ═══════════════════════════════════════════════════════════════
# Note: Summary.sectionPath is a prefix of Chunk.sectionPath, not an exact match
# Summary: "Peirce: CP 2.504"
# Chunk: "Peirce: CP 2.504 > 504. Text..."
# We use prefix matching in Python instead of Weaviate filters
chunk_collection = client.collections.get("Chunk") chunk_collection = client.collections.get("Chunk")
all_chunks = []
for section in sections_data:
section_path = section["section_path"]
# Build filters
filters: Optional[Any] = wvq.Filter.by_property("sectionPath").equal(section_path)
# Build filters (author/work only, no sectionPath filter)
filters: Optional[Any] = None
if author_filter: if author_filter:
author_filter_obj = wvq.Filter.by_property("workAuthor").equal(author_filter) filters = wvq.Filter.by_property("workAuthor").equal(author_filter)
filters = filters & author_filter_obj
if work_filter: if work_filter:
work_filter_obj = wvq.Filter.by_property("workTitle").equal(work_filter) work_filter_obj = wvq.Filter.by_property("workTitle").equal(work_filter)
filters = filters & work_filter_obj filters = filters & work_filter_obj if filters else work_filter_obj
# Search chunks in this section # Single query to get all relevant chunks
# Note: Don't specify return_properties to get nested objects (work, document)
chunks_result = chunk_collection.query.near_text( chunks_result = chunk_collection.query.near_text(
query=query, query=query,
limit=limit, limit=limit * len(sections_data), # Get enough for all sections
filters=filters, filters=filters,
return_metadata=wvq.MetadataQuery(distance=True), return_metadata=wvq.MetadataQuery(distance=True),
) )
# Add chunks to section # Convert to list
section_chunks = [ all_chunks_list = [
{ {
"uuid": str(obj.uuid), "uuid": str(obj.uuid),
"distance": obj.metadata.distance if obj.metadata else None, "distance": obj.metadata.distance if obj.metadata else None,
@@ -461,9 +457,22 @@ def hierarchical_search(
for obj in chunks_result.objects for obj in chunks_result.objects
] ]
section["chunks"] = section_chunks # Distribute chunks to sections using prefix matching
section["chunks_count"] = len(section_chunks) all_chunks = []
all_chunks.extend(section_chunks) for section in sections_data:
section_ref = section["section_path"] # e.g., "Peirce: CP 2.504"
# Find chunks whose sectionPath starts with this reference
section_chunks = [
chunk for chunk in all_chunks_list
if chunk.get("sectionPath", "").startswith(section_ref)
]
# Sort by similarity and limit per section
section_chunks.sort(key=lambda x: x.get("similarity", 0) or 0, reverse=True)
section["chunks"] = section_chunks[:limit]
section["chunks_count"] = len(section["chunks"])
all_chunks.extend(section["chunks"])
# Sort all chunks by similarity (descending) # Sort all chunks by similarity (descending)
all_chunks.sort(key=lambda x: x.get("similarity", 0) or 0, reverse=True) all_chunks.sort(key=lambda x: x.get("similarity", 0) or 0, reverse=True)