feat: Group chunks under sections in hierarchical search

- Stage 2 now searches chunks for EACH section, using the section's summary text as the query (falling back to the section title, then to the original query)
- Chunks are distributed across sections (roughly limit / sections_limit: `limit // len(sections_data)`, with a minimum of 3 chunks per section)
- Template displays sections with their chunks nested underneath
- Each section shows: title, summary, concepts, chunk count, and passages
- Removes the separate global passages list - results are now fully grouped by section

Structure: Section 1 → Chunks 1-3, Section 2 → Chunks 4-6, etc.
2026-01-01 18:25:11 +01:00
parent 65adc02d6e
commit 1cec07b284
2 changed files with 81 additions and 77 deletions
--- a/generations/library_rag/flask_app.py
+++ b/generations/library_rag/flask_app.py
@@ -421,56 +421,55 @@ def hierarchical_search(
                }

            # ═══════════════════════════════════════════════════════════════
-            # STAGE 2: Search Chunk collection and distribute to sections
+            # STAGE 2: Search chunks for EACH section (grouped display)
            # ═══════════════════════════════════════════════════════════════
-            # Note: Summary.sectionPath != Chunk.sectionPath exactly
-            #   Summary: "Peirce: CP 2.504"
-            #   Chunk:   "Peirce: CP 2.504 > 504. Text..."
-            # We use prefix matching in Python instead of Weaviate filters
+            # For each section, search chunks using the section's summary text
+            # This groups chunks under their relevant sections

            chunk_collection = client.collections.get("Chunk")

-            # Build filters (author/work only, no sectionPath filter)
-            filters: Optional[Any] = None
+            # Build base filters (author/work only)
+            base_filters: Optional[Any] = None
            if author_filter:
-                filters = wvq.Filter.by_property("workAuthor").equal(author_filter)
+                base_filters = wvq.Filter.by_property("workAuthor").equal(author_filter)
            if work_filter:
                work_filter_obj = wvq.Filter.by_property("workTitle").equal(work_filter)
-                filters = filters & work_filter_obj if filters else work_filter_obj
+                base_filters = base_filters & work_filter_obj if base_filters else work_filter_obj

-            # Single query to get all relevant chunks
-            chunks_result = chunk_collection.query.near_text(
-                query=query,
-                limit=limit * len(sections_data),  # Get enough for all sections
-                filters=filters,
-                return_metadata=wvq.MetadataQuery(distance=True),
-            )
+            all_chunks = []
+            chunks_per_section = max(3, limit // len(sections_data))  # Distribute chunks across sections

-            # Convert to list
-            all_chunks_list = [
-                {
-                    "uuid": str(obj.uuid),
-                    "distance": obj.metadata.distance if obj.metadata else None,
-                    "similarity": round((1 - obj.metadata.distance) * 100, 1) if obj.metadata and obj.metadata.distance else None,
-                    **obj.properties
-                }
-                for obj in chunks_result.objects
-            ]
-
-            # NOTE: Summary.sectionPath format doesn't match Chunk.sectionPath
-            # This is a data quality issue that needs to be fixed at ingestion
-            # For now, sections provide context, chunks are shown globally
-            print(f"[HIERARCHICAL] Got {len(all_chunks_list)} chunks total")
-            print(f"[HIERARCHICAL] Found {len(sections_data)} relevant sections")
-
-            all_chunks = all_chunks_list
-
-            # Clear chunks from sections (they're displayed separately)
            for section in sections_data:
-                section["chunks"] = []
-                section["chunks_count"] = 0
+                # Use section's summary text as query to find relevant chunks
+                # This ensures chunks are semantically related to the section
+                section_query = section["summary_text"] or section["title"] or query

-            # Sort all chunks by similarity (descending)
+                chunks_result = chunk_collection.query.near_text(
+                    query=section_query,
+                    limit=chunks_per_section,
+                    filters=base_filters,
+                    return_metadata=wvq.MetadataQuery(distance=True),
+                )
+
+                # Convert to list and attach to section
+                section_chunks = [
+                    {
+                        "uuid": str(obj.uuid),
+                        "distance": obj.metadata.distance if obj.metadata else None,
+                        "similarity": round((1 - obj.metadata.distance) * 100, 1) if obj.metadata and obj.metadata.distance else None,
+                        **obj.properties
+                    }
+                    for obj in chunks_result.objects
+                ]
+
+                section["chunks"] = section_chunks
+                section["chunks_count"] = len(section_chunks)
+                all_chunks.extend(section_chunks)
+
+            print(f"[HIERARCHICAL] Got {len(all_chunks)} chunks total across {len(sections_data)} sections")
+            print(f"[HIERARCHICAL] Average {len(all_chunks) / len(sections_data):.1f} chunks per section")
+
+            # Sort all chunks globally by similarity for the flat results list
            all_chunks.sort(key=lambda x: x.get("similarity", 0) or 0, reverse=True)

            return {