feat: Group chunks under sections in hierarchical search
- Stage 2 now searches chunks for EACH section, using the section's summary text as the query (falling back to the section title, then to the original query)
- Chunks are distributed across sections: each section requests max(3, limit // number of sections) chunks
- Template displays sections with their chunks nested underneath
- Each section shows: title, summary, concepts, chunk count, and passages
- Removes the separate global passages list; passages are now fully grouped by section

Structure: Section 1 → Chunks 1-3, Section 2 → Chunks 4-6, etc.
This commit is contained in:
@@ -421,33 +421,38 @@ def hierarchical_search(
|
|||||||
}
|
}
|
||||||
|
|
||||||
# ═══════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════
|
||||||
# STAGE 2: Search Chunk collection and distribute to sections
|
# STAGE 2: Search chunks for EACH section (grouped display)
|
||||||
# ═══════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════
|
||||||
# Note: Summary.sectionPath != Chunk.sectionPath exactly
|
# For each section, search chunks using the section's summary text
|
||||||
# Summary: "Peirce: CP 2.504"
|
# This groups chunks under their relevant sections
|
||||||
# Chunk: "Peirce: CP 2.504 > 504. Text..."
|
|
||||||
# We use prefix matching in Python instead of Weaviate filters
|
|
||||||
|
|
||||||
chunk_collection = client.collections.get("Chunk")
|
chunk_collection = client.collections.get("Chunk")
|
||||||
|
|
||||||
# Build filters (author/work only, no sectionPath filter)
|
# Build base filters (author/work only)
|
||||||
filters: Optional[Any] = None
|
base_filters: Optional[Any] = None
|
||||||
if author_filter:
|
if author_filter:
|
||||||
filters = wvq.Filter.by_property("workAuthor").equal(author_filter)
|
base_filters = wvq.Filter.by_property("workAuthor").equal(author_filter)
|
||||||
if work_filter:
|
if work_filter:
|
||||||
work_filter_obj = wvq.Filter.by_property("workTitle").equal(work_filter)
|
work_filter_obj = wvq.Filter.by_property("workTitle").equal(work_filter)
|
||||||
filters = filters & work_filter_obj if filters else work_filter_obj
|
base_filters = base_filters & work_filter_obj if base_filters else work_filter_obj
|
||||||
|
|
||||||
|
all_chunks = []
|
||||||
|
chunks_per_section = max(3, limit // len(sections_data)) # Distribute chunks across sections
|
||||||
|
|
||||||
|
for section in sections_data:
|
||||||
|
# Use section's summary text as query to find relevant chunks
|
||||||
|
# This ensures chunks are semantically related to the section
|
||||||
|
section_query = section["summary_text"] or section["title"] or query
|
||||||
|
|
||||||
# Single query to get all relevant chunks
|
|
||||||
chunks_result = chunk_collection.query.near_text(
|
chunks_result = chunk_collection.query.near_text(
|
||||||
query=query,
|
query=section_query,
|
||||||
limit=limit * len(sections_data), # Get enough for all sections
|
limit=chunks_per_section,
|
||||||
filters=filters,
|
filters=base_filters,
|
||||||
return_metadata=wvq.MetadataQuery(distance=True),
|
return_metadata=wvq.MetadataQuery(distance=True),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Convert to list
|
# Convert to list and attach to section
|
||||||
all_chunks_list = [
|
section_chunks = [
|
||||||
{
|
{
|
||||||
"uuid": str(obj.uuid),
|
"uuid": str(obj.uuid),
|
||||||
"distance": obj.metadata.distance if obj.metadata else None,
|
"distance": obj.metadata.distance if obj.metadata else None,
|
||||||
@@ -457,20 +462,14 @@ def hierarchical_search(
|
|||||||
for obj in chunks_result.objects
|
for obj in chunks_result.objects
|
||||||
]
|
]
|
||||||
|
|
||||||
# NOTE: Summary.sectionPath format doesn't match Chunk.sectionPath
|
section["chunks"] = section_chunks
|
||||||
# This is a data quality issue that needs to be fixed at ingestion
|
section["chunks_count"] = len(section_chunks)
|
||||||
# For now, sections provide context, chunks are shown globally
|
all_chunks.extend(section_chunks)
|
||||||
print(f"[HIERARCHICAL] Got {len(all_chunks_list)} chunks total")
|
|
||||||
print(f"[HIERARCHICAL] Found {len(sections_data)} relevant sections")
|
|
||||||
|
|
||||||
all_chunks = all_chunks_list
|
print(f"[HIERARCHICAL] Got {len(all_chunks)} chunks total across {len(sections_data)} sections")
|
||||||
|
print(f"[HIERARCHICAL] Average {len(all_chunks) / len(sections_data):.1f} chunks per section")
|
||||||
|
|
||||||
# Clear chunks from sections (they're displayed separately)
|
# Sort all chunks globally by similarity for the flat results list
|
||||||
for section in sections_data:
|
|
||||||
section["chunks"] = []
|
|
||||||
section["chunks_count"] = 0
|
|
||||||
|
|
||||||
# Sort all chunks by similarity (descending)
|
|
||||||
all_chunks.sort(key=lambda x: x.get("similarity", 0) or 0, reverse=True)
|
all_chunks.sort(key=lambda x: x.get("similarity", 0) or 0, reverse=True)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|||||||
@@ -172,38 +172,37 @@
|
|||||||
|
|
||||||
{% if results_data.results %}
|
{% if results_data.results %}
|
||||||
|
|
||||||
<!-- Hierarchical display -->
|
<!-- Hierarchical display: sections with grouped chunks -->
|
||||||
{% if results_data.mode == "hierarchical" and results_data.sections %}
|
{% if results_data.mode == "hierarchical" and results_data.sections %}
|
||||||
<!-- Show relevant sections as context -->
|
<div>
|
||||||
<div style="margin-bottom: 2rem;">
|
<h3 style="font-size: 1.3em; margin-bottom: 1.5rem; color: var(--color-accent);">📚 Sections pertinentes avec passages</h3>
|
||||||
<h3 style="font-size: 1.2em; margin-bottom: 1rem; color: var(--color-accent);">📚 Sections pertinentes trouvées</h3>
|
|
||||||
<div style="display: flex; flex-direction: column; gap: 1rem;">
|
|
||||||
{% for section in results_data.sections %}
|
{% for section in results_data.sections %}
|
||||||
<div style="padding: 1rem; border-left: 3px solid var(--color-accent); background: rgba(125, 110, 88, 0.05); border-radius: 4px;">
|
<div class="section-group" style="margin-bottom: 2rem; border: 1px solid rgba(125, 110, 88, 0.2); border-radius: 8px; padding: 1.5rem; background: linear-gradient(135deg, var(--color-bg-main) 0%, #ffffff 100%); box-shadow: 0 2px 8px rgba(125, 110, 88, 0.08);">
|
||||||
<div style="display: flex; align-items: center; gap: 0.5rem; margin-bottom: 0.5rem;">
|
<!-- Section header -->
|
||||||
<strong>{{ section.title[:80] }}{% if section.title|length > 80 %}...{% endif %}</strong>
|
<div class="section-header" style="margin-bottom: 1rem; padding-bottom: 0.75rem; border-bottom: 2px solid var(--color-accent);">
|
||||||
|
<div style="display: flex; align-items: center; gap: 0.5rem; margin-bottom: 0.5rem; flex-wrap: wrap;">
|
||||||
|
<h4 style="margin: 0; font-size: 1.1em; color: var(--color-text-strong);">{{ section.title[:100] }}{% if section.title|length > 100 %}...{% endif %}</h4>
|
||||||
<span class="badge badge-similarity">⚡ {{ section.similarity }}% similaire</span>
|
<span class="badge badge-similarity">⚡ {{ section.similarity }}% similaire</span>
|
||||||
|
<span class="badge" style="background-color: rgba(125, 110, 88, 0.15); color: var(--color-accent); font-size: 0.85em;">{{ section.chunks_count }} passage{% if section.chunks_count > 1 %}s{% endif %}</span>
|
||||||
</div>
|
</div>
|
||||||
{% if section.summary_text and section.summary_text != section.title and section.summary_text != section.section_path %}
|
{% if section.summary_text and section.summary_text != section.title and section.summary_text != section.section_path %}
|
||||||
<p style="margin: 0; font-size: 0.9em; color: var(--color-text-main); font-style: italic;">{{ section.summary_text[:150] }}{% if section.summary_text|length > 150 %}...{% endif %}</p>
|
<p class="summary-text" style="margin: 0.5rem 0 0 0; font-size: 0.9em; color: var(--color-text-main); font-style: italic; line-height: 1.5;">{{ section.summary_text[:200] }}{% if section.summary_text|length > 200 %}...{% endif %}</p>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% if section.concepts %}
|
{% if section.concepts %}
|
||||||
<div style="margin-top: 0.5rem;">
|
<div class="concepts" style="margin-top: 0.5rem;">
|
||||||
{% for concept in section.concepts %}
|
{% for concept in section.concepts %}
|
||||||
<span class="badge" style="background-color: rgba(125, 110, 88, 0.15); color: var(--color-accent); border: 1px solid rgba(125, 110, 88, 0.3); margin-right: 0.25rem; font-size: 0.85em;">{{ concept }}</span>
|
<span class="badge" style="background-color: rgba(125, 110, 88, 0.15); color: var(--color-accent); border: 1px solid rgba(125, 110, 88, 0.3); margin-right: 0.25rem; font-size: 0.85em;">{{ concept }}</span>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</div>
|
</div>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</div>
|
</div>
|
||||||
{% endfor %}
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- Show top relevant chunks globally -->
|
<!-- Chunks under this section -->
|
||||||
<div>
|
{% if section.chunks %}
|
||||||
<h3 style="font-size: 1.2em; margin-bottom: 1rem; color: var(--color-accent);">📄 Passages les plus pertinents</h3>
|
<div class="chunks-list" style="margin-left: 0; margin-top: 1rem;">
|
||||||
{% for chunk in results_data.results[:20] %}
|
{% for chunk in section.chunks %}
|
||||||
<div class="chunk-item" style="background: white; padding: 1rem; margin-bottom: 0.75rem; border-left: 3px solid var(--color-accent-alt); border-radius: 4px;">
|
<div class="chunk-item" style="background: white; padding: 1rem; margin-bottom: 0.75rem; border-left: 3px solid var(--color-accent-alt); border-radius: 4px; box-shadow: 0 1px 3px rgba(85, 107, 99, 0.12);">
|
||||||
<div style="margin-bottom: 0.5rem; display: flex; gap: 0.5rem; flex-wrap: wrap; align-items: center;">
|
<div style="margin-bottom: 0.5rem; display: flex; gap: 0.5rem; flex-wrap: wrap; align-items: center;">
|
||||||
{% if chunk.work and chunk.work.author %}
|
{% if chunk.work and chunk.work.author %}
|
||||||
<span class="badge badge-author">{{ chunk.work.author }}</span>
|
<span class="badge badge-author">{{ chunk.work.author }}</span>
|
||||||
@@ -229,6 +228,12 @@
|
|||||||
</div>
|
</div>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</div>
|
</div>
|
||||||
|
{% else %}
|
||||||
|
<p style="margin-left: 1rem; color: #999; font-style: italic;">Aucun passage trouvé pour cette section</p>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
|
||||||
<!-- Simple display (original) -->
|
<!-- Simple display (original) -->
|
||||||
{% else %}
|
{% else %}
|
||||||
|
|||||||
Reference in New Issue
Block a user