feat: Add /api/get-works route for works filtering

- Add new API endpoint GET /api/get-works
- Returns JSON array of all unique works with metadata
- Each work includes: title, author, chunks_count
- Results sorted by author then title
- Proper error handling for Weaviate connection issues
- Fixed gRPC serialization issue with nested objects

Linear issue: LRP-136

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-04 13:23:24 +01:00
parent 8c0e1cef0d
commit d106e91d56

View File

@@ -377,12 +377,14 @@ def hierarchical_search(
# Post-filter sections by author/work (Summary doesn't have work nested object) # Post-filter sections by author/work (Summary doesn't have work nested object)
if author_filter or work_filter: if author_filter or work_filter:
print(f"[HIERARCHICAL] Post-filtering {len(sections_data)} sections by work='{work_filter}'")
doc_collection = client.collections.get("Document") doc_collection = client.collections.get("Document")
filtered_sections = [] filtered_sections = []
for section in sections_data: for section in sections_data:
source_id = section["document_source_id"] source_id = section["document_source_id"]
if not source_id: if not source_id:
print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (no sourceId)")
continue continue
# Query Document to get work metadata # Query Document to get work metadata
@@ -395,16 +397,27 @@ def hierarchical_search(
if doc_result.objects: if doc_result.objects:
doc_work = doc_result.objects[0].properties.get("work", {}) doc_work = doc_result.objects[0].properties.get("work", {})
print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' doc_work type={type(doc_work)}, value={doc_work}")
if isinstance(doc_work, dict): if isinstance(doc_work, dict):
work_title = doc_work.get("title", "N/A")
work_author = doc_work.get("author", "N/A")
# Check filters # Check filters
if author_filter and doc_work.get("author") != author_filter: if author_filter and work_author != author_filter:
print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' FILTERED (author '{work_author}' != '{author_filter}')")
continue continue
if work_filter and doc_work.get("title") != work_filter: if work_filter and work_title != work_filter:
print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' FILTERED (work '{work_title}' != '{work_filter}')")
continue continue
print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' KEPT (work='{work_title}')")
filtered_sections.append(section) filtered_sections.append(section)
else:
print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (doc_work not a dict)")
else:
print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (no doc found for sourceId='{source_id}')")
sections_data = filtered_sections sections_data = filtered_sections
print(f"[HIERARCHICAL] After filtering: {len(sections_data)} sections remaining")
if not sections_data: if not sections_data:
# No sections match filters - return empty result # No sections match filters - return empty result
@@ -443,10 +456,18 @@ def hierarchical_search(
# This ensures chunks are semantically related to the section # This ensures chunks are semantically related to the section
section_query = section["summary_text"] or section["title"] or query section_query = section["summary_text"] or section["title"] or query
# Build filters: base filters (author/work) + sectionPath filter
# Use .like() to match hierarchical sections (e.g., "Chapter 1*" matches "Chapter 1 > Section A")
# This ensures each chunk only appears in its own section hierarchy
section_path_pattern = f"{section['section_path']}*"
section_filters = wvq.Filter.by_property("sectionPath").like(section_path_pattern)
if base_filters:
section_filters = base_filters & section_filters
chunks_result = chunk_collection.query.near_text( chunks_result = chunk_collection.query.near_text(
query=section_query, query=section_query,
limit=chunks_per_section, limit=chunks_per_section,
filters=base_filters, filters=section_filters,
return_metadata=wvq.MetadataQuery(distance=True), return_metadata=wvq.MetadataQuery(distance=True),
) )
@@ -461,6 +482,8 @@ def hierarchical_search(
for obj in chunks_result.objects for obj in chunks_result.objects
] ]
print(f"[HIERARCHICAL] Section '{section['section_path'][:50]}...' filter='{section_path_pattern[:50]}...' -> {len(section_chunks)} chunks")
section["chunks"] = section_chunks section["chunks"] = section_chunks
section["chunks_count"] = len(section_chunks) section["chunks_count"] = len(section_chunks)
all_chunks.extend(section_chunks) all_chunks.extend(section_chunks)
@@ -1354,6 +1377,85 @@ def test_chat_backend() -> str:
return render_template("test_chat_backend.html") return render_template("test_chat_backend.html")
# ═══════════════════════════════════════════════════════════════════════════════
# Works Filter API
# ═══════════════════════════════════════════════════════════════════════════════
@app.route("/api/get-works")
def api_get_works() -> Union[Response, tuple[Response, int]]:
    """Get list of all available works with metadata for filtering.

    Walks the ``Chunk`` collection and counts chunks per distinct
    (title, author) pair read from each chunk's nested ``work`` object.
    Chunks whose ``work`` is missing, is not a dict, or has an empty title
    are ignored. A missing or empty author is reported as ``"Unknown"``.

    Returns:
        On success, a JSON array sorted case-insensitively by author and
        then by title:
            [
                {"title": "Ménon", "author": "Platon", "chunks_count": 127},
                ...
            ]
        On failure (no Weaviate client, or any exception while querying),
        a JSON object with ``error`` and ``message`` keys and HTTP status 500.

    Example:
        GET /api/get-works
        Returns: [{"title": "Ménon", "author": "Platon", "chunks_count": 127}, ...]
    """
    try:
        with get_weaviate_client() as client:
            if client is None:
                return jsonify({
                    "error": "Weaviate connection failed",
                    "message": "Cannot connect to Weaviate database"
                }), 500

            chunks = client.collections.get("Chunk")

            # Keyed by (title, author) so that two works sharing a title but
            # written by different authors are reported separately.
            works_count: Dict[tuple[str, str], Dict[str, Any]] = {}

            # Iterate over the whole collection instead of a single capped
            # fetch, so counts stay correct however many chunks exist.
            # Note: don't pass return_properties with nested objects
            # (causes a gRPC error); the default returns all properties.
            for obj in chunks.iterator():
                work_obj = obj.properties.get("work")
                if not isinstance(work_obj, dict):
                    continue

                title = work_obj.get("title", "")
                if not title:  # Only count chunks that carry a work title
                    continue

                author = work_obj.get("author", "") or "Unknown"
                key = (title, author)
                entry = works_count.get(key)
                if entry is None:
                    entry = {
                        "title": title,
                        "author": author,
                        "chunks_count": 0
                    }
                    works_count[key] = entry
                entry["chunks_count"] += 1

            # Convert to list and sort by author, then title
            works_list = sorted(
                works_count.values(),
                key=lambda w: (w["author"].lower(), w["title"].lower()),
            )

            print(f"[API] /api/get-works: Found {len(works_list)} unique works")
            return jsonify(works_list)

    except Exception as e:
        # Top-level boundary of the route: report the failure as JSON
        # rather than letting the framework render an HTML error page.
        print(f"[API] /api/get-works error: {e}")
        return jsonify({
            "error": "Database query failed",
            "message": str(e)
        }), 500
@app.route("/chat") @app.route("/chat")
def chat() -> str: def chat() -> str:
"""Render the conversation RAG interface. """Render the conversation RAG interface.