feat: Add /api/get-works route for works filtering

- Add new API endpoint GET /api/get-works
- Returns JSON array of all unique works with metadata
- Each work includes: title, author, chunks_count
- Results sorted by author then title
- Proper error handling for Weaviate connection issues
- Fixed gRPC serialization issue with nested objects

Linear issue: LRP-136

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-04 13:23:24 +01:00
parent 8c0e1cef0d
commit d106e91d56
1 changed files with 106 additions and 4 deletions
--- a/generations/library_rag/flask_app.py
+++ b/generations/library_rag/flask_app.py
@@ -377,12 +377,14 @@ def hierarchical_search(

            # Post-filter sections by author/work (Summary doesn't have work nested object)
            if author_filter or work_filter:
+                print(f"[HIERARCHICAL] Post-filtering {len(sections_data)} sections by work='{work_filter}'")
                doc_collection = client.collections.get("Document")
                filtered_sections = []

                for section in sections_data:
                    source_id = section["document_source_id"]
                    if not source_id:
+                        print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (no sourceId)")
                        continue

                    # Query Document to get work metadata
@@ -395,16 +397,27 @@ def hierarchical_search(

                    if doc_result.objects:
                        doc_work = doc_result.objects[0].properties.get("work", {})
+                        print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' doc_work type={type(doc_work)}, value={doc_work}")
                        if isinstance(doc_work, dict):
+                            work_title = doc_work.get("title", "N/A")
+                            work_author = doc_work.get("author", "N/A")
                            # Check filters
-                            if author_filter and doc_work.get("author") != author_filter:
+                            if author_filter and work_author != author_filter:
+                                print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' FILTERED (author '{work_author}' != '{author_filter}')")
                                continue
-                            if work_filter and doc_work.get("title") != work_filter:
+                            if work_filter and work_title != work_filter:
+                                print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' FILTERED (work '{work_title}' != '{work_filter}')")
                                continue

+                            print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' KEPT (work='{work_title}')")
                            filtered_sections.append(section)
+                        else:
+                            print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (doc_work not a dict)")
+                    else:
+                        print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (no doc found for sourceId='{source_id}')")

                sections_data = filtered_sections
+                print(f"[HIERARCHICAL] After filtering: {len(sections_data)} sections remaining")

            if not sections_data:
                # No sections match filters - return empty result
@@ -443,10 +456,18 @@ def hierarchical_search(
                # This ensures chunks are semantically related to the section
                section_query = section["summary_text"] or section["title"] or query

+                # Build filters: base filters (author/work) + sectionPath filter
+                # Use .like() to match hierarchical sections (e.g., "Chapter 1*" matches "Chapter 1 > Section A")
+                # This ensures each chunk only appears in its own section hierarchy
+                section_path_pattern = f"{section['section_path']}*"
+                section_filters = wvq.Filter.by_property("sectionPath").like(section_path_pattern)
+                if base_filters:
+                    section_filters = base_filters & section_filters
+
                chunks_result = chunk_collection.query.near_text(
                    query=section_query,
                    limit=chunks_per_section,
-                    filters=base_filters,
+                    filters=section_filters,
                    return_metadata=wvq.MetadataQuery(distance=True),
                )

@@ -461,6 +482,8 @@ def hierarchical_search(
                    for obj in chunks_result.objects
                ]

+                print(f"[HIERARCHICAL] Section '{section['section_path'][:50]}...' filter='{section_path_pattern[:50]}...' -> {len(section_chunks)} chunks")
+
                section["chunks"] = section_chunks
                section["chunks_count"] = len(section_chunks)
                all_chunks.extend(section_chunks)
@@ -1354,6 +1377,85 @@ def test_chat_backend() -> str:
    return render_template("test_chat_backend.html")


+
+# ═══════════════════════════════════════════════════════════════════════════════
+# Works Filter API
+# ═══════════════════════════════════════════════════════════════════════════════
+
+@app.route("/api/get-works")
+def api_get_works() -> Union[Response, tuple[Response, int]]:
+    """Get list of all available works with metadata for filtering.
+
+    Returns a JSON array of all unique works in the database, sorted by author
+    then title. Works are identified by the (title, author) pair, so two works
+    sharing a title but written by different authors are listed separately.
+
+    Returns:
+        JSON response with array of works:
+        [
+            {"title": "Ménon", "author": "Platon", "chunks_count": 127},
+            ...
+        ]
+        On failure, a JSON object with "error" and "message" keys and
+        HTTP status 500 (connection failure or query error).
+
+    Note:
+        At most 10000 chunks are fetched in one query; if that cap is reached
+        a warning is printed because chunks_count values may be incomplete.
+    """
+    fetch_limit = 10000  # single-query cap; see the Note in the docstring
+    try:
+        with get_weaviate_client() as client:
+            if client is None:
+                return jsonify({
+                    "error": "Weaviate connection failed",
+                    "message": "Cannot connect to Weaviate database"
+                }), 500
+
+            # Don't pass return_properties here: with nested objects it
+            # causes a gRPC error, so fetch objects with default properties.
+            chunks = client.collections.get("Chunk")
+            all_chunks = chunks.query.fetch_objects(limit=fetch_limit)
+            if len(all_chunks.objects) >= fetch_limit:
+                print(f"[API] /api/get-works: WARNING hit fetch limit {fetch_limit}, counts may be incomplete")
+
+            # Aggregate chunks by work, keyed on (title, author) so that
+            # same-titled works by different authors are not merged.
+            works_count: Dict[tuple[str, str], Dict[str, Any]] = {}
+
+            for obj in all_chunks.objects:
+                work_obj = obj.properties.get("work")
+                if not isinstance(work_obj, dict):
+                    continue
+
+                title = work_obj.get("title") or ""
+                author = work_obj.get("author") or "Unknown"
+                if not title:  # Only count if title exists
+                    continue
+
+                entry = works_count.setdefault((title, author), {
+                    "title": title,
+                    "author": author,
+                    "chunks_count": 0
+                })
+                entry["chunks_count"] += 1
+
+            # Convert to list and sort by author, then title
+            works_list = list(works_count.values())
+            works_list.sort(key=lambda w: (w["author"].lower(), w["title"].lower()))
+
+            print(f"[API] /api/get-works: Found {len(works_list)} unique works")
+            return jsonify(works_list)
+
+    except Exception as e:
+        print(f"[API] /api/get-works error: {e}")
+        return jsonify({
+            "error": "Database query failed",
+            "message": str(e)
+        }), 500
+
+
+
@app.route("/chat")
 def chat() -> str:
    """Render the conversation RAG interface.