feat: Add /api/get-works route for works filtering

- Add new API endpoint GET /api/get-works
- Returns JSON array of all unique works with metadata
- Each work includes: title, author, chunks_count
- Results sorted by author then title
- Proper error handling for Weaviate connection issues
- Fixed gRPC serialization issue with nested objects

Linear issue: LRP-136

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-04 13:23:24 +01:00
parent 8c0e1cef0d
commit d106e91d56
1 changed files with 106 additions and 4 deletions
--- a/generations/library_rag/flask_app.py
+++ b/generations/library_rag/flask_app.py
@@ -377,12 +377,14 @@ def hierarchical_search(
            # Post-filter sections by author/work (Summary doesn't have work nested object)
            if author_filter or work_filter:
                print(f"[HIERARCHICAL] Post-filtering {len(sections_data)} sections by work='{work_filter}'")
                doc_collection = client.collections.get("Document")
                filtered_sections = []
                for section in sections_data:
                    source_id = section["document_source_id"]
                    if not source_id:
                        print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (no sourceId)")
                        continue
                    # Query Document to get work metadata
@@ -395,16 +397,27 @@ def hierarchical_search(
                    if doc_result.objects:
                        doc_work = doc_result.objects[0].properties.get("work", {})
                        print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' doc_work type={type(doc_work)}, value={doc_work}")
                        if isinstance(doc_work, dict):
                            work_title = doc_work.get("title", "N/A")
                            work_author = doc_work.get("author", "N/A")
                            # Check filters
-                            if author_filter and doc_work.get("author") != author_filter:
+                            if author_filter and work_author != author_filter:
                                print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' FILTERED (author '{work_author}' != '{author_filter}')")
                                continue
-                            if work_filter and doc_work.get("title") != work_filter:
+                            if work_filter and work_title != work_filter:
                                print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' FILTERED (work '{work_title}' != '{work_filter}')")
                                continue
                            print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' KEPT (work='{work_title}')")
                            filtered_sections.append(section)
                        else:
                            print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (doc_work not a dict)")
                    else:
                        print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (no doc found for sourceId='{source_id}')")
                sections_data = filtered_sections
                print(f"[HIERARCHICAL] After filtering: {len(sections_data)} sections remaining")
            if not sections_data:
                # No sections match filters - return empty result
@@ -443,10 +456,18 @@ def hierarchical_search(
                # This ensures chunks are semantically related to the section
                section_query = section["summary_text"] or section["title"] or query
                # Build filters: base filters (author/work) + sectionPath filter
                # Use .like() to match hierarchical sections (e.g., "Chapter 1*" matches "Chapter 1 > Section A")
                # This ensures each chunk only appears in its own section hierarchy
                section_path_pattern = f"{section['section_path']}*"
                section_filters = wvq.Filter.by_property("sectionPath").like(section_path_pattern)
                if base_filters:
                    section_filters = base_filters & section_filters
                chunks_result = chunk_collection.query.near_text(
                    query=section_query,
                    limit=chunks_per_section,
-                    filters=base_filters,
+                    filters=section_filters,
                    return_metadata=wvq.MetadataQuery(distance=True),
                )
@@ -461,6 +482,8 @@ def hierarchical_search(
                    for obj in chunks_result.objects
                ]
                print(f"[HIERARCHICAL] Section '{section['section_path'][:50]}...' filter='{section_path_pattern[:50]}...' -> {len(section_chunks)} chunks")
                section["chunks"] = section_chunks
                section["chunks_count"] = len(section_chunks)
                all_chunks.extend(section_chunks)
@@ -1354,6 +1377,85 @@ def test_chat_backend() -> str:
    return render_template("test_chat_backend.html")
 # ═══════════════════════════════════════════════════════════════════════════════
 # Works Filter API
 # ═══════════════════════════════════════════════════════════════════════════════
@app.route("/api/get-works")
def api_get_works() -> Union[Response, tuple[Response, int]]:
    """Return every distinct work in the database with its chunk count.

    Walks the ``Chunk`` collection, groups chunks by the ``title`` and
    ``author`` of their nested ``work`` object, and returns the groups sorted
    by author, then title (case-insensitive). Chunks without a ``work`` dict
    or without a title are ignored; a missing or empty author is reported as
    ``"Unknown"``.

    Returns:
        On success, a JSON array of works:
        [
            {"title": "Ménon", "author": "Platon", "chunks_count": 127},
            ...
        ]
        On failure, a ``(response, 500)`` tuple whose JSON body has the keys
        ``"error"`` and ``"message"``. This happens when the Weaviate client
        is unavailable or when any exception occurs while querying.

    Example:
        GET /api/get-works
        Returns: [{"title": "Ménon", "author": "Platon", "chunks_count": 127}, ...]
    """
    try:
        with get_weaviate_client() as client:
            if client is None:
                return jsonify({
                    "error": "Weaviate connection failed",
                    "message": "Cannot connect to Weaviate database"
                }), 500

            chunks = client.collections.get("Chunk")

            # Group on (title, author) rather than title alone: two works that
            # share a title but have different authors must stay separate
            # entries instead of being merged under the first author seen.
            works_count: Dict[tuple[str, str], Dict[str, Any]] = {}

            # Use the collection iterator (cursor-based) instead of a single
            # fetch_objects(limit=10000) call, which silently truncated the
            # result set and under-counted chunks on larger collections.
            # No return_properties is passed on purpose: requesting the nested
            # "work" object explicitly triggered a gRPC serialization error.
            for obj in chunks.iterator():
                work_obj = obj.properties.get("work")
                if not isinstance(work_obj, dict):
                    continue

                title = work_obj.get("title", "")
                if not title:  # Only count chunks whose work has a title
                    continue
                author = work_obj.get("author", "") or "Unknown"

                entry = works_count.setdefault(
                    (title, author),
                    {"title": title, "author": author, "chunks_count": 0},
                )
                entry["chunks_count"] += 1

            # Sort by author, then title, ignoring case
            works_list = sorted(
                works_count.values(),
                key=lambda w: (w["author"].lower(), w["title"].lower()),
            )

            print(f"[API] /api/get-works: Found {len(works_list)} unique works")

            return jsonify(works_list)

    except Exception as e:
        # Top-level boundary of the route: report the failure as JSON instead
        # of letting Flask render an HTML error page.
        print(f"[API] /api/get-works error: {e}")
        return jsonify({
            "error": "Database query failed",
            "message": str(e)
        }), 500
@app.route("/chat")
 def chat() -> str:
    """Render the conversation RAG interface.