"""Flask web application for Library RAG - Philosophical Text Search. This module provides a web interface for the Library RAG application, enabling users to upload PDF documents, process them through the OCR/LLM pipeline, and perform semantic searches on the indexed philosophical texts stored in Weaviate. Architecture: The application is built on Flask and connects to a local Weaviate instance for vector storage and semantic search. PDF processing is handled asynchronously using background threads with Server-Sent Events (SSE) for real-time progress. Routes: - ``/`` : Home page with collection statistics (passages, authors, works) - ``/passages`` : Paginated list of all passages with author/work filters - ``/search`` : Semantic search interface using vector similarity - ``/upload`` : PDF upload form with processing options - ``/upload/progress/`` : SSE endpoint for real-time processing updates - ``/upload/status/`` : JSON endpoint to check job status - ``/documents`` : List of all processed documents - ``/documents//view`` : Detailed view of a processed document - ``/documents/delete/`` : Delete a document and its Weaviate data - ``/output/`` : Static file server for processed outputs SSE Implementation: The upload progress system uses Server-Sent Events to stream real-time processing updates to the browser. Each processing step emits events:: {"type": "step", "step": "OCR", "status": "running", "detail": "Page 1/10"} {"type": "complete", "redirect": "/documents/doc_name/view"} {"type": "error", "message": "OCR failed"} The SSE endpoint includes keep-alive messages every 30 seconds to maintain the connection and detect stale jobs. Weaviate Connection: The application uses a context manager ``get_weaviate_client()`` to handle Weaviate connections. This ensures proper cleanup of connections even when errors occur. The client connects to localhost:8080 (HTTP) and localhost:50051 (gRPC) by default. Configuration: - ``SECRET_KEY`` : Flask session secret (set via environment variable) - ``UPLOAD_FOLDER`` : Directory for processed PDF outputs (default: ./output) - ``MAX_CONTENT_LENGTH`` : Maximum upload size (default: 50MB) Example: Start the application in development mode:: $ python flask_app.py Or with production settings:: $ export SECRET_KEY="your-production-secret" $ gunicorn -w 4 flask_app:app Access the web interface at http://localhost:5000 Dependencies: - Flask 3.0+ for web framework - Weaviate Python client for vector database - utils.pdf_pipeline for PDF processing - utils.weaviate_ingest for database operations See Also: - ``utils/pdf_pipeline.py`` : PDF processing pipeline - ``utils/weaviate_ingest.py`` : Weaviate ingestion functions - ``schema.py`` : Weaviate collection schemas """ import os import json import uuid import threading import queue import time from pathlib import Path from typing import Any, Dict, Generator, Iterator, List, Optional, Union from flask import Flask, render_template, request, jsonify, redirect, url_for, send_from_directory, Response, flash from contextlib import contextmanager from werkzeug.utils import secure_filename from werkzeug.wrappers import Response as WerkzeugResponse import weaviate import weaviate.classes.query as wvq from utils.types import ( CollectionStats, ProcessingOptions, SSEEvent, ) app = Flask(__name__) # Configuration Flask app.config["SECRET_KEY"] = os.environ.get("SECRET_KEY", "dev-secret-key-change-in-production") # Configuration upload app.config["UPLOAD_FOLDER"] = Path(__file__).parent / "output" app.config["MAX_CONTENT_LENGTH"] = 50 * 1024 * 1024 # 50 MB max ALLOWED_EXTENSIONS = {"pdf", "md", "docx"} # Stockage des jobs de traitement en cours processing_jobs: Dict[str, Dict[str, Any]] = {} # {job_id: {"status": str, "queue": Queue, "result": dict}} # Stockage des sessions de chat en cours chat_sessions: Dict[str, Dict[str, Any]] = {} # {session_id: {"status": str, "queue": Queue, "context": list}} # Stockage des jobs TTS en cours tts_jobs: Dict[str, Dict[str, Any]] = {} # {job_id: {"status": str, "filepath": Path, "error": str}} # ═══════════════════════════════════════════════════════════════════════════════ # Weaviate Connection # ═══════════════════════════════════════════════════════════════════════════════ @contextmanager def get_weaviate_client() -> Generator[Optional[weaviate.WeaviateClient], None, None]: """Context manager for Weaviate connection. Yields: WeaviateClient if connection succeeds, None otherwise. """ client: Optional[weaviate.WeaviateClient] = None try: client = weaviate.connect_to_local( host="localhost", port=8080, grpc_port=50051, ) yield client except Exception as e: print(f"Erreur connexion Weaviate: {e}") yield None finally: if client: try: client.close() except Exception as e: print(f"Erreur fermeture client Weaviate: {e}") def get_collection_stats() -> Optional[CollectionStats]: """Get statistics about Weaviate collections. Returns: CollectionStats with passage counts and unique values, or None on error. """ try: with get_weaviate_client() as client: if client is None: return None stats: CollectionStats = {} # Chunk stats (renamed from Passage) passages = client.collections.get("Chunk") passage_count = passages.aggregate.over_all(total_count=True) stats["passages"] = passage_count.total_count or 0 # Get unique authors and works (from nested objects) all_passages = passages.query.fetch_objects(limit=1000) authors: set[str] = set() works: set[str] = set() languages: set[str] = set() for obj in all_passages.objects: # Work is now a nested object with {title, author} work_obj = obj.properties.get("work") if work_obj and isinstance(work_obj, dict): if work_obj.get("author"): authors.add(str(work_obj["author"])) if work_obj.get("title"): works.add(str(work_obj["title"])) if obj.properties.get("language"): languages.add(str(obj.properties["language"])) stats["authors"] = len(authors) stats["works"] = len(works) stats["languages"] = len(languages) stats["author_list"] = sorted(authors) stats["work_list"] = sorted(works) stats["language_list"] = sorted(languages) return stats except Exception as e: print(f"Erreur stats: {e}") return None def get_all_passages( limit: int = 50, offset: int = 0, ) -> List[Dict[str, Any]]: """Fetch all passages with pagination. Args: limit: Maximum number of passages to return. offset: Number of passages to skip (for pagination). Returns: List of passage dictionaries with uuid and properties. Note: Author/work filters are disabled due to Weaviate 1.34.4 limitation: nested object filtering is not yet supported (GitHub issue #3694). """ try: with get_weaviate_client() as client: if client is None: return [] chunks = client.collections.get("Chunk") result = chunks.query.fetch_objects( limit=limit, offset=offset, return_properties=[ "text", "sectionPath", "sectionLevel", "chapterTitle", "canonicalReference", "unitType", "keywords", "orderIndex", "language" ], ) return [ { "uuid": str(obj.uuid), **obj.properties } for obj in result.objects ] except Exception as e: print(f"Erreur passages: {e}") return [] def simple_search( query: str, limit: int = 10, author_filter: Optional[str] = None, work_filter: Optional[str] = None, ) -> List[Dict[str, Any]]: """Single-stage semantic search on Chunk collection (original implementation). Args: query: Search query text. limit: Maximum number of results to return. author_filter: Filter by author name (uses workAuthor property). work_filter: Filter by work title (uses workTitle property). Returns: List of passage dictionaries with uuid, similarity, and properties. """ try: with get_weaviate_client() as client: if client is None: return [] chunks = client.collections.get("Chunk") # Build filters using top-level properties (workAuthor, workTitle) filters: Optional[Any] = None if author_filter: filters = wvq.Filter.by_property("workAuthor").equal(author_filter) if work_filter: work_filter_obj = wvq.Filter.by_property("workTitle").equal(work_filter) filters = filters & work_filter_obj if filters else work_filter_obj result = chunks.query.near_text( query=query, limit=limit, filters=filters, return_metadata=wvq.MetadataQuery(distance=True), return_properties=[ "text", "sectionPath", "sectionLevel", "chapterTitle", "canonicalReference", "unitType", "keywords", "orderIndex", "language" ], ) return [ { "uuid": str(obj.uuid), "distance": obj.metadata.distance if obj.metadata else None, "similarity": round((1 - obj.metadata.distance) * 100, 1) if obj.metadata and obj.metadata.distance else None, **obj.properties } for obj in result.objects ] except Exception as e: print(f"Erreur recherche: {e}") return [] def hierarchical_search( query: str, limit: int = 10, author_filter: Optional[str] = None, work_filter: Optional[str] = None, sections_limit: int = 5, force_hierarchical: bool = False, ) -> Dict[str, Any]: """Two-stage hierarchical semantic search: Summary → Chunks. Stage 1: Find top-N relevant sections via Summary collection. Stage 2: Search chunks within those sections for better precision. Args: query: Search query text. limit: Maximum number of chunks to return per section. author_filter: Filter by author name. work_filter: Filter by work title. sections_limit: Number of top sections to retrieve (default: 5). force_hierarchical: If True, never fallback to simple search (for testing). Returns: Dictionary with hierarchical search results: - mode: "hierarchical" - sections: List of section dictionaries with nested chunks - results: Flat list of all chunks (for compatibility) - total_chunks: Total number of chunks found - fallback_reason: Explanation if forced but 0 results (optional) """ with get_weaviate_client() as client: if client is None: # Return empty result - let caller decide fallback return { "mode": "hierarchical" if force_hierarchical else "error", "sections": [], "results": [], "total_chunks": 0, "fallback_reason": "Weaviate client unavailable", } try: # ═══════════════════════════════════════════════════════════════ # STAGE 1: Search Summary collection for relevant sections # ═══════════════════════════════════════════════════════════════ summary_collection = client.collections.get("Summary") summaries_result = summary_collection.query.near_text( query=query, limit=sections_limit, return_metadata=wvq.MetadataQuery(distance=True), # Note: Don't specify return_properties - let Weaviate return all properties # including nested objects like "document" which we need for source_id ) if not summaries_result.objects: # No summaries found - return empty result return { "mode": "hierarchical" if force_hierarchical else "error", "sections": [], "results": [], "total_chunks": 0, "fallback_reason": f"Aucune section pertinente trouvée (0/{sections_limit} summaries)", } # Extract section data sections_data = [] for summary_obj in summaries_result.objects: props = summary_obj.properties # Try to get document.sourceId if available (nested object might still be returned) doc_obj = props.get("document") source_id = "" if doc_obj and isinstance(doc_obj, dict): source_id = doc_obj.get("sourceId", "") sections_data.append({ "section_path": props.get("sectionPath", ""), "title": props.get("title", ""), "summary_text": props.get("text", ""), "level": props.get("level", 1), "concepts": props.get("concepts", []), "document_source_id": source_id, "summary_uuid": str(summary_obj.uuid), # Keep UUID for later retrieval if needed "similarity": round((1 - summary_obj.metadata.distance) * 100, 1) if summary_obj.metadata and summary_obj.metadata.distance else 0, }) # Post-filter sections by author/work (Summary doesn't have work nested object) if author_filter or work_filter: print(f"[HIERARCHICAL] Post-filtering {len(sections_data)} sections by work='{work_filter}'") doc_collection = client.collections.get("Document") filtered_sections = [] for section in sections_data: source_id = section["document_source_id"] if not source_id: print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (no sourceId)") continue # Query Document to get work metadata # Note: 'work' is a nested object, so we don't specify it in return_properties # Weaviate should return it automatically doc_result = doc_collection.query.fetch_objects( filters=wvq.Filter.by_property("sourceId").equal(source_id), limit=1, ) if doc_result.objects: doc_work = doc_result.objects[0].properties.get("work", {}) print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' doc_work type={type(doc_work)}, value={doc_work}") if isinstance(doc_work, dict): work_title = doc_work.get("title", "N/A") work_author = doc_work.get("author", "N/A") # Check filters if author_filter and work_author != author_filter: print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' FILTERED (author '{work_author}' != '{author_filter}')") continue if work_filter and work_title != work_filter: print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' FILTERED (work '{work_title}' != '{work_filter}')") continue print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' KEPT (work='{work_title}')") filtered_sections.append(section) else: print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (doc_work not a dict)") else: print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (no doc found for sourceId='{source_id}')") sections_data = filtered_sections print(f"[HIERARCHICAL] After filtering: {len(sections_data)} sections remaining") if not sections_data: # No sections match filters - return empty result filters_str = f"author={author_filter}" if author_filter else "" if work_filter: filters_str += f", work={work_filter}" if filters_str else f"work={work_filter}" return { "mode": "hierarchical" if force_hierarchical else "error", "sections": [], "results": [], "total_chunks": 0, "fallback_reason": f"Aucune section ne correspond aux filtres ({filters_str})", } # ═══════════════════════════════════════════════════════════════ # STAGE 2: Search chunks for EACH section (grouped display) # ═══════════════════════════════════════════════════════════════ # For each section, search chunks using the section's summary text # This groups chunks under their relevant sections chunk_collection = client.collections.get("Chunk") # Build base filters (author/work only) base_filters: Optional[Any] = None if author_filter: base_filters = wvq.Filter.by_property("workAuthor").equal(author_filter) if work_filter: work_filter_obj = wvq.Filter.by_property("workTitle").equal(work_filter) base_filters = base_filters & work_filter_obj if base_filters else work_filter_obj all_chunks = [] chunks_per_section = max(3, limit // len(sections_data)) # Distribute chunks across sections for section in sections_data: # Use section's summary text as query to find relevant chunks # This ensures chunks are semantically related to the section section_query = section["summary_text"] or section["title"] or query # Build filters: base filters (author/work) + sectionPath filter # Use .like() to match hierarchical sections (e.g., "Chapter 1*" matches "Chapter 1 > Section A") # This ensures each chunk only appears in its own section hierarchy section_path_pattern = f"{section['section_path']}*" section_filters = wvq.Filter.by_property("sectionPath").like(section_path_pattern) if base_filters: section_filters = base_filters & section_filters chunks_result = chunk_collection.query.near_text( query=section_query, limit=chunks_per_section, filters=section_filters, return_metadata=wvq.MetadataQuery(distance=True), ) # Convert to list and attach to section section_chunks = [ { "uuid": str(obj.uuid), "distance": obj.metadata.distance if obj.metadata else None, "similarity": round((1 - obj.metadata.distance) * 100, 1) if obj.metadata and obj.metadata.distance else None, **obj.properties } for obj in chunks_result.objects ] print(f"[HIERARCHICAL] Section '{section['section_path'][:50]}...' filter='{section_path_pattern[:50]}...' -> {len(section_chunks)} chunks") section["chunks"] = section_chunks section["chunks_count"] = len(section_chunks) all_chunks.extend(section_chunks) print(f"[HIERARCHICAL] Got {len(all_chunks)} chunks total across {len(sections_data)} sections") print(f"[HIERARCHICAL] Average {len(all_chunks) / len(sections_data):.1f} chunks per section") # Sort all chunks globally by similarity for the flat results list all_chunks.sort(key=lambda x: x.get("similarity", 0) or 0, reverse=True) return { "mode": "hierarchical", "sections": sections_data, "results": all_chunks, "total_chunks": len(all_chunks), } except Exception as e: # Handle errors within the try block (inside 'with') print(f"Erreur recherche hiérarchique: {e}") import traceback traceback.print_exc() # Return empty result (don't call simple_search here!) return { "mode": "hierarchical" if force_hierarchical else "error", "sections": [], "results": [], "total_chunks": 0, "fallback_reason": f"Erreur lors de la recherche: {str(e)}", } def should_use_hierarchical_search(query: str) -> bool: """Detect if a query would benefit from hierarchical 2-stage search. Hierarchical search is recommended for: - Long queries (≥15 characters) indicating complex questions - Multi-concept queries (2+ significant words) - Queries with logical connectors (et, ou, mais, donc, car) Args: query: Search query text. Returns: True if hierarchical search is recommended, False for simple search. Examples: >>> should_use_hierarchical_search("justice") False # Short query, single concept >>> should_use_hierarchical_search("Qu'est-ce que la justice selon Platon ?") True # Long query, multi-concept, philosophical question >>> should_use_hierarchical_search("vertu et sagesse") True # Multi-concept with connector """ if not query or len(query.strip()) == 0: return False query_lower = query.lower().strip() # Criterion 1: Long queries (≥15 chars) suggest complexity if len(query_lower) >= 15: return True # Criterion 2: Presence of logical connectors connectors = ["et", "ou", "mais", "donc", "car", "parce que", "puisque", "si"] if any(f" {connector} " in f" {query_lower} " for connector in connectors): return True # Criterion 3: Multi-concept (2+ significant words, excluding stop words) stop_words = { "le", "la", "les", "un", "une", "des", "du", "de", "d", "ce", "cette", "ces", "mon", "ma", "mes", "ton", "ta", "tes", "à", "au", "aux", "dans", "sur", "pour", "par", "avec", "que", "qui", "quoi", "dont", "où", "est", "sont", "a", "qu", "c", "l", "s", "n", "m", "t", "j", "y", } words = query_lower.split() significant_words = [w for w in words if len(w) > 2 and w not in stop_words] if len(significant_words) >= 2: return True # Default: use simple search for short, single-concept queries return False def summary_only_search( query: str, limit: int = 10, author_filter: Optional[str] = None, work_filter: Optional[str] = None, ) -> List[Dict[str, Any]]: """Summary-only semantic search on Summary collection (90% visibility). Searches high-level section summaries instead of detailed chunks. Offers 90% visibility of rich documents vs 10% for direct chunk search due to Peirce chunk dominance (5,068/5,230 = 97% of chunks). Args: query: Search query text. limit: Maximum number of summary results to return. author_filter: Filter by author name (uses document.author property). work_filter: Filter by work title (uses document.title property). Returns: List of summary dictionaries formatted as "results" with: - uuid, similarity, text, title, concepts, doc_icon, doc_name - author, year, chunks_count, section_path """ try: with get_weaviate_client() as client: if client is None: return [] summaries = client.collections.get("Summary") # Note: Cannot filter by nested document properties directly in Weaviate v4 # Must fetch all and filter in Python if author/work filters are present # Semantic search results = summaries.query.near_text( query=query, limit=limit * 3 if (author_filter or work_filter) else limit, # Fetch more if filtering return_metadata=wvq.MetadataQuery(distance=True) ) # Format and filter results formatted_results: List[Dict[str, Any]] = [] for obj in results.objects: props = obj.properties similarity = 1 - obj.metadata.distance # Apply filters (Python-side since nested properties) if author_filter and props["document"].get("author", "") != author_filter: continue if work_filter and props["document"].get("title", "") != work_filter: continue # Determine document icon and name doc_id = props["document"]["sourceId"].lower() if "tiercelin" in doc_id: doc_icon = "🟡" doc_name = "Tiercelin" elif "platon" in doc_id or "menon" in doc_id: doc_icon = "🟢" doc_name = "Platon" elif "haugeland" in doc_id: doc_icon = "🟣" doc_name = "Haugeland" elif "logique" in doc_id: doc_icon = "🔵" doc_name = "Logique" else: doc_icon = "⚪" doc_name = "Peirce" # Format result (compatible with existing template expectations) result = { "uuid": str(obj.uuid), "similarity": round(similarity * 100, 1), # Convert to percentage "text": props.get("text", ""), "title": props["title"], "concepts": props.get("concepts", []), "doc_icon": doc_icon, "doc_name": doc_name, "author": props["document"].get("author", ""), "year": props["document"].get("year", 0), "chunks_count": props.get("chunksCount", 0), "section_path": props.get("sectionPath", ""), "sectionPath": props.get("sectionPath", ""), # Alias for template compatibility # Add work info for template compatibility "work": { "title": props["document"].get("title", ""), "author": props["document"].get("author", ""), }, } formatted_results.append(result) # Stop if we have enough results after filtering if len(formatted_results) >= limit: break return formatted_results except Exception as e: print(f"Error in summary_only_search: {e}") return [] def search_passages( query: str, limit: int = 10, author_filter: Optional[str] = None, work_filter: Optional[str] = None, sections_limit: int = 5, force_mode: Optional[str] = None, ) -> Dict[str, Any]: """Intelligent semantic search dispatcher with auto-detection. Automatically chooses between simple (1-stage), hierarchical (2-stage), or summary-only search based on query complexity or user selection. Args: query: Search query text. limit: Maximum number of chunks to return (per section if hierarchical). author_filter: Filter by author name (uses workAuthor property). work_filter: Filter by work title (uses workTitle property). sections_limit: Number of top sections for hierarchical search (default: 5). force_mode: Force search mode ("simple", "hierarchical", "summary", or None for auto). Returns: Dictionary with search results: - mode: "simple", "hierarchical", or "summary" - results: List of passage/summary dictionaries (flat) - sections: List of section dicts with nested chunks (hierarchical only) - total_chunks: Total number of chunks/summaries found Examples: >>> # Short query → auto-detects simple search >>> search_passages("justice", limit=10) {"mode": "simple", "results": [...], "total_chunks": 10} >>> # Complex query → auto-detects hierarchical search >>> search_passages("Qu'est-ce que la vertu selon Aristote ?", limit=5) {"mode": "hierarchical", "sections": [...], "results": [...], "total_chunks": 15} >>> # Force summary-only mode (90% visibility, high-level overviews) >>> search_passages("What is the Turing test?", force_mode="summary", limit=10) {"mode": "summary", "results": [...], "total_chunks": 7} """ # Handle summary-only mode if force_mode == "summary": results = summary_only_search(query, limit, author_filter, work_filter) return { "mode": "summary", "results": results, "total_chunks": len(results), } # Determine search mode for simple vs hierarchical if force_mode == "simple": use_hierarchical = False elif force_mode == "hierarchical": use_hierarchical = True else: # Auto-detection use_hierarchical = should_use_hierarchical_search(query) # Execute appropriate search strategy if use_hierarchical: result = hierarchical_search( query=query, limit=limit, author_filter=author_filter, work_filter=work_filter, sections_limit=sections_limit, force_hierarchical=(force_mode == "hierarchical"), # No fallback if explicitly forced ) # If hierarchical search failed and wasn't forced, fallback to simple search if result.get("mode") == "error" and force_mode != "hierarchical": results = simple_search(query, limit, author_filter, work_filter) return { "mode": "simple", "results": results, "total_chunks": len(results), } return result else: results = simple_search(query, limit, author_filter, work_filter) return { "mode": "simple", "results": results, "total_chunks": len(results), } # ═══════════════════════════════════════════════════════════════════════════════ # Routes # ═══════════════════════════════════════════════════════════════════════════════ @app.route("/") def index() -> str: """Render the home page with collection statistics. Displays an overview of the Library RAG application with statistics about indexed passages, works, authors, and supported languages from Weaviate. Returns: Rendered HTML template (index.html) with collection statistics including: - Total passage count - Number of unique authors and works - List of available languages Note: If Weaviate connection fails, stats will be None and the template should handle displaying an appropriate fallback message. """ from utils.types import CollectionStats stats: Optional[CollectionStats] = get_collection_stats() return render_template("index.html", stats=stats) @app.route("/passages") def passages() -> str: """Render the passages list page with pagination and filtering. Displays a paginated list of all indexed passages from Weaviate with optional filtering by author and/or work title. Includes statistics and filter options in the sidebar. Query Parameters: page (int): Page number for pagination. Defaults to 1. per_page (int): Number of passages per page. Defaults to 20. author (str, optional): Filter passages by author name. work (str, optional): Filter passages by work title. Returns: Rendered HTML template (passages.html) with: - List of passages for the current page - Collection statistics for sidebar filters - Pagination controls - Current filter state Example: GET /passages?page=2&per_page=50&author=Platon Returns page 2 with 50 passages per page, filtered by author "Platon". """ page: int = request.args.get("page", 1, type=int) per_page: int = request.args.get("per_page", 20, type=int) author: Optional[str] = request.args.get("author", None) work: Optional[str] = request.args.get("work", None) # Clean filters if author == "": author = None if work == "": work = None offset: int = (page - 1) * per_page from utils.types import CollectionStats stats: Optional[CollectionStats] = get_collection_stats() passages_list: List[Dict[str, Any]] = get_all_passages( limit=per_page, offset=offset, ) return render_template( "passages.html", chunks=passages_list, stats=stats, page=page, per_page=per_page, author_filter=author, work_filter=work, ) @app.route("/search") def search() -> str: """Render the semantic search page with vector similarity results. Provides a search interface for finding passages using semantic similarity via Weaviate's near_text query. Results include similarity scores and can be filtered by author and/or work. Query Parameters: q (str): Search query text. Empty string shows no results. limit (int): Maximum number of chunks per section. Defaults to 10. author (str, optional): Filter results by author name. work (str, optional): Filter results by work title. sections_limit (int): Number of sections for hierarchical search. Defaults to 5. mode (str, optional): Force search mode ("simple", "hierarchical", or "" for auto). Returns: Rendered HTML template (search.html) with: - Search form with current query - List of matching passages with similarity percentages - Collection statistics for filter dropdowns - Current filter state - Search mode indicator (simple vs hierarchical) Example: GET /search?q=la%20mort%20et%20le%20temps&limit=5§ions_limit=3 Auto-detects hierarchical search, returns top 3 sections with 5 chunks each. """ query: str = request.args.get("q", "") limit: int = request.args.get("limit", 10, type=int) author: Optional[str] = request.args.get("author", None) work: Optional[str] = request.args.get("work", None) sections_limit: int = request.args.get("sections_limit", 5, type=int) mode: Optional[str] = request.args.get("mode", None) # Clean filters if author == "": author = None if work == "": work = None if mode == "": mode = None from utils.types import CollectionStats stats: Optional[CollectionStats] = get_collection_stats() results_data: Optional[Dict[str, Any]] = None if query: results_data = search_passages( query=query, limit=limit, author_filter=author, work_filter=work, sections_limit=sections_limit, force_mode=mode, ) return render_template( "search.html", query=query, results_data=results_data, stats=stats, limit=limit, sections_limit=sections_limit, mode=mode, author_filter=author, work_filter=work, ) def rag_search(query: str, limit: int = 5) -> List[Dict[str, Any]]: """Search passages for RAG context with formatted results. Wraps the existing search_passages() function but returns results formatted specifically for RAG prompt construction. Includes author, work, and section information needed to build context for LLM generation. Args: query: The user's question or search query. limit: Maximum number of context chunks to retrieve. Defaults to 5. Returns: List of context dictionaries with keys: - text (str): The passage text content - author (str): Author name (from workAuthor) - work (str): Work title (from workTitle) - section (str): Section path or chapter title - similarity (float): Similarity score 0-100 - uuid (str): Weaviate chunk UUID Example: >>> results = rag_search("Qu'est-ce que la vertu ?", limit=3) >>> results[0]["author"] 'Platon' >>> results[0]["work"] 'République' """ import time start_time = time.time() try: with get_weaviate_client() as client: if client is None: print("[RAG Search] Weaviate client unavailable") return [] chunks = client.collections.get("Chunk") # Query with properties needed for RAG context result = chunks.query.near_text( query=query, limit=limit, return_metadata=wvq.MetadataQuery(distance=True), return_properties=[ "text", "workAuthor", # Top-level author property "workTitle", # Top-level work property "sectionPath", "chapterTitle", "canonicalReference", ], ) # Format results for RAG prompt construction formatted_results = [] for obj in result.objects: props = obj.properties similarity = round((1 - obj.metadata.distance) * 100, 1) if obj.metadata and obj.metadata.distance else 0.0 formatted_results.append({ "text": props.get("text", ""), "author": props.get("workAuthor", "Auteur inconnu"), "work": props.get("workTitle", "Œuvre inconnue"), "section": props.get("sectionPath") or props.get("chapterTitle") or "Section inconnue", "similarity": similarity, "uuid": str(obj.uuid), }) # Log search metrics elapsed = time.time() - start_time print(f"[RAG Search] Query: '{query[:50]}...' | Results: {len(formatted_results)} | Time: {elapsed:.2f}s") return formatted_results except Exception as e: print(f"[RAG Search] Error: {e}") return [] def diverse_author_search( query: str, limit: int = 10, initial_pool: int = 100, max_authors: int = 5, chunks_per_author: int = 2 ) -> List[Dict[str, Any]]: """Search passages with author diversity to avoid corpus imbalance bias. This function addresses the problem where prolific authors (e.g., Peirce with 300 works) dominate search results over less represented but equally relevant authors (e.g., Tiercelin with 1 work). Algorithm: 1. Retrieve large initial pool of chunks (e.g., 100) 2. Group chunks by author 3. Compute average similarity score of top-3 chunks per author 4. Select top-N authors by average score 5. Extract best chunks from each selected author 6. Return diversified chunk list Args: query: The user's question or search query. limit: Maximum number of chunks to return (default: 10). initial_pool: Size of initial candidate pool (default: 100). max_authors: Maximum number of distinct authors to include (default: 5). chunks_per_author: Number of chunks per selected author (default: 2). Returns: List of context dictionaries with keys: - text (str): The passage text content - author (str): Author name (from workAuthor) - work (str): Work title (from workTitle) - section (str): Section path or chapter title - similarity (float): Similarity score 0-100 - uuid (str): Weaviate chunk UUID Example: >>> results = diverse_author_search("Scotus et Peirce", limit=10) >>> authors = set(r["author"] for r in results) >>> len(authors) # Multiple authors guaranteed 5 >>> [r["author"] for r in results].count("Peirce") # Max chunks_per_author 2 Note: This prevents a single prolific author from dominating all results. For "Scotus et Peirce", ensures results from Peirce, Tiercelin, Scotus, Boler, and other relevant commentators. """ import time start_time = time.time() print(f"[Diverse Search] CALLED with query='{query[:50]}...', initial_pool={initial_pool}, max_authors={max_authors}, chunks_per_author={chunks_per_author}") try: # Step 1: Retrieve large initial pool print(f"[Diverse Search] Calling rag_search with limit={initial_pool}") candidates = rag_search(query, limit=initial_pool) print(f"[Diverse Search] rag_search returned {len(candidates)} candidates") if not candidates: print("[Diverse Search] No candidates found, returning empty list") return [] # Step 2: Group chunks by author by_author: Dict[str, List[Dict[str, Any]]] = {} for chunk in candidates: author = chunk.get("author", "Auteur inconnu") if author not in by_author: by_author[author] = [] by_author[author].append(chunk) print(f"[Diverse Search] Found {len(by_author)} distinct authors in pool of {len(candidates)} chunks") # Step 3: Compute average similarity of top-3 chunks per author author_scores: Dict[str, float] = {} for author, chunks in by_author.items(): # Sort by similarity descending sorted_chunks = sorted(chunks, key=lambda x: x["similarity"], reverse=True) # Take top-3 (or all if fewer than 3) top_chunks = sorted_chunks[:3] # Average similarity avg_score = sum(c["similarity"] for c in top_chunks) / len(top_chunks) author_scores[author] = avg_score # Step 4: Select top-N authors by average score top_authors = sorted(author_scores.items(), key=lambda x: x[1], reverse=True)[:max_authors] print(f"[Diverse Search] Top {len(top_authors)} authors: {[author for author, score in top_authors]}") for author, score in top_authors: print(f" - {author}: avg_score={score:.1f}%, {len(by_author[author])} chunks in pool") # Step 5: Extract best chunks from each selected author # SMART ALLOCATION: If only 1-2 authors, take more chunks per author to reach target limit num_authors = len(top_authors) if num_authors == 1: # Only one author: take up to 'limit' chunks from that author adaptive_chunks_per_author = limit print(f"[Diverse Search] Only 1 author found → taking up to {adaptive_chunks_per_author} chunks") elif num_authors <= 3: # Few authors (2-3): take more chunks per author adaptive_chunks_per_author = max(chunks_per_author, limit // num_authors) print(f"[Diverse Search] Only {num_authors} authors → taking up to {adaptive_chunks_per_author} chunks per author") else: # Many authors (4+): stick to original limit for diversity adaptive_chunks_per_author = chunks_per_author print(f"[Diverse Search] {num_authors} authors → taking {adaptive_chunks_per_author} chunks per author") final_chunks: List[Dict[str, Any]] = [] for author, avg_score in top_authors: # Get best chunks for this author author_chunks = sorted(by_author[author], key=lambda x: x["similarity"], reverse=True) selected = author_chunks[:adaptive_chunks_per_author] final_chunks.extend(selected) # Cap at limit final_chunks = final_chunks[:limit] # Log final metrics final_authors = set(c["author"] for c in final_chunks) elapsed = time.time() - start_time print(f"[Diverse Search] Final: {len(final_chunks)} chunks from {len(final_authors)} authors | Time: {elapsed:.2f}s") return final_chunks except Exception as e: import traceback print(f"[Diverse Search] EXCEPTION CAUGHT: {e}") print(f"[Diverse Search] Traceback: {traceback.format_exc()}") print(f"[Diverse Search] Falling back to standard rag_search with limit={limit}") # Fallback to standard search return rag_search(query, limit) def build_prompt_with_context(user_question: str, rag_context: List[Dict[str, Any]]) -> str: """Build a prompt for LLM generation using RAG context. Constructs a comprehensive prompt that includes a system instruction, formatted RAG context chunks with author/work metadata, and the user's question. The prompt is designed to work with all LLM providers (Ollama, Mistral, Anthropic, OpenAI). Args: user_question: The user's question in natural language. rag_context: List of context dictionaries from rag_search() with keys: - text: Passage text - author: Author name - work: Work title - section: Section or chapter - similarity: Similarity score (0-100) Returns: Formatted prompt string ready for LLM generation. Example: >>> context = rag_search("Qu'est-ce que la justice ?", limit=2) >>> prompt = build_prompt_with_context("Qu'est-ce que la justice ?", context) >>> print(prompt[:100]) 'Vous êtes un assistant spécialisé en philosophie...' """ # System instruction system_instruction = """Vous êtes un assistant expert en philosophie. Votre rôle est de fournir des analyses APPROFONDIES et DÉTAILLÉES en vous appuyant sur les passages philosophiques fournis. INSTRUCTIONS IMPÉRATIVES : - Fournissez une réponse LONGUE et DÉVELOPPÉE (minimum 500-800 mots) - Analysez EN PROFONDEUR tous les aspects de la question - Citez ABONDAMMENT les passages fournis avec références précises (auteur, œuvre) - Développez les concepts philosophiques, ne vous contentez PAS de résumés superficiels - Explorez les NUANCES, les implications, les relations entre les idées - Structurez votre réponse en sections claires (introduction, développement avec sous-parties, conclusion) - Si les passages ne couvrent pas tous les aspects, indiquez-le mais développez ce qui est disponible - Adoptez un style académique rigoureux digne d'une analyse philosophique universitaire - N'inventez JAMAIS d'informations absentes des passages, mais exploitez à fond celles qui y sont""" # Build context section context_section = "\n\nPASSAGES PHILOSOPHIQUES :\n\n" if not rag_context: context_section += "(Aucun passage trouvé)\n" else: for i, chunk in enumerate(rag_context, 1): author = chunk.get("author", "Auteur inconnu") work = chunk.get("work", "Œuvre inconnue") section = chunk.get("section", "") text = chunk.get("text", "") similarity = chunk.get("similarity", 0) # Truncate very long passages (keep first 2000 chars max per chunk for deep analysis) if len(text) > 2000: text = text[:2000] + "..." context_section += f"**Passage {i}** [Score de pertinence: {similarity}%]\n" context_section += f"**Auteur :** {author}\n" context_section += f"**Œuvre :** {work}\n" if section: context_section += f"**Section :** {section}\n" context_section += f"\n{text}\n\n" context_section += "---\n\n" # User question question_section = f"\nQUESTION :\n{user_question}\n\n" # Final instruction final_instruction = """CONSIGNE FINALE : Répondez à cette question en produisant une analyse philosophique COMPLÈTE et APPROFONDIE (minimum 500-800 mots). Votre réponse doit : 1. Commencer par une introduction contextualisant la question 2. Développer une analyse détaillée en plusieurs parties, citant abondamment les passages 3. Explorer les implications philosophiques, les concepts-clés, les relations entre les idées 4. Conclure en synthétisant l'apport des passages à la question posée Ne vous limitez PAS à un résumé superficiel. Développez, analysez, approfondissez. C'est une discussion philosophique universitaire, pas un tweet.""" # Combine all sections full_prompt = system_instruction + context_section + question_section + final_instruction # Truncate if too long (max ~30000 chars - modern LLMs have 128k+ context windows) if len(full_prompt) > 30000: # Reduce number of context chunks print(f"[Prompt Builder] Warning: Prompt too long ({len(full_prompt)} chars), truncating context") truncated_context = rag_context[:min(3, len(rag_context))] # Keep only top 3 chunks return build_prompt_with_context(user_question, truncated_context) return full_prompt @app.route("/test-rag") def test_rag() -> Dict[str, Any]: """Test endpoint for RAG search function. Example: GET /test-rag?q=vertu&limit=3 """ query = request.args.get("q", "Qu'est-ce que la vertu ?") limit = request.args.get("limit", 5, type=int) results = rag_search(query, limit) return jsonify({ "query": query, "limit": limit, "results_count": len(results), "results": results }) @app.route("/test-prompt") def test_prompt() -> str: """Test endpoint for prompt construction with RAG context. Example: GET /test-prompt?q=Qu'est-ce que la justice ?&limit=3 Returns: HTML page displaying the constructed prompt. """ query = request.args.get("q", "Qu'est-ce que la vertu ?") limit = request.args.get("limit", 3, type=int) # Get RAG context rag_context = rag_search(query, limit) # Build prompt prompt = build_prompt_with_context(query, rag_context) # Display as preformatted text in HTML html = f""" Test Prompt RAG

🧪 Test Prompt Construction RAG

Question: {query}
Contextes RAG: {len(rag_context)} passages
Longueur prompt: {len(prompt)} caractères

Prompt généré :

{prompt}
Chunks utilisés :
{chr(10).join([f"- {c['author']} - {c['work']} (similarité: {c['similarity']}%)" for c in rag_context])}
""" return html @app.route("/test-llm") def test_llm() -> WerkzeugResponse: """Test endpoint for LLM streaming. Example: GET /test-llm?provider=ollama&model=qwen2.5:7b&prompt=Hello Returns: Plain text streamed response. """ from utils.llm_chat import call_llm, LLMError provider = request.args.get("provider", "ollama") model = request.args.get("model", "qwen2.5:7b") prompt = request.args.get("prompt", "Réponds en une phrase: Qu'est-ce que la philosophie ?") def generate() -> Iterator[str]: try: yield f"[Test LLM Streaming]\n" yield f"Provider: {provider}\n" yield f"Model: {model}\n" yield f"Prompt: {prompt}\n\n" yield "Response:\n" for token in call_llm(prompt, provider, model, stream=True): yield token yield "\n\n[Done]" except LLMError as e: yield f"\n\n[Error] {str(e)}" except Exception as e: yield f"\n\n[Unexpected Error] {str(e)}" return Response(generate(), mimetype='text/plain') @app.route("/test-chat-backend") def test_chat_backend() -> str: """Test page for chat backend.""" return render_template("test_chat_backend.html")