"""Flask web application for Library RAG - Philosophical Text Search.
|
|
|
|
This module provides a web interface for the Library RAG application, enabling
|
|
users to upload PDF documents, process them through the OCR/LLM pipeline, and
|
|
perform semantic searches on the indexed philosophical texts stored in Weaviate.
|
|
|
|
Architecture:
|
|
The application is built on Flask and connects to a local Weaviate instance
|
|
for vector storage and semantic search. PDF processing is handled asynchronously
|
|
using background threads with Server-Sent Events (SSE) for real-time progress.
|
|
|
|
Routes:
|
|
- ``/`` : Home page with collection statistics (passages, authors, works)
|
|
- ``/passages`` : Paginated list of all passages with author/work filters
|
|
- ``/search`` : Semantic search interface using vector similarity
|
|
- ``/upload`` : PDF upload form with processing options
|
|
- ``/upload/progress/<job_id>`` : SSE endpoint for real-time processing updates
|
|
- ``/upload/status/<job_id>`` : JSON endpoint to check job status
|
|
- ``/documents`` : List of all processed documents
|
|
- ``/documents/<doc_name>/view`` : Detailed view of a processed document
|
|
- ``/documents/delete/<doc_name>`` : Delete a document and its Weaviate data
|
|
- ``/output/<filepath>`` : Static file server for processed outputs
|
|
|
|
SSE Implementation:
|
|
The upload progress system uses Server-Sent Events to stream real-time
|
|
processing updates to the browser. Each processing step emits events::
|
|
|
|
{"type": "step", "step": "OCR", "status": "running", "detail": "Page 1/10"}
|
|
{"type": "complete", "redirect": "/documents/doc_name/view"}
|
|
{"type": "error", "message": "OCR failed"}
|
|
|
|
The SSE endpoint includes keep-alive messages every 30 seconds to maintain
|
|
the connection and detect stale jobs.
|
|
|
|
Weaviate Connection:
|
|
The application uses a context manager ``get_weaviate_client()`` to handle
|
|
Weaviate connections. This ensures proper cleanup of connections even when
|
|
errors occur. The client connects to localhost:8080 (HTTP) and localhost:50051
|
|
(gRPC) by default.
|
|
|
|
Configuration:
|
|
- ``SECRET_KEY`` : Flask session secret (set via environment variable)
|
|
- ``UPLOAD_FOLDER`` : Directory for processed PDF outputs (default: ./output)
|
|
- ``MAX_CONTENT_LENGTH`` : Maximum upload size (default: 50MB)
|
|
|
|
Example:
|
|
Start the application in development mode::
|
|
|
|
$ python flask_app.py
|
|
|
|
Or with production settings::
|
|
|
|
$ export SECRET_KEY="your-production-secret"
|
|
$ gunicorn -w 4 flask_app:app
|
|
|
|
Access the web interface at http://localhost:5000
|
|
|
|
Dependencies:
|
|
- Flask 3.0+ for web framework
|
|
- Weaviate Python client for vector database
|
|
- utils.pdf_pipeline for PDF processing
|
|
- utils.weaviate_ingest for database operations
|
|
|
|
See Also:
|
|
- ``utils/pdf_pipeline.py`` : PDF processing pipeline
|
|
- ``utils/weaviate_ingest.py`` : Weaviate ingestion functions
|
|
- ``schema.py`` : Weaviate collection schemas
|
|
"""

import os
import json
import uuid
import threading
import queue
import time
from contextlib import contextmanager
from pathlib import Path
from typing import Any, Dict, Generator, Iterator, List, Optional, Union

from flask import Flask, render_template, request, jsonify, redirect, url_for, send_from_directory, Response, flash
from werkzeug.utils import secure_filename
from werkzeug.wrappers import Response as WerkzeugResponse
import weaviate
import weaviate.classes.query as wvq

from utils.types import (
    CollectionStats,
    ProcessingOptions,
    SSEEvent,
)

app = Flask(__name__)

# Flask configuration
app.config["SECRET_KEY"] = os.environ.get("SECRET_KEY", "dev-secret-key-change-in-production")

# Upload configuration
app.config["UPLOAD_FOLDER"] = Path(__file__).parent / "output"
app.config["MAX_CONTENT_LENGTH"] = 50 * 1024 * 1024  # 50 MB max
ALLOWED_EXTENSIONS = {"pdf", "md", "docx"}

# In-progress processing jobs
processing_jobs: Dict[str, Dict[str, Any]] = {}  # {job_id: {"status": str, "queue": Queue, "result": dict}}

# Active chat sessions
chat_sessions: Dict[str, Dict[str, Any]] = {}  # {session_id: {"status": str, "queue": Queue, "context": list}}

# In-progress TTS jobs
tts_jobs: Dict[str, Dict[str, Any]] = {}  # {job_id: {"status": str, "filepath": Path, "error": str}}

# ═══════════════════════════════════════════════════════════════════════════════
# Weaviate Connection
# ═══════════════════════════════════════════════════════════════════════════════

@contextmanager
def get_weaviate_client() -> Generator[Optional[weaviate.WeaviateClient], None, None]:
    """Context manager for Weaviate connection.

    Yields:
        WeaviateClient if connection succeeds, None otherwise.
    """
    client: Optional[weaviate.WeaviateClient] = None
    try:
        client = weaviate.connect_to_local(
            host="localhost",
            port=8080,
            grpc_port=50051,
        )
        yield client
    except Exception as e:
        print(f"Weaviate connection error: {e}")
        yield None
    finally:
        if client:
            try:
                client.close()
            except Exception as e:
                print(f"Error closing Weaviate client: {e}")

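# Usage sketch (illustrative only): the context manager yields None instead of
# raising when the connection fails, so callers must always check for None.
#
#     with get_weaviate_client() as client:
#         if client is None:
#             ...  # Weaviate unavailable: degrade gracefully
#         else:
#             print(client.collections.list_all())
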
def get_collection_stats() -> Optional[CollectionStats]:
    """Get statistics about Weaviate collections.

    Returns:
        CollectionStats with passage counts and unique values, or None on error.
    """
    try:
        with get_weaviate_client() as client:
            if client is None:
                return None

            stats: CollectionStats = {}

            # Chunk stats (renamed from Passage)
            passages = client.collections.get("Chunk")
            passage_count = passages.aggregate.over_all(total_count=True)
            stats["passages"] = passage_count.total_count or 0

            # Get unique authors and works (from nested objects)
            all_passages = passages.query.fetch_objects(limit=1000)
            authors: set[str] = set()
            works: set[str] = set()
            languages: set[str] = set()

            for obj in all_passages.objects:
                # Work is now a nested object with {title, author}
                work_obj = obj.properties.get("work")
                if work_obj and isinstance(work_obj, dict):
                    if work_obj.get("author"):
                        authors.add(str(work_obj["author"]))
                    if work_obj.get("title"):
                        works.add(str(work_obj["title"]))
                if obj.properties.get("language"):
                    languages.add(str(obj.properties["language"]))

            stats["authors"] = len(authors)
            stats["works"] = len(works)
            stats["languages"] = len(languages)
            stats["author_list"] = sorted(authors)
            stats["work_list"] = sorted(works)
            stats["language_list"] = sorted(languages)

            return stats
    except Exception as e:
        print(f"Stats error: {e}")
        return None

def get_all_passages(
    limit: int = 50,
    offset: int = 0,
) -> List[Dict[str, Any]]:
    """Fetch all passages with pagination.

    Args:
        limit: Maximum number of passages to return.
        offset: Number of passages to skip (for pagination).

    Returns:
        List of passage dictionaries with uuid and properties.

    Note:
        Author/work filters are disabled due to a Weaviate 1.34.4 limitation:
        nested object filtering is not yet supported (GitHub issue #3694).
    """
    try:
        with get_weaviate_client() as client:
            if client is None:
                return []

            chunks = client.collections.get("Chunk")

            result = chunks.query.fetch_objects(
                limit=limit,
                offset=offset,
                return_properties=[
                    "text", "sectionPath", "sectionLevel", "chapterTitle",
                    "canonicalReference", "unitType", "keywords", "orderIndex", "language"
                ],
            )

            return [
                {
                    "uuid": str(obj.uuid),
                    **obj.properties
                }
                for obj in result.objects
            ]
    except Exception as e:
        print(f"Error fetching passages: {e}")
        return []

def simple_search(
    query: str,
    limit: int = 10,
    author_filter: Optional[str] = None,
    work_filter: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Single-stage semantic search on the Chunk collection (original implementation).

    Args:
        query: Search query text.
        limit: Maximum number of results to return.
        author_filter: Filter by author name (uses workAuthor property).
        work_filter: Filter by work title (uses workTitle property).

    Returns:
        List of passage dictionaries with uuid, similarity, and properties.
    """
    try:
        with get_weaviate_client() as client:
            if client is None:
                return []

            chunks = client.collections.get("Chunk")

            # Build filters using top-level properties (workAuthor, workTitle)
            filters: Optional[Any] = None
            if author_filter:
                filters = wvq.Filter.by_property("workAuthor").equal(author_filter)
            if work_filter:
                work_filter_obj = wvq.Filter.by_property("workTitle").equal(work_filter)
                filters = filters & work_filter_obj if filters else work_filter_obj

            result = chunks.query.near_text(
                query=query,
                limit=limit,
                filters=filters,
                return_metadata=wvq.MetadataQuery(distance=True),
                return_properties=[
                    "text", "sectionPath", "sectionLevel", "chapterTitle",
                    "canonicalReference", "unitType", "keywords", "orderIndex", "language"
                ],
            )

            # Check "distance is not None" so a perfect match (distance 0.0)
            # still gets a similarity score instead of None.
            return [
                {
                    "uuid": str(obj.uuid),
                    "distance": obj.metadata.distance if obj.metadata else None,
                    "similarity": round((1 - obj.metadata.distance) * 100, 1) if obj.metadata and obj.metadata.distance is not None else None,
                    **obj.properties
                }
                for obj in result.objects
            ]
    except Exception as e:
        print(f"Search error: {e}")
        return []

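# Usage sketch (illustrative): both filters are ANDed together; "similarity"
# is a percentage derived from the vector distance.
#
#     hits = simple_search("le devenir", limit=5, author_filter="Platon")
#     for hit in hits:
#         print(hit["similarity"], hit["text"][:80])
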
def hierarchical_search(
    query: str,
    limit: int = 10,
    author_filter: Optional[str] = None,
    work_filter: Optional[str] = None,
    sections_limit: int = 5,
    force_hierarchical: bool = False,
) -> Dict[str, Any]:
    """Two-stage hierarchical semantic search: Summary → Chunks.

    Stage 1: Find the top-N relevant sections via the Summary collection.
    Stage 2: Search chunks within those sections for better precision.

    Args:
        query: Search query text.
        limit: Maximum number of chunks to return per section.
        author_filter: Filter by author name.
        work_filter: Filter by work title.
        sections_limit: Number of top sections to retrieve (default: 5).
        force_hierarchical: If True, never fall back to simple search (for testing).

    Returns:
        Dictionary with hierarchical search results:
        - mode: "hierarchical"
        - sections: List of section dictionaries with nested chunks
        - results: Flat list of all chunks (for compatibility)
        - total_chunks: Total number of chunks found
        - fallback_reason: Explanation if forced but 0 results (optional)
    """
    with get_weaviate_client() as client:
        if client is None:
            # Return an empty result - let the caller decide on fallback
            return {
                "mode": "hierarchical" if force_hierarchical else "error",
                "sections": [],
                "results": [],
                "total_chunks": 0,
                "fallback_reason": "Weaviate client unavailable",
            }

        try:
            # ═══════════════════════════════════════════════════════════════
            # STAGE 1: Search Summary collection for relevant sections
            # ═══════════════════════════════════════════════════════════════

            summary_collection = client.collections.get("Summary")

            summaries_result = summary_collection.query.near_text(
                query=query,
                limit=sections_limit,
                return_metadata=wvq.MetadataQuery(distance=True),
                # Note: don't specify return_properties - let Weaviate return all properties,
                # including nested objects like "document", which we need for source_id
            )

            if not summaries_result.objects:
                # No summaries found - return an empty result
                return {
                    "mode": "hierarchical" if force_hierarchical else "error",
                    "sections": [],
                    "results": [],
                    "total_chunks": 0,
                    "fallback_reason": f"Aucune section pertinente trouvée (0/{sections_limit} summaries)",
                }

            # Extract section data
            sections_data = []
            for summary_obj in summaries_result.objects:
                props = summary_obj.properties

                # Try to get document.sourceId if available (the nested object may still be returned)
                doc_obj = props.get("document")
                source_id = ""
                if doc_obj and isinstance(doc_obj, dict):
                    source_id = doc_obj.get("sourceId", "")

                sections_data.append({
                    "section_path": props.get("sectionPath", ""),
                    "title": props.get("title", ""),
                    "summary_text": props.get("text", ""),
                    "level": props.get("level", 1),
                    "concepts": props.get("concepts", []),
                    "document_source_id": source_id,
                    "summary_uuid": str(summary_obj.uuid),  # Keep UUID for later retrieval if needed
                    "similarity": round((1 - summary_obj.metadata.distance) * 100, 1) if summary_obj.metadata and summary_obj.metadata.distance is not None else 0,
                })

            # Post-filter sections by author/work (Summary doesn't have a nested work object)
            if author_filter or work_filter:
                print(f"[HIERARCHICAL] Post-filtering {len(sections_data)} sections by work='{work_filter}'")
                doc_collection = client.collections.get("Document")
                filtered_sections = []

                for section in sections_data:
                    source_id = section["document_source_id"]
                    if not source_id:
                        print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (no sourceId)")
                        continue

                    # Query Document to get work metadata.
                    # Note: 'work' is a nested object, so we don't specify it in
                    # return_properties; Weaviate should return it automatically.
                    doc_result = doc_collection.query.fetch_objects(
                        filters=wvq.Filter.by_property("sourceId").equal(source_id),
                        limit=1,
                    )

                    if doc_result.objects:
                        doc_work = doc_result.objects[0].properties.get("work", {})
                        print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' doc_work type={type(doc_work)}, value={doc_work}")
                        if isinstance(doc_work, dict):
                            work_title = doc_work.get("title", "N/A")
                            work_author = doc_work.get("author", "N/A")
                            # Check filters
                            if author_filter and work_author != author_filter:
                                print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' FILTERED (author '{work_author}' != '{author_filter}')")
                                continue
                            if work_filter and work_title != work_filter:
                                print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' FILTERED (work '{work_title}' != '{work_filter}')")
                                continue

                            print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' KEPT (work='{work_title}')")
                            filtered_sections.append(section)
                        else:
                            print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (doc_work not a dict)")
                    else:
                        print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (no doc found for sourceId='{source_id}')")

                sections_data = filtered_sections
                print(f"[HIERARCHICAL] After filtering: {len(sections_data)} sections remaining")

            if not sections_data:
                # No sections match the filters - return an empty result
                filters_str = f"author={author_filter}" if author_filter else ""
                if work_filter:
                    filters_str += f", work={work_filter}" if filters_str else f"work={work_filter}"
                return {
                    "mode": "hierarchical" if force_hierarchical else "error",
                    "sections": [],
                    "results": [],
                    "total_chunks": 0,
                    "fallback_reason": f"Aucune section ne correspond aux filtres ({filters_str})",
                }

            # ═══════════════════════════════════════════════════════════════
            # STAGE 2: Search chunks for EACH section (grouped display)
            # ═══════════════════════════════════════════════════════════════
            # For each section, search chunks using the section's summary text.
            # This groups chunks under their relevant sections.

            chunk_collection = client.collections.get("Chunk")

            # Build base filters (author/work only)
            base_filters: Optional[Any] = None
            if author_filter:
                base_filters = wvq.Filter.by_property("workAuthor").equal(author_filter)
            if work_filter:
                work_filter_obj = wvq.Filter.by_property("workTitle").equal(work_filter)
                base_filters = base_filters & work_filter_obj if base_filters else work_filter_obj

            all_chunks = []
            chunks_per_section = max(3, limit // len(sections_data))  # Distribute chunks across sections

            for section in sections_data:
                # Use the section's summary text as the query to find relevant chunks.
                # This ensures chunks are semantically related to the section.
                section_query = section["summary_text"] or section["title"] or query

                # Build filters: base filters (author/work) + sectionPath filter.
                # Use .like() to match hierarchical sections (e.g., "Chapter 1*" matches "Chapter 1 > Section A").
                # This ensures each chunk only appears in its own section hierarchy.
                section_path_pattern = f"{section['section_path']}*"
                section_filters = wvq.Filter.by_property("sectionPath").like(section_path_pattern)
                if base_filters:
                    section_filters = base_filters & section_filters

                chunks_result = chunk_collection.query.near_text(
                    query=section_query,
                    limit=chunks_per_section,
                    filters=section_filters,
                    return_metadata=wvq.MetadataQuery(distance=True),
                )

                # Convert to a list and attach to the section
                section_chunks = [
                    {
                        "uuid": str(obj.uuid),
                        "distance": obj.metadata.distance if obj.metadata else None,
                        "similarity": round((1 - obj.metadata.distance) * 100, 1) if obj.metadata and obj.metadata.distance is not None else None,
                        **obj.properties
                    }
                    for obj in chunks_result.objects
                ]

                print(f"[HIERARCHICAL] Section '{section['section_path'][:50]}...' filter='{section_path_pattern[:50]}...' -> {len(section_chunks)} chunks")

                section["chunks"] = section_chunks
                section["chunks_count"] = len(section_chunks)
                all_chunks.extend(section_chunks)

            print(f"[HIERARCHICAL] Got {len(all_chunks)} chunks total across {len(sections_data)} sections")
            print(f"[HIERARCHICAL] Average {len(all_chunks) / len(sections_data):.1f} chunks per section")

            # Sort all chunks globally by similarity for the flat results list
            all_chunks.sort(key=lambda x: x.get("similarity", 0) or 0, reverse=True)

            return {
                "mode": "hierarchical",
                "sections": sections_data,
                "results": all_chunks,
                "total_chunks": len(all_chunks),
            }

        except Exception as e:
            # Handle errors within the try block (inside 'with')
            print(f"Hierarchical search error: {e}")
            import traceback
            traceback.print_exc()

            # Return an empty result (don't call simple_search here!)
            return {
                "mode": "hierarchical" if force_hierarchical else "error",
                "sections": [],
                "results": [],
                "total_chunks": 0,
                "fallback_reason": f"Erreur lors de la recherche: {str(e)}",
            }

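# Result-shape sketch (illustrative): sections carry their own nested chunks,
# while "results" flattens everything, sorted by similarity.
#
#     data = hierarchical_search("la vertu selon Platon", limit=10)
#     for section in data["sections"]:
#         print(section["title"], section["chunks_count"])
#         for chunk in section["chunks"]:
#             print("   ", chunk["similarity"], chunk["text"][:60])
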
def should_use_hierarchical_search(query: str) -> bool:
    """Detect whether a query would benefit from hierarchical two-stage search.

    Hierarchical search is recommended for:
    - Long queries (≥15 characters) indicating complex questions
    - Multi-concept queries (2+ significant words)
    - Queries with logical connectors (et, ou, mais, donc, car)

    Args:
        query: Search query text.

    Returns:
        True if hierarchical search is recommended, False for simple search.

    Examples:
        >>> should_use_hierarchical_search("justice")
        False  # Short query, single concept
        >>> should_use_hierarchical_search("Qu'est-ce que la justice selon Platon ?")
        True  # Long query, multi-concept, philosophical question
        >>> should_use_hierarchical_search("vertu et sagesse")
        True  # Multi-concept with connector
    """
    if not query or not query.strip():
        return False

    query_lower = query.lower().strip()

    # Criterion 1: long queries (≥15 chars) suggest complexity
    if len(query_lower) >= 15:
        return True

    # Criterion 2: presence of logical connectors
    connectors = ["et", "ou", "mais", "donc", "car", "parce que", "puisque", "si"]
    if any(f" {connector} " in f" {query_lower} " for connector in connectors):
        return True

    # Criterion 3: multi-concept (2+ significant words, excluding stop words)
    stop_words = {
        "le", "la", "les", "un", "une", "des", "du", "de", "d",
        "ce", "cette", "ces", "mon", "ma", "mes", "ton", "ta", "tes",
        "à", "au", "aux", "dans", "sur", "pour", "par", "avec",
        "que", "qui", "quoi", "dont", "où", "est", "sont", "a",
        "qu", "c", "l", "s", "n", "m", "t", "j", "y",
    }

    words = query_lower.split()
    significant_words = [w for w in words if len(w) > 2 and w not in stop_words]

    if len(significant_words) >= 2:
        return True

    # Default: use simple search for short, single-concept queries
    return False

def summary_only_search(
    query: str,
    limit: int = 10,
    author_filter: Optional[str] = None,
    work_filter: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Summary-only semantic search on the Summary collection (90% visibility).

    Searches high-level section summaries instead of detailed chunks. Offers
    90% visibility of rich documents vs 10% for direct chunk search, due to
    Peirce chunk dominance (5,068/5,230 = 97% of chunks).

    Args:
        query: Search query text.
        limit: Maximum number of summary results to return.
        author_filter: Filter by author name (uses document.author property).
        work_filter: Filter by work title (uses document.title property).

    Returns:
        List of summary dictionaries formatted as "results" with:
        - uuid, similarity, text, title, concepts, doc_icon, doc_name
        - author, year, chunks_count, section_path
    """
    try:
        with get_weaviate_client() as client:
            if client is None:
                return []

            summaries = client.collections.get("Summary")

            # Note: cannot filter by nested document properties directly in Weaviate v4.
            # Must fetch more and filter in Python if author/work filters are present.

            # Semantic search
            results = summaries.query.near_text(
                query=query,
                limit=limit * 3 if (author_filter or work_filter) else limit,  # Fetch more if filtering
                return_metadata=wvq.MetadataQuery(distance=True)
            )

            # Format and filter results
            formatted_results: List[Dict[str, Any]] = []
            for obj in results.objects:
                props = obj.properties
                # Guard against missing metadata and a distance of exactly 0.0
                similarity = (1 - obj.metadata.distance) if obj.metadata and obj.metadata.distance is not None else 0.0
                # Guard against a missing nested document object
                document = props.get("document") or {}

                # Apply filters (Python-side, since these are nested properties)
                if author_filter and document.get("author", "") != author_filter:
                    continue
                if work_filter and document.get("title", "") != work_filter:
                    continue

                # Determine document icon and name
                doc_id = document.get("sourceId", "").lower()
                if "tiercelin" in doc_id:
                    doc_icon = "🟡"
                    doc_name = "Tiercelin"
                elif "platon" in doc_id or "menon" in doc_id:
                    doc_icon = "🟢"
                    doc_name = "Platon"
                elif "haugeland" in doc_id:
                    doc_icon = "🟣"
                    doc_name = "Haugeland"
                elif "logique" in doc_id:
                    doc_icon = "🔵"
                    doc_name = "Logique"
                else:
                    doc_icon = "⚪"
                    doc_name = "Peirce"

                # Format result (compatible with existing template expectations)
                result = {
                    "uuid": str(obj.uuid),
                    "similarity": round(similarity * 100, 1),  # Convert to percentage
                    "text": props.get("text", ""),
                    "title": props.get("title", ""),
                    "concepts": props.get("concepts", []),
                    "doc_icon": doc_icon,
                    "doc_name": doc_name,
                    "author": document.get("author", ""),
                    "year": document.get("year", 0),
                    "chunks_count": props.get("chunksCount", 0),
                    "section_path": props.get("sectionPath", ""),
                    "sectionPath": props.get("sectionPath", ""),  # Alias for template compatibility
                    # Add work info for template compatibility
                    "work": {
                        "title": document.get("title", ""),
                        "author": document.get("author", ""),
                    },
                }

                formatted_results.append(result)

                # Stop once we have enough results after filtering
                if len(formatted_results) >= limit:
                    break

            return formatted_results

    except Exception as e:
        print(f"Error in summary_only_search: {e}")
        return []

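# Usage sketch (illustrative): entries mimic chunk results so the same search
# template can render them.
#
#     overviews = summary_only_search("le pragmatisme", limit=5)
#     for s in overviews:
#         print(s["doc_icon"], s["title"], f'{s["similarity"]}%')
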
def search_passages(
    query: str,
    limit: int = 10,
    author_filter: Optional[str] = None,
    work_filter: Optional[str] = None,
    sections_limit: int = 5,
    force_mode: Optional[str] = None,
) -> Dict[str, Any]:
    """Intelligent semantic search dispatcher with auto-detection.

    Automatically chooses between simple (1-stage), hierarchical (2-stage),
    or summary-only search based on query complexity or user selection.

    Args:
        query: Search query text.
        limit: Maximum number of chunks to return (per section if hierarchical).
        author_filter: Filter by author name (uses workAuthor property).
        work_filter: Filter by work title (uses workTitle property).
        sections_limit: Number of top sections for hierarchical search (default: 5).
        force_mode: Force search mode ("simple", "hierarchical", "summary", or None for auto).

    Returns:
        Dictionary with search results:
        - mode: "simple", "hierarchical", or "summary"
        - results: List of passage/summary dictionaries (flat)
        - sections: List of section dicts with nested chunks (hierarchical only)
        - total_chunks: Total number of chunks/summaries found

    Examples:
        >>> # Short query → auto-detects simple search
        >>> search_passages("justice", limit=10)
        {"mode": "simple", "results": [...], "total_chunks": 10}

        >>> # Complex query → auto-detects hierarchical search
        >>> search_passages("Qu'est-ce que la vertu selon Aristote ?", limit=5)
        {"mode": "hierarchical", "sections": [...], "results": [...], "total_chunks": 15}

        >>> # Force summary-only mode (90% visibility, high-level overviews)
        >>> search_passages("What is the Turing test?", force_mode="summary", limit=10)
        {"mode": "summary", "results": [...], "total_chunks": 7}
    """
    # Handle summary-only mode
    if force_mode == "summary":
        results = summary_only_search(query, limit, author_filter, work_filter)
        return {
            "mode": "summary",
            "results": results,
            "total_chunks": len(results),
        }

    # Determine the search mode: simple vs hierarchical
    if force_mode == "simple":
        use_hierarchical = False
    elif force_mode == "hierarchical":
        use_hierarchical = True
    else:
        # Auto-detection
        use_hierarchical = should_use_hierarchical_search(query)

    # Execute the appropriate search strategy
    if use_hierarchical:
        result = hierarchical_search(
            query=query,
            limit=limit,
            author_filter=author_filter,
            work_filter=work_filter,
            sections_limit=sections_limit,
            force_hierarchical=(force_mode == "hierarchical"),  # No fallback if explicitly forced
        )

        # If hierarchical search failed and wasn't forced, fall back to simple search
        if result.get("mode") == "error" and force_mode != "hierarchical":
            results = simple_search(query, limit, author_filter, work_filter)
            return {
                "mode": "simple",
                "results": results,
                "total_chunks": len(results),
            }

        return result
    else:
        results = simple_search(query, limit, author_filter, work_filter)
        return {
            "mode": "simple",
            "results": results,
            "total_chunks": len(results),
        }

# ═══════════════════════════════════════════════════════════════════════════════
# Routes
# ═══════════════════════════════════════════════════════════════════════════════

@app.route("/")
def index() -> str:
    """Render the home page with collection statistics.

    Displays an overview of the Library RAG application with statistics about
    indexed passages, works, authors, and supported languages from Weaviate.

    Returns:
        Rendered HTML template (index.html) with collection statistics including:
        - Total passage count
        - Number of unique authors and works
        - List of available languages

    Note:
        If the Weaviate connection fails, stats will be None and the template
        should handle displaying an appropriate fallback message.
    """
    stats: Optional[CollectionStats] = get_collection_stats()
    return render_template("index.html", stats=stats)

@app.route("/passages")
|
|
def passages() -> str:
|
|
"""Render the passages list page with pagination and filtering.
|
|
|
|
Displays a paginated list of all indexed passages from Weaviate with optional
|
|
filtering by author and/or work title. Includes statistics and filter options
|
|
in the sidebar.
|
|
|
|
Query Parameters:
|
|
page (int): Page number for pagination. Defaults to 1.
|
|
per_page (int): Number of passages per page. Defaults to 20.
|
|
author (str, optional): Filter passages by author name.
|
|
work (str, optional): Filter passages by work title.
|
|
|
|
Returns:
|
|
Rendered HTML template (passages.html) with:
|
|
- List of passages for the current page
|
|
- Collection statistics for sidebar filters
|
|
- Pagination controls
|
|
- Current filter state
|
|
|
|
Example:
|
|
GET /passages?page=2&per_page=50&author=Platon
|
|
Returns page 2 with 50 passages per page, filtered by author "Platon".
|
|
"""
|
|
page: int = request.args.get("page", 1, type=int)
|
|
per_page: int = request.args.get("per_page", 20, type=int)
|
|
author: Optional[str] = request.args.get("author", None)
|
|
work: Optional[str] = request.args.get("work", None)
|
|
|
|
# Clean filters
|
|
if author == "":
|
|
author = None
|
|
if work == "":
|
|
work = None
|
|
|
|
offset: int = (page - 1) * per_page
|
|
|
|
from utils.types import CollectionStats
|
|
stats: Optional[CollectionStats] = get_collection_stats()
|
|
passages_list: List[Dict[str, Any]] = get_all_passages(
|
|
limit=per_page,
|
|
offset=offset,
|
|
)
|
|
|
|
return render_template(
|
|
"passages.html",
|
|
chunks=passages_list,
|
|
stats=stats,
|
|
page=page,
|
|
per_page=per_page,
|
|
author_filter=author,
|
|
work_filter=work,
|
|
)
|
|
|
|
|
|
@app.route("/search")
|
|
def search() -> str:
|
|
"""Render the semantic search page with vector similarity results.
|
|
|
|
Provides a search interface for finding passages using semantic similarity
|
|
via Weaviate's near_text query. Results include similarity scores and can
|
|
be filtered by author and/or work.
|
|
|
|
Query Parameters:
|
|
q (str): Search query text. Empty string shows no results.
|
|
limit (int): Maximum number of chunks per section. Defaults to 10.
|
|
author (str, optional): Filter results by author name.
|
|
work (str, optional): Filter results by work title.
|
|
sections_limit (int): Number of sections for hierarchical search. Defaults to 5.
|
|
mode (str, optional): Force search mode ("simple", "hierarchical", or "" for auto).
|
|
|
|
Returns:
|
|
Rendered HTML template (search.html) with:
|
|
- Search form with current query
|
|
- List of matching passages with similarity percentages
|
|
- Collection statistics for filter dropdowns
|
|
- Current filter state
|
|
- Search mode indicator (simple vs hierarchical)
|
|
|
|
Example:
|
|
GET /search?q=la%20mort%20et%20le%20temps&limit=5§ions_limit=3
|
|
Auto-detects hierarchical search, returns top 3 sections with 5 chunks each.
|
|
"""
|
|
query: str = request.args.get("q", "")
|
|
limit: int = request.args.get("limit", 10, type=int)
|
|
author: Optional[str] = request.args.get("author", None)
|
|
work: Optional[str] = request.args.get("work", None)
|
|
sections_limit: int = request.args.get("sections_limit", 5, type=int)
|
|
mode: Optional[str] = request.args.get("mode", None)
|
|
|
|
# Clean filters
|
|
if author == "":
|
|
author = None
|
|
if work == "":
|
|
work = None
|
|
if mode == "":
|
|
mode = None
|
|
|
|
from utils.types import CollectionStats
|
|
stats: Optional[CollectionStats] = get_collection_stats()
|
|
results_data: Optional[Dict[str, Any]] = None
|
|
|
|
if query:
|
|
results_data = search_passages(
|
|
query=query,
|
|
limit=limit,
|
|
author_filter=author,
|
|
work_filter=work,
|
|
sections_limit=sections_limit,
|
|
force_mode=mode,
|
|
)
|
|
|
|
return render_template(
|
|
"search.html",
|
|
query=query,
|
|
results_data=results_data,
|
|
stats=stats,
|
|
limit=limit,
|
|
sections_limit=sections_limit,
|
|
mode=mode,
|
|
author_filter=author,
|
|
work_filter=work,
|
|
)
|
|
|
|
|
|
def rag_search(query: str, limit: int = 5) -> List[Dict[str, Any]]:
    """Search passages for RAG context with formatted results.

    Runs a direct near_text search on the Chunk collection and returns results
    formatted specifically for RAG prompt construction. Includes the author,
    work, and section information needed to build context for LLM generation.

    Args:
        query: The user's question or search query.
        limit: Maximum number of context chunks to retrieve. Defaults to 5.

    Returns:
        List of context dictionaries with keys:
        - text (str): The passage text content
        - author (str): Author name (from workAuthor)
        - work (str): Work title (from workTitle)
        - section (str): Section path or chapter title
        - similarity (float): Similarity score 0-100
        - uuid (str): Weaviate chunk UUID

    Example:
        >>> results = rag_search("Qu'est-ce que la vertu ?", limit=3)
        >>> results[0]["author"]
        'Platon'
        >>> results[0]["work"]
        'République'
    """
    start_time = time.time()

    try:
        with get_weaviate_client() as client:
            if client is None:
                print("[RAG Search] Weaviate client unavailable")
                return []

            chunks = client.collections.get("Chunk")

            # Query with the properties needed for RAG context
            result = chunks.query.near_text(
                query=query,
                limit=limit,
                return_metadata=wvq.MetadataQuery(distance=True),
                return_properties=[
                    "text",
                    "workAuthor",  # Top-level author property
                    "workTitle",  # Top-level work property
                    "sectionPath",
                    "chapterTitle",
                    "canonicalReference",
                ],
            )

            # Format results for RAG prompt construction
            formatted_results = []
            for obj in result.objects:
                props = obj.properties
                similarity = round((1 - obj.metadata.distance) * 100, 1) if obj.metadata and obj.metadata.distance is not None else 0.0

                formatted_results.append({
                    "text": props.get("text", ""),
                    "author": props.get("workAuthor", "Auteur inconnu"),
                    "work": props.get("workTitle", "Œuvre inconnue"),
                    "section": props.get("sectionPath") or props.get("chapterTitle") or "Section inconnue",
                    "similarity": similarity,
                    "uuid": str(obj.uuid),
                })

            # Log search metrics
            elapsed = time.time() - start_time
            print(f"[RAG Search] Query: '{query[:50]}...' | Results: {len(formatted_results)} | Time: {elapsed:.2f}s")

            return formatted_results

    except Exception as e:
        print(f"[RAG Search] Error: {e}")
        return []

def diverse_author_search(
    query: str,
    limit: int = 10,
    initial_pool: int = 100,
    max_authors: int = 5,
    chunks_per_author: int = 2
) -> List[Dict[str, Any]]:
    """Search passages with author diversity to avoid corpus-imbalance bias.

    This function addresses the problem where prolific authors (e.g., Peirce with
    300 works) dominate search results over less represented but equally relevant
    authors (e.g., Tiercelin with 1 work).

    Algorithm:
        1. Retrieve a large initial pool of chunks (e.g., 100)
        2. Group chunks by author
        3. Compute the average similarity score of the top-3 chunks per author
        4. Select the top-N authors by average score
        5. Extract the best chunks from each selected author
        6. Return the diversified chunk list

    Args:
        query: The user's question or search query.
        limit: Maximum number of chunks to return (default: 10).
        initial_pool: Size of the initial candidate pool (default: 100).
        max_authors: Maximum number of distinct authors to include (default: 5).
        chunks_per_author: Number of chunks per selected author (default: 2).

    Returns:
        List of context dictionaries with keys:
        - text (str): The passage text content
        - author (str): Author name (from workAuthor)
        - work (str): Work title (from workTitle)
        - section (str): Section path or chapter title
        - similarity (float): Similarity score 0-100
        - uuid (str): Weaviate chunk UUID

    Example:
        >>> results = diverse_author_search("Scotus et Peirce", limit=10)
        >>> authors = set(r["author"] for r in results)
        >>> len(authors)  # Multiple authors guaranteed
        5
        >>> [r["author"] for r in results].count("Peirce")  # Max chunks_per_author
        2

    Note:
        This prevents a single prolific author from dominating all results.
        For "Scotus et Peirce", it ensures results from Peirce, Tiercelin, Scotus,
        Boler, and other relevant commentators.
    """
    start_time = time.time()

    print(f"[Diverse Search] CALLED with query='{query[:50]}...', initial_pool={initial_pool}, max_authors={max_authors}, chunks_per_author={chunks_per_author}")

    try:
        # Step 1: Retrieve a large initial pool
        print(f"[Diverse Search] Calling rag_search with limit={initial_pool}")
        candidates = rag_search(query, limit=initial_pool)
        print(f"[Diverse Search] rag_search returned {len(candidates)} candidates")

        if not candidates:
            print("[Diverse Search] No candidates found, returning empty list")
            return []

        # Step 2: Group chunks by author
        by_author: Dict[str, List[Dict[str, Any]]] = {}
        for chunk in candidates:
            author = chunk.get("author", "Auteur inconnu")
            if author not in by_author:
                by_author[author] = []
            by_author[author].append(chunk)

        print(f"[Diverse Search] Found {len(by_author)} distinct authors in pool of {len(candidates)} chunks")

        # Step 3: Compute the average similarity of the top-3 chunks per author
        author_scores: Dict[str, float] = {}
        for author, chunks in by_author.items():
            # Sort by similarity, descending
            sorted_chunks = sorted(chunks, key=lambda x: x["similarity"], reverse=True)
            # Take the top-3 (or all if fewer than 3)
            top_chunks = sorted_chunks[:3]
            # Average similarity
            avg_score = sum(c["similarity"] for c in top_chunks) / len(top_chunks)
            author_scores[author] = avg_score

        # Step 4: Select the top-N authors by average score
        top_authors = sorted(author_scores.items(), key=lambda x: x[1], reverse=True)[:max_authors]

        print(f"[Diverse Search] Top {len(top_authors)} authors: {[author for author, score in top_authors]}")
        for author, score in top_authors:
            print(f"  - {author}: avg_score={score:.1f}%, {len(by_author[author])} chunks in pool")

        # Step 5: Extract the best chunks from each selected author.
        # SMART ALLOCATION: if only 1-2 authors, take more chunks per author to reach the target limit.
        num_authors = len(top_authors)
        if num_authors == 1:
            # Only one author: take up to 'limit' chunks from that author
            adaptive_chunks_per_author = limit
            print(f"[Diverse Search] Only 1 author found → taking up to {adaptive_chunks_per_author} chunks")
        elif num_authors <= 3:
            # Few authors (2-3): take more chunks per author
            adaptive_chunks_per_author = max(chunks_per_author, limit // num_authors)
            print(f"[Diverse Search] Only {num_authors} authors → taking up to {adaptive_chunks_per_author} chunks per author")
        else:
            # Many authors (4+): stick to the original limit for diversity
            adaptive_chunks_per_author = chunks_per_author
            print(f"[Diverse Search] {num_authors} authors → taking {adaptive_chunks_per_author} chunks per author")

        final_chunks: List[Dict[str, Any]] = []
        for author, avg_score in top_authors:
            # Get the best chunks for this author
            author_chunks = sorted(by_author[author], key=lambda x: x["similarity"], reverse=True)
            selected = author_chunks[:adaptive_chunks_per_author]
            final_chunks.extend(selected)

        # Cap at limit
        final_chunks = final_chunks[:limit]

        # Log final metrics
        final_authors = set(c["author"] for c in final_chunks)
        elapsed = time.time() - start_time
        print(f"[Diverse Search] Final: {len(final_chunks)} chunks from {len(final_authors)} authors | Time: {elapsed:.2f}s")

        return final_chunks

    except Exception as e:
        import traceback
        print(f"[Diverse Search] EXCEPTION CAUGHT: {e}")
        print(f"[Diverse Search] Traceback: {traceback.format_exc()}")
        print(f"[Diverse Search] Falling back to standard rag_search with limit={limit}")
        # Fall back to standard search
        return rag_search(query, limit)

def build_prompt_with_context(user_question: str, rag_context: List[Dict[str, Any]]) -> str:
    """Build a prompt for LLM generation using RAG context.

    Constructs a comprehensive prompt that includes a system instruction,
    formatted RAG context chunks with author/work metadata, and the user's
    question. The prompt is designed to work with all LLM providers
    (Ollama, Mistral, Anthropic, OpenAI).

    Args:
        user_question: The user's question in natural language.
        rag_context: List of context dictionaries from rag_search() with keys:
            - text: Passage text
            - author: Author name
            - work: Work title
            - section: Section or chapter
            - similarity: Similarity score (0-100)

    Returns:
        Formatted prompt string ready for LLM generation.

    Example:
        >>> context = rag_search("Qu'est-ce que la justice ?", limit=2)
        >>> prompt = build_prompt_with_context("Qu'est-ce que la justice ?", context)
        >>> print(prompt[:100])
        'Vous êtes un assistant spécialisé en philosophie...'
    """
    # System instruction (kept in French: the corpus and UI are French)
    system_instruction = """Vous êtes un assistant expert en philosophie. Votre rôle est de fournir des analyses APPROFONDIES et DÉTAILLÉES en vous appuyant sur les passages philosophiques fournis.

INSTRUCTIONS IMPÉRATIVES :
- Fournissez une réponse LONGUE et DÉVELOPPÉE (minimum 500-800 mots)
- Analysez EN PROFONDEUR tous les aspects de la question
- Citez ABONDAMMENT les passages fournis avec références précises (auteur, œuvre)
- Développez les concepts philosophiques, ne vous contentez PAS de résumés superficiels
- Explorez les NUANCES, les implications, les relations entre les idées
- Structurez votre réponse en sections claires (introduction, développement avec sous-parties, conclusion)
- Si les passages ne couvrent pas tous les aspects, indiquez-le mais développez ce qui est disponible
- Adoptez un style académique rigoureux digne d'une analyse philosophique universitaire
- N'inventez JAMAIS d'informations absentes des passages, mais exploitez à fond celles qui y sont"""

    # Build the context section
    context_section = "\n\nPASSAGES PHILOSOPHIQUES :\n\n"

    if not rag_context:
        context_section += "(Aucun passage trouvé)\n"
    else:
        for i, chunk in enumerate(rag_context, 1):
            author = chunk.get("author", "Auteur inconnu")
            work = chunk.get("work", "Œuvre inconnue")
            section = chunk.get("section", "")
            text = chunk.get("text", "")
            similarity = chunk.get("similarity", 0)

            # Truncate very long passages (keep the first 2000 chars max per chunk for deep analysis)
            if len(text) > 2000:
                text = text[:2000] + "..."

            context_section += f"**Passage {i}** [Score de pertinence: {similarity}%]\n"
            context_section += f"**Auteur :** {author}\n"
            context_section += f"**Œuvre :** {work}\n"
            if section:
                context_section += f"**Section :** {section}\n"
            context_section += f"\n{text}\n\n"
            context_section += "---\n\n"

    # User question
    question_section = f"\nQUESTION :\n{user_question}\n\n"

    # Final instruction (also in French, matching the system instruction)
    final_instruction = """CONSIGNE FINALE :
Répondez à cette question en produisant une analyse philosophique COMPLÈTE et APPROFONDIE (minimum 500-800 mots).
Votre réponse doit :
1. Commencer par une introduction contextualisant la question
2. Développer une analyse détaillée en plusieurs parties, citant abondamment les passages
3. Explorer les implications philosophiques, les concepts-clés, les relations entre les idées
4. Conclure en synthétisant l'apport des passages à la question posée

Ne vous limitez PAS à un résumé superficiel. Développez, analysez, approfondissez. C'est une discussion philosophique universitaire, pas un tweet."""

    # Combine all sections
    full_prompt = system_instruction + context_section + question_section + final_instruction

    # Truncate if too long (max ~30000 chars - modern LLMs have 128k+ context windows)
    if len(full_prompt) > 30000:
        # Reduce the number of context chunks and rebuild; this terminates
        # because each chunk's text is already capped at ~2000 chars above
        print(f"[Prompt Builder] Warning: Prompt too long ({len(full_prompt)} chars), truncating context")
        truncated_context = rag_context[:3]  # Keep only the top 3 chunks
        return build_prompt_with_context(user_question, truncated_context)

    return full_prompt

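# End-to-end sketch (illustrative; call_llm is imported from utils.llm_chat,
# with the same signature used by the /test-llm route below):
#
#     context = rag_search("Qu'est-ce que la vertu ?", limit=3)
#     prompt = build_prompt_with_context("Qu'est-ce que la vertu ?", context)
#     for token in call_llm(prompt, "ollama", "qwen2.5:7b", stream=True):
#         print(token, end="")
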
@app.route("/test-rag")
|
|
def test_rag() -> Dict[str, Any]:
|
|
"""Test endpoint for RAG search function.
|
|
|
|
Example:
|
|
GET /test-rag?q=vertu&limit=3
|
|
"""
|
|
query = request.args.get("q", "Qu'est-ce que la vertu ?")
|
|
limit = request.args.get("limit", 5, type=int)
|
|
|
|
results = rag_search(query, limit)
|
|
|
|
return jsonify({
|
|
"query": query,
|
|
"limit": limit,
|
|
"results_count": len(results),
|
|
"results": results
|
|
})
|
|
|
|
|
|
@app.route("/test-prompt")
|
|
def test_prompt() -> str:
|
|
"""Test endpoint for prompt construction with RAG context.
|
|
|
|
Example:
|
|
GET /test-prompt?q=Qu'est-ce que la justice ?&limit=3
|
|
|
|
Returns:
|
|
HTML page displaying the constructed prompt.
|
|
"""
|
|
query = request.args.get("q", "Qu'est-ce que la vertu ?")
|
|
limit = request.args.get("limit", 3, type=int)
|
|
|
|
# Get RAG context
|
|
rag_context = rag_search(query, limit)
|
|
|
|
# Build prompt
|
|
prompt = build_prompt_with_context(query, rag_context)
|
|
|
|
# Display as preformatted text in HTML
|
|
html = f"""
|
|
<!DOCTYPE html>
|
|
<html>
|
|
<head>
|
|
<title>Test Prompt RAG</title>
|
|
<style>
|
|
body {{
|
|
font-family: monospace;
|
|
padding: 2rem;
|
|
background-color: #f5f5f5;
|
|
}}
|
|
.container {{
|
|
max-width: 1000px;
|
|
margin: 0 auto;
|
|
background: white;
|
|
padding: 2rem;
|
|
border-radius: 8px;
|
|
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
|
}}
|
|
h1 {{
|
|
font-family: sans-serif;
|
|
color: #333;
|
|
}}
|
|
.info {{
|
|
background: #e3f2fd;
|
|
padding: 1rem;
|
|
border-radius: 4px;
|
|
margin-bottom: 1rem;
|
|
font-family: sans-serif;
|
|
}}
|
|
pre {{
|
|
background: #2b2b2b;
|
|
color: #f8f8f8;
|
|
padding: 1.5rem;
|
|
border-radius: 4px;
|
|
overflow-x: auto;
|
|
white-space: pre-wrap;
|
|
word-wrap: break-word;
|
|
line-height: 1.5;
|
|
}}
|
|
.stats {{
|
|
margin-top: 1rem;
|
|
padding: 1rem;
|
|
background: #f9f9f9;
|
|
border-radius: 4px;
|
|
font-family: sans-serif;
|
|
}}
|
|
</style>
|
|
</head>
|
|
<body>
|
|
<div class="container">
|
|
<h1>🧪 Test Prompt Construction RAG</h1>
|
|
<div class="info">
|
|
<strong>Question:</strong> {query}<br>
|
|
<strong>Contextes RAG:</strong> {len(rag_context)} passages<br>
|
|
<strong>Longueur prompt:</strong> {len(prompt)} caractères
|
|
</div>
|
|
<h2>Prompt généré :</h2>
|
|
<pre>{prompt}</pre>
|
|
<div class="stats">
|
|
<strong>Chunks utilisés :</strong><br>
|
|
{chr(10).join([f"- {c['author']} - {c['work']} (similarité: {c['similarity']}%)" for c in rag_context])}
|
|
</div>
|
|
</div>
|
|
</body>
|
|
</html>
|
|
"""
|
|
|
|
return html
|
|
|
|
|
|
@app.route("/test-llm")
|
|
def test_llm() -> WerkzeugResponse:
|
|
"""Test endpoint for LLM streaming.
|
|
|
|
Example:
|
|
GET /test-llm?provider=ollama&model=qwen2.5:7b&prompt=Hello
|
|
|
|
Returns:
|
|
Plain text streamed response.
|
|
"""
|
|
from utils.llm_chat import call_llm, LLMError
|
|
|
|
provider = request.args.get("provider", "ollama")
|
|
model = request.args.get("model", "qwen2.5:7b")
|
|
prompt = request.args.get("prompt", "Réponds en une phrase: Qu'est-ce que la philosophie ?")
|
|
|
|
def generate() -> Iterator[str]:
|
|
try:
|
|
yield f"[Test LLM Streaming]\n"
|
|
yield f"Provider: {provider}\n"
|
|
yield f"Model: {model}\n"
|
|
yield f"Prompt: {prompt}\n\n"
|
|
yield "Response:\n"
|
|
|
|
for token in call_llm(prompt, provider, model, stream=True):
|
|
yield token
|
|
|
|
yield "\n\n[Done]"
|
|
|
|
except LLMError as e:
|
|
yield f"\n\n[Error] {str(e)}"
|
|
except Exception as e:
|
|
yield f"\n\n[Unexpected Error] {str(e)}"
|
|
|
|
return Response(generate(), mimetype='text/plain')
|
|
|
|
|
|
@app.route("/test-chat-backend")
|
|
def test_chat_backend() -> str:
|
|
"""Test page for chat backend."""
|
|
return render_template("test_chat_backend.html")
|
|
|