diff --git a/.gitignore b/.gitignore index 0da9dd7..74ac56a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,12 +2,18 @@ generations/* !generations/library_rag/ +# Python cache and compiled files +__pycache__/ +*.pyc +*.pyo +*.pyd + # Log files logs/ +*.log .env venv -__pycache__ # Node modules (if any) node_modules/ @@ -18,4 +24,7 @@ backup_migration_*/ restoration_log.txt restoration_remaining_log.txt summary_generation_progress.json -nul \ No newline at end of file +nul + +# Archives (migration scripts moved here) +archive/ \ No newline at end of file diff --git a/08_fix_summaries_properties.py b/08_fix_summaries_properties.py deleted file mode 100644 index e74d8a0..0000000 --- a/08_fix_summaries_properties.py +++ /dev/null @@ -1,157 +0,0 @@ -"""Correctif: Ajouter workAuthor, year, language aux Summary_v2.""" - -import weaviate -import sys - -if sys.stdout.encoding != 'utf-8': - sys.stdout.reconfigure(encoding='utf-8') - -# Try to import tqdm -try: - from tqdm import tqdm - HAS_TQDM = True -except ImportError: - HAS_TQDM = False - -client = weaviate.connect_to_local() - -try: - print("=" * 80) - print("CORRECTIF: AJOUTER workAuthor, year, language À SUMMARY_V2") - print("=" * 80) - print() - - summary_v2 = client.collections.get("Summary_v2") - work_collection = client.collections.get("Work") - - # Build workTitle → Work metadata map - print("Étape 1: Mapping workTitle → Work metadata") - print("-" * 80) - - work_map = {} - - for work in work_collection.iterator(include_vector=False): - props = work.properties - title = props.get("title") - if title: - work_map[title] = { - "author": props.get("author", "Unknown"), - "year": props.get("year", 0), - "language": props.get("language", "en"), - } - - print(f"✓ {len(work_map)} mappings workTitle → metadata") - print() - - # Count total summaries - print("Étape 2: Comptage summaries") - print("-" * 80) - - print("Comptage en cours...") - total_summaries = sum(1 for _ in summary_v2.iterator(include_vector=False)) - - 
print(f"✓ {total_summaries} summaries à corriger") - print() - - # Update summaries - print("Étape 3: Mise à jour des propriétés") - print("-" * 80) - print() - - updated = 0 - skipped = 0 - errors = [] - - # Create iterator with or without tqdm - if HAS_TQDM: - iterator = tqdm( - summary_v2.iterator(include_vector=False), - total=total_summaries, - desc="Mise à jour", - unit="summaries" - ) - else: - iterator = summary_v2.iterator(include_vector=False) - print("Mise à jour en cours...") - - for idx, summary in enumerate(iterator, 1): - props = summary.properties - - try: - work_title = props.get("workTitle") - - if not work_title: - errors.append(f"Summary {summary.uuid}: pas de workTitle") - skipped += 1 - continue - - # Get work metadata - work_metadata = work_map.get(work_title) - if not work_metadata: - errors.append(f"Summary {summary.uuid}: Work '{work_title}' introuvable") - skipped += 1 - continue - - # Check if already updated (workAuthor exists) - if props.get("workAuthor") is not None: - skipped += 1 - continue - - # Update properties - summary_v2.data.update( - uuid=summary.uuid, - properties={ - "workAuthor": work_metadata["author"], - "year": work_metadata["year"], - "language": work_metadata["language"], - } - ) - - updated += 1 - - # Progress without tqdm - if not HAS_TQDM and idx % 10 == 0: - print(f" {idx}/{total_summaries} summaries traités...") - - except Exception as e: - errors.append(f"Summary {summary.uuid}: {e}") - - print() - print("-" * 80) - print(f"✓ Total mis à jour: {updated}/{total_summaries}") - print(f" Déjà à jour: {skipped}") - - if errors: - print(f"⚠️ Erreurs rencontrées: {len(errors)}") - print() - print("Premières erreurs:") - for err in errors[:10]: - print(f" - {err}") - if len(errors) > 10: - print(f" ... 
et {len(errors) - 10} autres") - - print() - print("=" * 80) - print("CORRECTIF TERMINÉ") - print("=" * 80) - print() - - if updated == total_summaries: - print("✅ Tous les summaries ont été mis à jour") - print() - print("Propriétés ajoutées:") - print(" ✓ workAuthor (auteur de l'œuvre)") - print(" ✓ year (année de publication)") - print(" ✓ language (langue du texte)") - print() - print("VÉRIFICATION:") - print(" python -c \"from verify_summaries import verify; verify()\"") - elif updated > 0: - print(f"⚠️ {updated}/{total_summaries} summaries mis à jour") - print(" Vérifier les erreurs") - else: - print("❌ Aucun summary mis à jour") - print(" Corriger les erreurs et relancer") - -finally: - client.close() diff --git a/09_rechunk_oversized.py b/09_rechunk_oversized.py deleted file mode 100644 index 5083b9e..0000000 --- a/09_rechunk_oversized.py +++ /dev/null @@ -1,267 +0,0 @@ -"""Script to re-chunk oversized chunks (> 2000 tokens) in Chunk_v2. - -This script identifies chunks that are too large (> 2000 tokens) and splits them -into smaller chunks with overlap (max 1000 words, overlap 100 words). - -Steps: - 1. Identify all chunks > 2000 tokens in Chunk_v2 - 2. Re-chunk using simple_chunk_with_overlap (1000 words max, 100 overlap) - 3. Delete the original oversized chunk - 4. Insert new smaller chunks with preserved metadata - 5. 
Update Summary_v2 chunksCount if needed -""" - -import weaviate -import sys -from pathlib import Path - -# Add utils to path -sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag")) - -from utils.llm_chunker_improved import simple_chunk_with_overlap, estimate_tokens - -if sys.stdout.encoding != 'utf-8': - sys.stdout.reconfigure(encoding='utf-8') - -# Try to import tqdm -try: - from tqdm import tqdm - HAS_TQDM = True -except ImportError: - HAS_TQDM = False - -# Constants -TOKEN_THRESHOLD = 2000 # Chunks > 2000 tokens will be re-chunked -MAX_WORDS = 1000 -OVERLAP_WORDS = 100 - -client = weaviate.connect_to_local() - -try: - print("=" * 80) - print("RE-CHUNKING DES CHUNKS SURDIMENSIONNÉS") - print("=" * 80) - print() - - chunk_v2 = client.collections.get("Chunk_v2") - work_collection = client.collections.get("Work") - - # ========== 1. IDENTIFIER LES CHUNKS PROBLÉMATIQUES ========== - print("1. IDENTIFICATION DES CHUNKS > 2000 TOKENS") - print("-" * 80) - print() - - oversized_chunks = [] - - print("Analyse en cours...") - for chunk in chunk_v2.iterator(include_vector=False): - props = chunk.properties - text = props.get('text', '') - tokens = estimate_tokens(text) - - if tokens > TOKEN_THRESHOLD: - oversized_chunks.append({ - 'uuid': str(chunk.uuid), - 'tokens': tokens, - 'chars': len(text), - 'text': text, - 'workTitle': props.get('workTitle', ''), - 'workAuthor': props.get('workAuthor', ''), - 'year': props.get('year', 0), - 'language': props.get('language', 'en'), - 'sectionPath': props.get('sectionPath', ''), - 'chapterTitle': props.get('chapterTitle', ''), - 'canonicalReference': props.get('canonicalReference', ''), - 'unitType': props.get('unitType', 'main_content'), - 'keywords': props.get('keywords', []), - 'orderIndex': props.get('orderIndex', 0), - }) - - print(f"✓ {len(oversized_chunks)} chunks > {TOKEN_THRESHOLD} tokens trouvés") - print() - - if not oversized_chunks: - print("✅ Aucun chunk surdimensionné à traiter") - print() - 
print("=" * 80) - print("SCRIPT TERMINÉ - RIEN À FAIRE") - print("=" * 80) - sys.exit(0) - - # Trier par taille - oversized_chunks.sort(key=lambda x: x['tokens'], reverse=True) - - print("Top 5 plus gros chunks:") - for i, chunk in enumerate(oversized_chunks[:5], 1): - print(f"{i}. {chunk['tokens']:,} tokens ({chunk['chars']:,} chars)") - print(f" Œuvre: {chunk['workTitle']}") - print(f" Section: {chunk['sectionPath'][:60]}...") - print() - - if len(oversized_chunks) > 5: - print(f"... et {len(oversized_chunks) - 5} autres") - - print() - - # ========== 2. RE-CHUNKING ========== - print("2. RE-CHUNKING AVEC OVERLAP") - print("-" * 80) - print() - - # Build work_title -> work_uuid map for references - work_map = {} - for work in work_collection.iterator(include_vector=False): - props = work.properties - title = props.get("title") - if title: - work_map[title] = str(work.uuid) - - print(f"✓ {len(work_map)} Works mappés") - print() - - deleted_count = 0 - inserted_count = 0 - errors = [] - - # Create iterator with or without tqdm - if HAS_TQDM: - iterator = tqdm( - oversized_chunks, - desc="Re-chunking", - unit="chunks" - ) - else: - iterator = oversized_chunks - print("Re-chunking en cours...") - - for idx, old_chunk in enumerate(iterator, 1): - try: - # Re-chunk text - new_texts = simple_chunk_with_overlap( - old_chunk['text'], - max_words=MAX_WORDS, - overlap_words=OVERLAP_WORDS - ) - - # Get work reference - work_uuid = work_map.get(old_chunk['workTitle']) - if not work_uuid: - errors.append(f"Chunk {old_chunk['uuid'][:8]}: Work '{old_chunk['workTitle']}' introuvable") - continue - - # Insert new chunks - for i, new_text in enumerate(new_texts): - # Sub-ordering: multiply base index by 100 and add part index - # Example: orderIndex=5 becomes 500, 501, 502, etc. 
- new_order_index = (old_chunk['orderIndex'] * 100) + i - - new_props = { - "text": new_text, - "summary": "", # Empty summary for simple chunks - "keywords": old_chunk['keywords'], - "workTitle": old_chunk['workTitle'], - "workAuthor": old_chunk['workAuthor'], - "year": old_chunk['year'], - "language": old_chunk['language'], - "sectionPath": old_chunk['sectionPath'], - "chapterTitle": old_chunk['chapterTitle'], - "canonicalReference": old_chunk['canonicalReference'], - "unitType": old_chunk['unitType'], - "orderIndex": new_order_index, - } - - chunk_v2.data.insert( - properties=new_props, - references={"work": work_uuid} - ) - inserted_count += 1 - - # Delete old chunk - chunk_v2.data.delete_by_id(old_chunk['uuid']) - deleted_count += 1 - - # Progress without tqdm - if not HAS_TQDM and idx % 5 == 0: - print(f" {idx}/{len(oversized_chunks)} chunks traités...") - - except Exception as e: - errors.append(f"Chunk {old_chunk['uuid'][:8]}: {e}") - - print() - print("-" * 80) - print(f"✓ Chunks supprimés: {deleted_count}") - print(f"✓ Nouveaux chunks créés: {inserted_count}") - if deleted_count > 0: - print(f" Expansion moyenne: {inserted_count / deleted_count:.1f}x") - else: - print(f" ⚠️ Aucun chunk supprimé - vérifier les erreurs") - - if errors: - print() - print(f"⚠️ Erreurs rencontrées: {len(errors)}") - for err in errors[:10]: - print(f" - {err}") - if len(errors) > 10: - print(f" ... et {len(errors) - 10} autres") - - print() - - # ========== 3. VÉRIFICATION ========== - print("3. 
VÉRIFICATION POST-RECHUNKING") - print("-" * 80) - print() - - print("Comptage des nouveaux chunks...") - remaining_oversized = 0 - total_chunks = 0 - - for chunk in chunk_v2.iterator(include_vector=False): - total_chunks += 1 - text = chunk.properties.get('text', '') - tokens = estimate_tokens(text) - if tokens > TOKEN_THRESHOLD: - remaining_oversized += 1 - - print(f"✓ Total chunks: {total_chunks:,}") - print(f"✓ Chunks > {TOKEN_THRESHOLD} tokens: {remaining_oversized}") - - if remaining_oversized == 0: - print() - print("✅ Aucun chunk surdimensionné restant!") - else: - print() - print(f"⚠️ {remaining_oversized} chunks encore > {TOKEN_THRESHOLD} tokens") - print(" Relancer le script si nécessaire") - - print() - print("=" * 80) - print("RE-CHUNKING TERMINÉ") - print("=" * 80) - print() - - print("RÉSULTATS:") - print(f" • Chunks supprimés: {deleted_count}") - print(f" • Nouveaux chunks créés: {inserted_count}") - if deleted_count > 0: - print(f" • Expansion: {inserted_count / deleted_count:.1f}x") - print(f" • Chunks restants > {TOKEN_THRESHOLD} tokens: {remaining_oversized}") - print() - - if remaining_oversized == 0 and deleted_count > 0: - print("✅ RE-CHUNKING RÉUSSI") - print() - print("AMÉLIORATIONS:") - print(f" • {deleted_count} chunks géants éliminés") - print(f" • {inserted_count} chunks optimaux créés") - print(f" • Taille max: {MAX_WORDS} mots (~{MAX_WORDS * 2.5:.0f} tokens)") - print(f" • Overlap: {OVERLAP_WORDS} mots (contexte préservé)") - print() - print("PROCHAINES ÉTAPES:") - print(" 1. Tester la recherche sémantique") - print(" 2. Vérifier la qualité des vecteurs") - print(" 3. 
Optionnel: Mettre à jour Summary_v2.chunksCount si nécessaire") - elif deleted_count == 0: - print("ℹ️ Aucun chunk n'a nécessité de re-chunking") - -finally: - client.close() diff --git a/10_test_search_quality.py b/10_test_search_quality.py deleted file mode 100644 index 86a4e28..0000000 --- a/10_test_search_quality.py +++ /dev/null @@ -1,402 +0,0 @@ -"""Test search quality with re-chunked data. - -This script tests semantic search to verify that the re-chunking improved -search quality and relevance. - -Tests: - 1. Chunk size distribution after re-chunking - 2. Overlap verification between consecutive chunks - 3. Semantic search quality on various queries - 4. Comparison of results from giant chunks vs optimized chunks -""" - -import weaviate -import sys -import requests -from pathlib import Path - -# Add utils to path -sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag")) - -from utils.llm_chunker_improved import estimate_tokens - -if sys.stdout.encoding != 'utf-8': - sys.stdout.reconfigure(encoding='utf-8') - -# Vectorizer URL (same as in 11_vectorize_missing_chunks.py) -VECTORIZER_URL = "http://localhost:8090/vectors" - -def vectorize_query(query: str) -> list[float]: - """Manually vectorize a query using text2vec-transformers service. 
- - Args: - query: Query text to vectorize - - Returns: - Vector as list of floats (1024 dimensions for BGE-M3) - """ - response = requests.post( - VECTORIZER_URL, - json={"text": query}, - headers={"Content-Type": "application/json"}, - timeout=30 - ) - if response.status_code != 200: - raise RuntimeError(f"Vectorization failed: HTTP {response.status_code}") - - result = response.json() - vector = result.get('vector') - if not vector: - raise RuntimeError("No vector in response") - - return vector - -client = weaviate.connect_to_local() - -try: - print("=" * 80) - print("TEST DE LA QUALITÉ DE RECHERCHE APRÈS RE-CHUNKING") - print("=" * 80) - print() - - chunk_v2 = client.collections.get("Chunk_v2") - - # ========== 1. DISTRIBUTION DES TAILLES ========== - print("1. DISTRIBUTION DES TAILLES DE CHUNKS") - print("-" * 80) - print() - - print("Analyse en cours...") - - sizes = [] - for chunk in chunk_v2.iterator(include_vector=False): - text = chunk.properties.get('text', '') - tokens = estimate_tokens(text) - sizes.append(tokens) - - total = len(sizes) - avg = sum(sizes) / total - max_size = max(sizes) - min_size = min(sizes) - - print(f"Total chunks: {total:,}") - print(f"Taille moyenne: {avg:.0f} tokens") - print(f"Min: {min_size} tokens") - print(f"Max: {max_size} tokens") - print() - - # Distribution par tranches - ranges = [ - (0, 500, "Très petits"), - (500, 1000, "Petits"), - (1000, 1500, "Moyens"), - (1500, 2000, "Grands"), - (2000, 3000, "Très grands"), - (3000, 10000, "ÉNORMES"), - ] - - print("Distribution par tranches:") - for min_tok, max_tok, label in ranges: - count = sum(1 for s in sizes if min_tok <= s < max_tok) - percentage = count / total * 100 - bar = "█" * int(percentage / 2) - print(f" {min_tok:>5}-{max_tok:>5} tokens ({label:15}): {count:>5} ({percentage:>5.1f}%) {bar}") - - print() - - # ========== 2. VÉRIFICATION OVERLAP ========== - print("2. 
VÉRIFICATION DE L'OVERLAP ENTRE CHUNKS CONSÉCUTIFS") - print("-" * 80) - print() - - # Prendre une œuvre pour vérifier l'overlap - print("Analyse de l'overlap dans 'Between Past and Future'...") - - arendt_chunks = [] - for chunk in chunk_v2.iterator(include_vector=False): - props = chunk.properties - if props.get('workTitle') == 'Between Past and Future': - arendt_chunks.append({ - 'orderIndex': props.get('orderIndex', 0), - 'text': props.get('text', ''), - 'sectionPath': props.get('sectionPath', '') - }) - - # Trier par orderIndex - arendt_chunks.sort(key=lambda x: x['orderIndex']) - - print(f"Chunks trouvés: {len(arendt_chunks)}") - print() - - # Vérifier overlap entre chunks consécutifs de même section - overlaps_found = 0 - overlaps_checked = 0 - - for i in range(len(arendt_chunks) - 1): - current = arendt_chunks[i] - next_chunk = arendt_chunks[i + 1] - - # Vérifier si même section (potentiellement des chunks split) - if current['sectionPath'] == next_chunk['sectionPath']: - # Extraire les derniers 200 caractères du chunk actuel - current_end = current['text'][-200:].strip() - # Extraire les premiers 200 caractères du chunk suivant - next_start = next_chunk['text'][:200].strip() - - # Chercher overlap - overlap_found = False - for length in range(50, 201, 10): # Tester différentes longueurs - if len(current_end) < length or len(next_start) < length: - continue - - test_end = current_end[-length:] - if test_end in next_start: - overlap_found = True - overlaps_found += 1 - break - - overlaps_checked += 1 - - if overlaps_checked > 0: - print(f"Chunks consécutifs vérifiés: {overlaps_checked}") - print(f"Overlaps détectés: {overlaps_found} ({overlaps_found/overlaps_checked*100:.1f}%)") - else: - print("Aucun chunk consécutif dans la même section (pas de split détecté)") - - print() - - # ========== 3. TESTS DE RECHERCHE SÉMANTIQUE ========== - print("3. 
TESTS DE RECHERCHE SÉMANTIQUE") - print("-" * 80) - print() - - test_queries = [ - { - "query": "What is the nature of representation in cognitive science?", - "expected_work": "Mind Design III", - "description": "Requête philosophique complexe" - }, - { - "query": "Comment définit-on la vertu selon Platon?", - "expected_work": "Platon - Ménon", - "description": "Requête en français sur un concept spécifique" - }, - { - "query": "pragmatism and belief fixation", - "expected_work": "Collected papers", - "description": "Concepts multiples (test de granularité)" - }, - { - "query": "Entre la logique des termes et la grammaire spéculative", - "expected_work": "La pensée-signe", - "description": "Requête technique académique" - }, - ] - - for i, test in enumerate(test_queries, 1): - print(f"Test {i}: {test['description']}") - print(f"Query: \"{test['query']}\"") - print() - - # Vectorize query and search with near_vector - # (Chunk_v2 has no vectorizer, so we must manually vectorize queries) - query_vector = vectorize_query(test['query']) - result = chunk_v2.query.near_vector( - near_vector=query_vector, - limit=5, - return_properties=[ - 'text', 'workTitle', 'workAuthor', - 'sectionPath', 'chapterTitle' - ], - return_metadata=['distance'] - ) - - if not result.objects: - print(" ❌ Aucun résultat trouvé") - print() - continue - - # Analyser les résultats - print(f" Résultats: {len(result.objects)}") - print() - - for j, obj in enumerate(result.objects, 1): - props = obj.properties - work_title = props.get('workTitle', 'N/A') - text = props.get('text', '') - tokens = estimate_tokens(text) - - # Distance (si disponible) - distance = getattr(obj.metadata, 'distance', None) if hasattr(obj, 'metadata') else None - distance_str = f" (distance: {distance:.4f})" if distance else "" - - # Marquer si c'est l'œuvre attendue - match_icon = "✓" if test['expected_work'] in work_title else " " - - print(f" [{match_icon}] {j}. 
{work_title}{distance_str}") - print(f" Taille: {tokens} tokens") - print(f" Section: {props.get('sectionPath', 'N/A')[:60]}...") - print(f" Extrait: {text[:120]}...") - print() - - # Vérifier si l'œuvre attendue est dans les résultats - found_expected = any( - test['expected_work'] in obj.properties.get('workTitle', '') - for obj in result.objects - ) - - if found_expected: - rank = next( - i for i, obj in enumerate(result.objects, 1) - if test['expected_work'] in obj.properties.get('workTitle', '') - ) - print(f" ✅ Œuvre attendue trouvée (rang {rank}/5)") - else: - print(f" ⚠️ Œuvre attendue '{test['expected_work']}' non trouvée dans le top 5") - - print() - print("-" * 80) - print() - - # ========== 4. STATISTIQUES GLOBALES ========== - print("4. STATISTIQUES GLOBALES DE RECHERCHE") - print("-" * 80) - print() - - # Tester une requête large - broad_query = "philosophy and logic" - print(f"Requête large: \"{broad_query}\"") - print() - - query_vector = vectorize_query(broad_query) - result = chunk_v2.query.near_vector( - near_vector=query_vector, - limit=20, - return_properties=['workTitle', 'text'] - ) - - # Compter par œuvre - work_distribution = {} - chunk_sizes_in_results = [] - - for obj in result.objects: - props = obj.properties - work = props.get('workTitle', 'Unknown') - work_distribution[work] = work_distribution.get(work, 0) + 1 - - text = props.get('text', '') - tokens = estimate_tokens(text) - chunk_sizes_in_results.append(tokens) - - print(f"Résultats par œuvre (top 20):") - for work, count in sorted(work_distribution.items(), key=lambda x: x[1], reverse=True): - print(f" • {work}: {count} chunks") - - print() - - if chunk_sizes_in_results: - avg_result_size = sum(chunk_sizes_in_results) / len(chunk_sizes_in_results) - max_result_size = max(chunk_sizes_in_results) - print(f"Taille moyenne des chunks retournés: {avg_result_size:.0f} tokens") - print(f"Taille max des chunks retournés: {max_result_size} tokens") - - print() - - # ========== 5. 
SCORE DE QUALITÉ ========== - print("5. SCORE DE QUALITÉ DE LA RECHERCHE") - print("-" * 80) - print() - - quality_checks = [] - - # Check 1: Aucun chunk > 2000 tokens - oversized = sum(1 for s in sizes if s > 2000) - quality_checks.append({ - 'name': 'Taille des chunks', - 'passed': oversized == 0, - 'detail': f'{oversized} chunks > 2000 tokens' - }) - - # Check 2: Distribution équilibrée - optimal_range = sum(1 for s in sizes if 200 <= s <= 1500) - optimal_percentage = optimal_range / total * 100 - quality_checks.append({ - 'name': 'Distribution optimale', - 'passed': optimal_percentage >= 80, - 'detail': f'{optimal_percentage:.1f}% dans range optimal (200-1500 tokens)' - }) - - # Check 3: Résultats variés - unique_works = len(work_distribution) - quality_checks.append({ - 'name': 'Diversité des résultats', - 'passed': unique_works >= 3, - 'detail': f'{unique_works} œuvres différentes dans top 20' - }) - - # Check 4: Overlap présent - quality_checks.append({ - 'name': 'Overlap entre chunks', - 'passed': overlaps_found > 0 if overlaps_checked > 0 else None, - 'detail': f'{overlaps_found}/{overlaps_checked} overlaps détectés' if overlaps_checked > 0 else 'N/A' - }) - - # Afficher les résultats - passed = sum(1 for c in quality_checks if c['passed'] is True) - total_checks = sum(1 for c in quality_checks if c['passed'] is not None) - - for check in quality_checks: - if check['passed'] is None: - icon = "⚠️" - status = "N/A" - elif check['passed']: - icon = "✅" - status = "OK" - else: - icon = "❌" - status = "FAIL" - - print(f"{icon} {check['name']}: {status}") - print(f" {check['detail']}") - - print() - print(f"Score: {passed}/{total_checks} ({passed/total_checks*100:.0f}%)") - print() - - # ========== 6. 
RÉSUMÉ ========== - print("=" * 80) - print("RÉSUMÉ DU TEST") - print("=" * 80) - print() - - if passed >= total_checks * 0.8: - print("✅ QUALITÉ DE RECHERCHE: EXCELLENTE") - print() - print("Les chunks re-chunkés ont amélioré la recherche:") - print(f" • {total:,} chunks optimisés") - print(f" • Taille moyenne: {avg:.0f} tokens (optimal)") - print(f" • {optimal_percentage:.1f}% dans la plage optimale") - print(f" • Max: {max_size} tokens (< 2500)") - print(f" • Overlap détecté: {overlaps_found > 0 if overlaps_checked > 0 else 'N/A'}") - print() - print("Recommandations:") - print(" ✓ La recherche sémantique fonctionne correctement") - print(" ✓ Les chunks sont de taille optimale pour BGE-M3") - print(" ✓ Le système est prêt pour la production") - elif passed >= total_checks * 0.6: - print("⚠️ QUALITÉ DE RECHERCHE: BONNE") - print() - print("Quelques améliorations possibles:") - for check in quality_checks: - if not check['passed'] and check['passed'] is not None: - print(f" • {check['name']}: {check['detail']}") - else: - print("❌ QUALITÉ DE RECHERCHE: À AMÉLIORER") - print() - print("Problèmes détectés:") - for check in quality_checks: - if not check['passed'] and check['passed'] is not None: - print(f" • {check['name']}: {check['detail']}") - -finally: - client.close() diff --git a/11_vectorize_missing_chunks.py b/11_vectorize_missing_chunks.py deleted file mode 100644 index 7dd7aaf..0000000 --- a/11_vectorize_missing_chunks.py +++ /dev/null @@ -1,217 +0,0 @@ -"""Vectorize chunks that don't have vectors. - -After re-chunking, new chunks were created without vectors because Chunk_v2 -collection has no vectorizer configured. This script manually vectorizes -these chunks using the text2vec-transformers service. 
-""" - -import weaviate -import sys -import requests - -if sys.stdout.encoding != 'utf-8': - sys.stdout.reconfigure(encoding='utf-8') - -# Try to import tqdm -try: - from tqdm import tqdm - HAS_TQDM = True -except ImportError: - HAS_TQDM = False - -# Text2vec-transformers service URL (from docker-compose.yml) -VECTORIZER_URL = "http://localhost:8090/vectors" - -client = weaviate.connect_to_local() - -try: - print("=" * 80) - print("VECTORISATION DES CHUNKS SANS VECTEUR") - print("=" * 80) - print() - - chunk_v2 = client.collections.get("Chunk_v2") - - # ========== 1. IDENTIFIER LES CHUNKS SANS VECTEUR ========== - print("1. IDENTIFICATION DES CHUNKS SANS VECTEUR") - print("-" * 80) - print() - - print("Analyse en cours...") - - chunks_to_vectorize = [] - - for chunk in chunk_v2.iterator(include_vector=True): - if not chunk.vector or not chunk.vector.get('default'): - props = chunk.properties - chunks_to_vectorize.append({ - 'uuid': chunk.uuid, - 'text': props.get('text', ''), - 'summary': props.get('summary', ''), - 'keywords': props.get('keywords', []), - 'workTitle': props.get('workTitle', 'N/A') - }) - - print(f"✓ {len(chunks_to_vectorize)} chunks sans vecteur trouvés") - print() - - if not chunks_to_vectorize: - print("✅ Aucun chunk à vectoriser") - print() - print("=" * 80) - print("SCRIPT TERMINÉ - RIEN À FAIRE") - print("=" * 80) - sys.exit(0) - - # ========== 2. VECTORISATION ========== - print("2. 
VECTORISATION DES CHUNKS") - print("-" * 80) - print() - - print(f"Service vectorizer: {VECTORIZER_URL}") - print() - - vectorized_count = 0 - errors = [] - - # Create iterator with or without tqdm - if HAS_TQDM: - iterator = tqdm( - chunks_to_vectorize, - desc="Vectorisation", - unit="chunks" - ) - else: - iterator = chunks_to_vectorize - print("Vectorisation en cours...") - - for idx, chunk_data in enumerate(iterator, 1): - try: - # Prepare text for vectorization - # Combine text, summary, and keywords as per original Chunk schema - text_parts = [chunk_data['text']] - - if chunk_data['summary']: - text_parts.append(chunk_data['summary']) - - if chunk_data['keywords']: - text_parts.append(' '.join(chunk_data['keywords'])) - - combined_text = ' '.join(text_parts) - - # Call text2vec-transformers service - response = requests.post( - VECTORIZER_URL, - json={"text": combined_text}, - headers={"Content-Type": "application/json"}, - timeout=30 - ) - - if response.status_code != 200: - errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: HTTP {response.status_code}") - continue - - result = response.json() - vector = result.get('vector') - - if not vector: - errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: Pas de vecteur dans la réponse") - continue - - # Update chunk with vector - chunk_v2.data.update( - uuid=chunk_data['uuid'], - vector=vector - ) - - vectorized_count += 1 - - # Progress without tqdm - if not HAS_TQDM and idx % 10 == 0: - print(f" {idx}/{len(chunks_to_vectorize)} chunks vectorisés...") - - except requests.exceptions.RequestException as e: - errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: Erreur réseau - {e}") - except Exception as e: - errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: {e}") - - print() - print("-" * 80) - print(f"✓ Chunks vectorisés: {vectorized_count}/{len(chunks_to_vectorize)}") - - if errors: - print() - print(f"⚠️ Erreurs rencontrées: {len(errors)}") - for err in errors[:10]: - print(f" - {err}") - if len(errors) > 10: 
- print(f" ... et {len(errors) - 10} autres") - - print() - - # ========== 3. VÉRIFICATION ========== - print("3. VÉRIFICATION POST-VECTORISATION") - print("-" * 80) - print() - - print("Recomptage...") - remaining_without_vector = 0 - total_chunks = 0 - - for chunk in chunk_v2.iterator(include_vector=True): - total_chunks += 1 - if not chunk.vector or not chunk.vector.get('default'): - remaining_without_vector += 1 - - chunks_with_vector = total_chunks - remaining_without_vector - - print(f"✓ Total chunks: {total_chunks:,}") - print(f"✓ Avec vecteur: {chunks_with_vector:,} ({chunks_with_vector/total_chunks*100:.1f}%)") - print(f"✓ Sans vecteur: {remaining_without_vector:,}") - - print() - - if remaining_without_vector == 0: - print("✅ Tous les chunks ont été vectorisés!") - else: - print(f"⚠️ {remaining_without_vector} chunks encore sans vecteur") - print(" Relancer le script ou vérifier les erreurs") - - print() - print("=" * 80) - print("VECTORISATION TERMINÉE") - print("=" * 80) - print() - - if remaining_without_vector == 0: - print("✅ VECTORISATION RÉUSSIE") - print() - print("RÉSULTATS:") - print(f" • {vectorized_count} nouveaux vecteurs créés") - print(f" • {total_chunks:,} chunks totaux") - print(f" • 100% des chunks ont des vecteurs") - print() - print("PROCHAINES ÉTAPES:") - print(" 1. Relancer le test de recherche: python 10_test_search_quality.py") - print(" 2. 
Tester l'application Flask") - print() - print("NOTE: Chunk_v2 n'a toujours pas de vectorizer configuré.") - print("Les futurs nouveaux chunks devront être vectorisés manuellement") - print("OU la collection devra être recréée avec un vectorizer.") - elif vectorized_count > 0: - print("⚠️ VECTORISATION PARTIELLE") - print() - print(f" • {vectorized_count} chunks vectorisés") - print(f" • {remaining_without_vector} chunks restants") - print(" • Vérifier les erreurs et relancer") - else: - print("❌ VECTORISATION ÉCHOUÉE") - print() - print("Aucun chunk n'a pu être vectorisé.") - print("Vérifications:") - print(f" 1. Service text2vec-transformers actif: {VECTORIZER_URL}") - print(" 2. Docker containers en cours d'exécution") - print(" 3. Logs des erreurs ci-dessus") - -finally: - client.close() diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..0dbe17c --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,136 @@ +# Changelog - Library RAG Project + +## 2026-01-08 - Chunking Optimization & Vectorization + +### Chunking Improvements +- **Strict chunk size limits**: Max 1000 words (down from 1500-2000) +- **Overlap implementation**: 100-word overlap between consecutive chunks +- **Triple fallback system**: Ensures robust chunking even on LLM failures +- **New module**: `llm_chunker_improved.py` with overlap functionality + +### Re-chunking Results +- Identified 31 oversized chunks (>2000 tokens, max 7,158) +- Split into 92 optimally-sized chunks +- **Result**: 0 chunks > 2000 tokens (100% within BGE-M3 limits) +- Preserved all metadata during split (workTitle, workAuthor, sectionPath, orderIndex) + +### Vectorization +- Created manual vectorization system for Chunk_v2 (no vectorizer configured) +- Successfully vectorized 92 new chunks via text2vec-transformers API +- **Result**: 5,304/5,304 chunks with vectors (100% coverage) + +### Docker Configuration +- Exposed text2vec-transformers port (8090:8080) for external vectorization +- Added cluster 
configuration to fix "No private IP address found" error +- Increased WORKER_TIMEOUT to 600s for very large chunks + +### Search Quality +- Created comprehensive test suite (`10_test_search_quality.py`) +- Tests: distribution, overlap detection, semantic search (4 queries) +- Search now uses `near_vector()` with manual query vectorization +- **Issue identified**: Collected papers dominates results (95.8% of chunks) + +### Database Stats (Post-Optimization) +- Total chunks: 5,304 +- Average size: 289 tokens (optimal for BGE-M3) +- Distribution: 84.6% < 500 tokens, 11.5% 500-1000, 3.0% 1000-1500 +- Works: 8 (Collected papers: 5,080 chunks, Mind Design III: 61, Platon Ménon: 56, etc.) + +--- + +## 2025-01 - Weaviate v2 Migration & GPU Integration + +### Phase 1-3: Schema Migration (Complete) +- Migrated from Chunk/Summary/Document to Chunk_v2/Summary_v2/Work +- Removed nested `document` object, added direct properties (workTitle, workAuthor, year, language) +- Work collection with sourceId for documents +- Fixed 114 summaries missing properties +- Deleted vL-jepa chunks (17), fixed null workTitles + +### Phase 4: Memory System (Complete) +- Added Thought/Message/Conversation collections to Weaviate +- 9 MCP tools for memory management (add_thought, search_thoughts, etc.) 
+- GPU embeddings integration (BAAI/bge-m3, RTX 4070) +- Data: 102 Thoughts, 377 Messages, 12 Conversations + +### Phase 5: Backend Integration (Complete) +- Integrated GPU embedder into Flask app (singleton pattern) +- All search routes now use manual vectorization with `near_vector()` +- Updated all routes: simple_search, hierarchical_search, summary_only_search, rag_search +- Fixed Work → Chunk/Summary property mapping (v2 schema) + +### Phase 6-7: Testing & Optimization +- Comprehensive testing of search routes +- MCP tools validation +- Performance optimization with GPU embeddings +- Documentation updates (README.md, CLAUDE.md) + +### Phase 8: Documentation Cleanup +- Consolidated all phase documentation +- Updated README with Memory MCP tools section +- Cleaned up temporary files and scripts + +--- + +## Archive Structure + +``` +archive/ +├── migration_scripts/ # Migration & optimization scripts (01-11) +│ ├── 01_migrate_document_to_work.py +│ ├── 02_create_schema_v2.py +│ ├── 03_migrate_chunks_v2.py +│ ├── 04_migrate_summaries_v2.py +│ ├── 05_validate_migration.py +│ ├── 07_cleanup.py +│ ├── 08_fix_summaries_properties.py +│ ├── 09_rechunk_oversized.py +│ ├── 10_test_search_quality.py +│ ├── 11_vectorize_missing_chunks.py +│ └── old_scripts/ # ChromaDB migration scripts +├── migration_docs/ # Detailed migration documentation +│ ├── PLAN_MIGRATION_V2_SANS_DOCUMENT.md +│ ├── PHASE5_BACKEND_INTEGRATION.md +│ └── WEAVIATE_RETRIEVAL_ARCHITECTURE.md +├── documentation/ # Phase summaries +│ ├── PHASE_0_PYTORCH_CUDA.md +│ ├── PHASE_2_MIGRATION_SUMMARY.md +│ ├── PHASE_3_CONVERSATIONS_SUMMARY.md +│ ├── PHASE_4_MIGRATION_CHROMADB.md +│ ├── PHASE_5_MCP_TOOLS.md +│ ├── PHASE_6_TESTS_OPTIMISATION.md +│ ├── PHASE_7_INTEGRATION_BACKEND.md +│ ├── PHASE_8_DOCUMENTATION_CLEANUP.md +│ └── MIGRATION_README.md +└── backups/ # Pre-migration data backups + └── pre_migration_20260108_152033/ +``` + +--- + +## Technology Stack + +**Vector Database**: Weaviate 1.34.4 with BAAI/bge-m3 
embeddings (1024-dim) +**Embedder**: PyTorch 2.6.0+cu124, GPU RTX 4070 +**Backend**: Flask 3.0 with Server-Sent Events +**MCP Integration**: 9 memory tools + 6 RAG tools for Claude Desktop +**OCR**: Mistral OCR API +**LLM**: Ollama (local) or Mistral API + +--- + +## Known Issues + +1. **Chunk_v2 has no vectorizer**: All new chunks require manual vectorization via `11_vectorize_missing_chunks.py` +2. **Data imbalance**: Collected papers represents 95.8% of chunks, dominating search results +3. **Mind Design III underrepresented**: Only 61 chunks (1.2%) vs 5,080 for Collected papers + +## Recommendations + +1. Add more diverse works to balance corpus +2. Consider re-ranking with per-work boosting for diversity +3. Recreate Chunk_v2 with text2vec-transformers vectorizer for auto-vectorization (requires full data reload) + +--- + +For detailed implementation notes, see `.claude/CLAUDE.md` and `archive/` directories. diff --git a/check_linear_status.py b/check_linear_status.py deleted file mode 100644 index 9febbb3..0000000 --- a/check_linear_status.py +++ /dev/null @@ -1,174 +0,0 @@ -""" -Script pour vérifier l'état actuel des issues Linear du projet library_rag. 
- -Affiche : -- Nombre total d'issues -- Nombre d'issues par statut (Todo, In Progress, Done) -- Liste des issues In Progress (si présentes) -- Liste des issues Todo avec priorité 1 ou 2 -""" - -import os -import json -import requests -from pathlib import Path -from dotenv import load_dotenv - -# Load environment variables -load_dotenv() - -LINEAR_API_KEY = os.environ.get("LINEAR_API_KEY") -if not LINEAR_API_KEY: - print("❌ LINEAR_API_KEY not found in .env file") - exit(1) - -# Read project info -project_file = Path("generations/library_rag/.linear_project.json") -if not project_file.exists(): - print(f"❌ Project file not found: {project_file}") - exit(1) - -with open(project_file) as f: - project_info = json.load(f) - -project_id = project_info.get("project_id") -team_id = project_info.get("team_id") -total_issues_created = project_info.get("total_issues", 0) - -print("=" * 80) -print(f"LINEAR STATUS CHECK - Project: {project_info.get('project_name')}") -print(f"URL: {project_info.get('project_url')}") -print(f"Total issues created historically: {total_issues_created}") -print("=" * 80) -print() - -# GraphQL query to list all issues in the project -query = """ -query($projectId: String!) 
{ - project(id: $projectId) { - issues(first: 200) { - nodes { - id - identifier - title - priority - state { - name - } - createdAt - } - } - } -} -""" - -headers = { - "Authorization": LINEAR_API_KEY, - "Content-Type": "application/json" -} - -response = requests.post( - "https://api.linear.app/graphql", - headers=headers, - json={"query": query, "variables": {"projectId": project_id}} -) - -if response.status_code != 200: - print(f"❌ Linear API error: {response.status_code}") - print(response.text) - exit(1) - -data = response.json() - -if "errors" in data: - print(f"❌ GraphQL errors: {data['errors']}") - exit(1) - -issues = data["data"]["project"]["issues"]["nodes"] - -# Count by status -status_counts = { - "Todo": 0, - "In Progress": 0, - "Done": 0, - "Other": 0 -} - -issues_by_status = { - "Todo": [], - "In Progress": [], - "Done": [] -} - -for issue in issues: - state_name = issue["state"]["name"] - if state_name in status_counts: - status_counts[state_name] += 1 - issues_by_status[state_name].append(issue) - else: - status_counts["Other"] += 1 - -# Display summary -print(f"STATUS SUMMARY:") -print(f" Done: {status_counts['Done']}") -print(f" In Progress: {status_counts['In Progress']}") -print(f" Todo: {status_counts['Todo']}") -print(f" Other: {status_counts['Other']}") -print(f" TOTAL: {len(issues)}") -print() - -# Check for issues In Progress (potential blocker) -if status_counts["In Progress"] > 0: - print("WARNING: There are 'In Progress' issues:") - print() - for issue in issues_by_status["In Progress"]: - priority = issue.get("priority", "N/A") - print(f" [IN PROGRESS] {issue['identifier']} - Priority {priority}") - print(f" {issue['title']}") - print() - print("! 
The agent will resume these issues first!") - print() - -# List high-priority Todo issues -high_priority_todo = [ - issue for issue in issues_by_status["Todo"] - if issue.get("priority") in [1, 2] -] - -if high_priority_todo: - print(f"HIGH PRIORITY TODO (Priority 1-2): {len(high_priority_todo)}") - print() - for issue in sorted(high_priority_todo, key=lambda x: x.get("priority", 99)): - priority = issue.get("priority", "N/A") - print(f" [TODO] {issue['identifier']} - Priority {priority}") - print(f" {issue['title'][:80]}") - print() - -# List all Todo issues (for reference) -if status_counts["Todo"] > 0: - print(f"ALL TODO ISSUES: {status_counts['Todo']}") - print() - for issue in sorted(issues_by_status["Todo"], key=lambda x: x.get("priority", 99)): - priority = issue.get("priority", "N/A") - title = issue['title'][:60] + "..." if len(issue['title']) > 60 else issue['title'] - print(f" {issue['identifier']} [P{priority}] {title}") - print() - -# Recommendation -print("=" * 80) -if status_counts["In Progress"] > 0: - print("RECOMMENDATION:") - print(" - There are 'In Progress' issues that should be finished first") - print(" - Before adding new issues, check if these should be:") - print(" 1. Completed") - print(" 2. Cancelled (moved back to Todo)") - print(" 3. Deleted") -elif status_counts["Todo"] > 10: - print("RECOMMENDATION:") - print(f" - There are {status_counts['Todo']} Todo issues pending") - print(" - Consider finishing them before adding new ones") -else: - print("RECOMMENDATION:") - print(" - Project is in good state to add new issues") - print(" - You can proceed with --new-spec") -print("=" * 80) diff --git a/check_meta_issue.py b/check_meta_issue.py deleted file mode 100644 index 1a7cc95..0000000 --- a/check_meta_issue.py +++ /dev/null @@ -1,82 +0,0 @@ -""" -Vérifier si le META issue existe toujours dans Linear. 
-""" - -import os -import json -import requests -from pathlib import Path -from dotenv import load_dotenv - -load_dotenv() - -LINEAR_API_KEY = os.environ.get("LINEAR_API_KEY") -if not LINEAR_API_KEY: - print("ERROR: LINEAR_API_KEY not found") - exit(1) - -# Read project info -project_file = Path("generations/library_rag/.linear_project.json") -with open(project_file) as f: - project_info = json.load(f) - -meta_issue_id = project_info.get("meta_issue_id") -project_id = project_info.get("project_id") - -print("=" * 80) -print("Checking META issue existence...") -print(f"META issue ID from .linear_project.json: {meta_issue_id}") -print("=" * 80) -print() - -# Try to fetch the META issue -query = """ -query($issueId: String!) { - issue(id: $issueId) { - id - identifier - title - state { - name - } - } -} -""" - -headers = { - "Authorization": LINEAR_API_KEY, - "Content-Type": "application/json" -} - -response = requests.post( - "https://api.linear.app/graphql", - headers=headers, - json={"query": query, "variables": {"issueId": meta_issue_id}} -) - -if response.status_code != 200: - print(f"ERROR: Linear API error: {response.status_code}") - exit(1) - -data = response.json() - -if "errors" in data: - print("META ISSUE NOT FOUND (was deleted)") - print() - print("SOLUTION: Need to recreate META issue or reset .linear_project.json") - exit(1) - -issue = data["data"]["issue"] -if issue is None: - print("META ISSUE NOT FOUND (was deleted)") - print() - print("SOLUTION: Need to recreate META issue or reset .linear_project.json") - exit(1) - -print(f"META issue EXISTS:") -print(f" ID: {issue['id']}") -print(f" Identifier: {issue['identifier']}") -print(f" Title: {issue['title']}") -print(f" State: {issue['state']['name']}") -print() -print("OK - Can proceed with agent") diff --git a/dockerize_ikario_body.py b/dockerize_ikario_body.py deleted file mode 100644 index ddd5703..0000000 --- a/dockerize_ikario_body.py +++ /dev/null @@ -1,93 +0,0 @@ -""" -Dockerization helper for 
ikario_body -=================================== - -Ce script crée les fichiers Docker nécessaires pour exécuter l'application -`generations/ikario_body` (frontend + serveur + base SQLite) dans Docker, -SANS modifier aucun fichier existant. - -Il génère un fichier de composition : - - docker-compose.ikario_body.yml (à la racine du repo) - -Ce fichier utilise l'image officielle Node et monte le code existant -ainsi que la base SQLite dans les conteneurs (mode développement). - -Utilisation : - 1) Depuis la racine du repo : - python dockerize_ikario_body.py - 2) Puis pour lancer l'appli dans Docker : - docker compose -f docker-compose.ikario_body.yml up - ou, selon votre installation : - docker-compose -f docker-compose.ikario_body.yml up - - - Frontend accessible sur: http://localhost:3000 - - API backend (server) sur : http://localhost:3001 -""" - -from pathlib import Path - - -def generate_docker_compose(root: Path) -> None: - """Génère le fichier docker-compose.ikario_body.yml sans toucher au code existant.""" - project_dir = root / "generations" / "ikario_body" - - if not project_dir.exists(): - raise SystemExit(f"Project directory not found: {project_dir}") - - compose_path = root / "docker-compose.ikario_body.yml" - - # On utilise les scripts npm déjà définis : - # - frontend: npm run dev (Vite) en écoutant sur 0.0.0.0:3000 (dans le conteneur) - # - server: npm start dans ./server sur 3001 (dans le conteneur) - # - # Pour éviter les conflits de ports courants (3000/3001) sur la machine hôte, - # on mappe vers des ports plus élevés côté host : - # - frontend : host 4300 -> container 3000 - # - backend : host 4301 -> container 3001 - # - # Le volume ./generations/ikario_body est monté dans /app, - # ce qui inclut aussi la base SQLite dans server/data/claude-clone.db. 
- compose_content = f"""services: - ikario_body_frontend: - image: node:20 - working_dir: /app - volumes: - - ./generations/ikario_body:/app - # Eviter de réutiliser les node_modules Windows dans le conteneur Linux - - /app/node_modules - command: ["sh", "-c", "npm install && npm run dev -- --host 0.0.0.0 --port 3000"] - ports: - - "4300:3000" - environment: - - NODE_ENV=development - - ikario_body_server: - image: node:20 - working_dir: /app/server - volumes: - - ./generations/ikario_body:/app - # Eviter de réutiliser les node_modules Windows dans le conteneur Linux - - /app/server/node_modules - command: ["sh", "-c", "npm install && npm start"] - ports: - - "4301:3001" - environment: - - NODE_ENV=development - depends_on: - - ikario_body_frontend - -""" - - compose_path.write_text(compose_content, encoding="utf-8") - print(f"Created {compose_path.relative_to(root)}") - - -def main() -> None: - repo_root = Path(__file__).resolve().parent - generate_docker_compose(repo_root) - - -if __name__ == "__main__": - main() - - diff --git a/generations/library_rag/README.md b/generations/library_rag/README.md index 92b85b7..6a7c2b1 100644 --- a/generations/library_rag/README.md +++ b/generations/library_rag/README.md @@ -451,7 +451,101 @@ filter_by_author(author="Platon") delete_document(source_id="platon-menon", confirm=true) ``` -Pour plus de détails, voir la documentation complète dans `.claude/CLAUDE.md`. +### Outils MCP Memory (9 outils intégrés - Phase 4) + +**Système de Mémoire Unifié** : Le serveur MCP intègre désormais 9 outils pour gérer un système de mémoire (Thoughts, Messages, Conversations) utilisant Weaviate + GPU embeddings. Ces outils permettent à Claude Desktop de créer, rechercher et gérer des pensées, messages et conversations de manière persistante. 
+ +**Architecture Memory** : +- **Backend** : Weaviate 1.34.4 (collections Thought, Message, Conversation) +- **Embeddings** : BAAI/bge-m3 GPU (1024-dim, RTX 4070, PyTorch 2.6.0+cu124) +- **Handlers** : `memory/mcp/` (thought_tools, message_tools, conversation_tools) +- **Données** : 102 Thoughts, 377 Messages, 12 Conversations (au 2025-01-08) + +#### Thought Tools (3) + +**1. add_thought** - Ajouter une pensée au système +``` +add_thought( + content="Exploring vector databases for semantic search", + thought_type="observation", # reflection, question, intuition, observation + trigger="Research session", + concepts=["weaviate", "embeddings", "gpu"], + privacy_level="private" # private, shared, public +) +``` + +**2. search_thoughts** - Recherche sémantique dans les pensées +``` +search_thoughts( + query="vector databases GPU", + limit=10, + thought_type_filter="observation" # optionnel +) +``` + +**3. get_thought** - Récupérer une pensée par UUID +``` +get_thought(uuid="730c1a8e-b09f-4889-bbe9-4867d0ee7f1a") +``` + +#### Message Tools (3) + +**4. add_message** - Ajouter un message à une conversation +``` +add_message( + content="Explain transformers in AI", + role="user", # user, assistant, system + conversation_id="chat_2025_01_08", + order_index=0 +) +``` + +**5. get_messages** - Récupérer tous les messages d'une conversation +``` +get_messages( + conversation_id="chat_2025_01_08", + limit=50 +) +``` + +**6. search_messages** - Recherche sémantique dans les messages +``` +search_messages( + query="transformers AI", + limit=10, + conversation_id_filter="chat_2025_01_08" # optionnel +) +``` + +#### Conversation Tools (3) + +**7. get_conversation** - Récupérer une conversation par ID +``` +get_conversation(conversation_id="ikario_derniere_pensee") +``` + +**8. search_conversations** - Recherche sémantique dans les conversations +``` +search_conversations( + query="philosophical discussion", + limit=10, + category_filter="philosophy" # optionnel +) +``` + +**9. 
list_conversations** - Lister toutes les conversations +``` +list_conversations( + limit=20, + category_filter="testing" # optionnel +) +``` + +**Tests** : Tous les outils Memory ont été testés avec succès (voir `test_memory_mcp_tools.py`) + +**Documentation complète** : Voir `memory/README_MCP_TOOLS.md` pour l'architecture détaillée, les schémas de données et les exemples d'utilisation. + +Pour plus de détails sur les outils Library RAG, voir la documentation complète dans `.claude/CLAUDE.md`. --- diff --git a/generations/library_rag/flask_app.py b/generations/library_rag/flask_app.py index e698332..89a1b50 100644 --- a/generations/library_rag/flask_app.py +++ b/generations/library_rag/flask_app.py @@ -89,8 +89,23 @@ from utils.types import ( SSEEvent, ) +# GPU Embedder for manual vectorization (Phase 5: Backend Integration) +import sys +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) +from memory.core import get_embedder + app = Flask(__name__) +# Initialize GPU embedder singleton +_embedder = None + +def get_gpu_embedder(): + """Get or create GPU embedder singleton.""" + global _embedder + if _embedder is None: + _embedder = get_embedder() + return _embedder + # Configuration Flask app.config["SECRET_KEY"] = os.environ.get("SECRET_KEY", "dev-secret-key-change-in-production") @@ -152,26 +167,25 @@ def get_collection_stats() -> Optional[CollectionStats]: stats: CollectionStats = {} # Chunk stats (renamed from Passage) - passages = client.collections.get("Chunk") + passages = client.collections.get("Chunk_v2") passage_count = passages.aggregate.over_all(total_count=True) stats["passages"] = passage_count.total_count or 0 - # Get unique authors and works (from nested objects) - all_passages = passages.query.fetch_objects(limit=1000) + # Get unique authors and works (from direct properties in v2) + all_passages = passages.query.fetch_objects(limit=10000) authors: set[str] = set() works: set[str] = set() languages: set[str] = set() for obj in 
all_passages.objects: - # Work is now a nested object with {title, author} - work_obj = obj.properties.get("work") - if work_obj and isinstance(work_obj, dict): - if work_obj.get("author"): - authors.add(str(work_obj["author"])) - if work_obj.get("title"): - works.add(str(work_obj["title"])) - if obj.properties.get("language"): - languages.add(str(obj.properties["language"])) + props = obj.properties + # In v2: workAuthor and workTitle are direct properties + if props.get("workAuthor"): + authors.add(str(props["workAuthor"])) + if props.get("workTitle"): + works.add(str(props["workTitle"])) + if props.get("language"): + languages.add(str(props["language"])) stats["authors"] = len(authors) stats["works"] = len(works) @@ -208,13 +222,13 @@ def get_all_passages( if client is None: return [] - chunks = client.collections.get("Chunk") + chunks = client.collections.get("Chunk_v2") result = chunks.query.fetch_objects( limit=limit, offset=offset, return_properties=[ - "text", "sectionPath", "sectionLevel", "chapterTitle", + "text", "sectionPath", "chapterTitle", "canonicalReference", "unitType", "keywords", "orderIndex", "language" ], ) @@ -253,7 +267,7 @@ def simple_search( if client is None: return [] - chunks = client.collections.get("Chunk") + chunks = client.collections.get("Chunk_v2") # Build filters using top-level properties (workAuthor, workTitle) filters: Optional[Any] = None @@ -263,13 +277,17 @@ def simple_search( work_filter_obj = wvq.Filter.by_property("workTitle").equal(work_filter) filters = filters & work_filter_obj if filters else work_filter_obj - result = chunks.query.near_text( - query=query, + # Generate query vector with GPU embedder (Phase 5: manual vectorization) + embedder = get_gpu_embedder() + query_vector = embedder.embed_single(query) + + result = chunks.query.near_vector( + near_vector=query_vector.tolist(), limit=limit, filters=filters, return_metadata=wvq.MetadataQuery(distance=True), return_properties=[ - "text", "sectionPath", 
"sectionLevel", "chapterTitle", + "text", "sectionPath", "chapterTitle", "canonicalReference", "unitType", "keywords", "orderIndex", "language" ], ) @@ -333,10 +351,14 @@ def hierarchical_search( # STAGE 1: Search Summary collection for relevant sections # ═══════════════════════════════════════════════════════════════ - summary_collection = client.collections.get("Summary") + summary_collection = client.collections.get("Summary_v2") - summaries_result = summary_collection.query.near_text( - query=query, + # Generate query vector with GPU embedder (Phase 5: manual vectorization) + embedder = get_gpu_embedder() + query_vector = embedder.embed_single(query) + + summaries_result = summary_collection.query.near_vector( + near_vector=query_vector.tolist(), limit=sections_limit, return_metadata=wvq.MetadataQuery(distance=True), # Note: Don't specify return_properties - let Weaviate return all properties @@ -358,63 +380,62 @@ def hierarchical_search( for summary_obj in summaries_result.objects: props = summary_obj.properties - # Try to get document.sourceId if available (nested object might still be returned) - doc_obj = props.get("document") - source_id = "" - if doc_obj and isinstance(doc_obj, dict): - source_id = doc_obj.get("sourceId", "") + # In v2: Summary has workTitle property, need to get sourceId from Work + work_title = props.get("workTitle", "") + # We'll get sourceId later by matching workTitle with Work.sourceId + # For now, use workTitle as identifier sections_data.append({ "section_path": props.get("sectionPath", ""), "title": props.get("title", ""), "summary_text": props.get("text", ""), "level": props.get("level", 1), "concepts": props.get("concepts", []), - "document_source_id": source_id, - "summary_uuid": str(summary_obj.uuid), # Keep UUID for later retrieval if needed + "document_source_id": "", # Will be populated during filtering + "work_title": work_title, # Add workTitle for filtering + "summary_uuid": str(summary_obj.uuid), "similarity": 
round((1 - summary_obj.metadata.distance) * 100, 1) if summary_obj.metadata and summary_obj.metadata.distance else 0, }) - # Post-filter sections by author/work (Summary doesn't have work nested object) + # Post-filter sections by author/work (Summary_v2 has workTitle property) if author_filter or work_filter: print(f"[HIERARCHICAL] Post-filtering {len(sections_data)} sections by work='{work_filter}'") - doc_collection = client.collections.get("Document") - filtered_sections = [] + # Build Work title -> author map for filtering + work_collection = client.collections.get("Work") + work_map = {} + for work in work_collection.iterator(include_vector=False): + props = work.properties + title = props.get("title") + if title: + work_map[title] = { + "author": props.get("author", "Unknown"), + "sourceId": props.get("sourceId", "") + } + + filtered_sections = [] for section in sections_data: - source_id = section["document_source_id"] - if not source_id: - print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (no sourceId)") + work_title = section.get("work_title", "") + + if not work_title or work_title not in work_map: + print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (no work mapping)") continue - # Query Document to get work metadata - # Note: 'work' is a nested object, so we don't specify it in return_properties - # Weaviate should return it automatically - doc_result = doc_collection.query.fetch_objects( - filters=wvq.Filter.by_property("sourceId").equal(source_id), - limit=1, - ) + work_author = work_map[work_title]["author"] + section["document_source_id"] = work_map[work_title]["sourceId"] # Populate sourceId - if doc_result.objects: - doc_work = doc_result.objects[0].properties.get("work", {}) - print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' 
doc_work type={type(doc_work)}, value={doc_work}") - if isinstance(doc_work, dict): - work_title = doc_work.get("title", "N/A") - work_author = doc_work.get("author", "N/A") - # Check filters - if author_filter and work_author != author_filter: - print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' FILTERED (author '{work_author}' != '{author_filter}')") - continue - if work_filter and work_title != work_filter: - print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' FILTERED (work '{work_title}' != '{work_filter}')") - continue + print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' work={work_title}, author={work_author}") - print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' KEPT (work='{work_title}')") - filtered_sections.append(section) - else: - print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (doc_work not a dict)") - else: - print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (no doc found for sourceId='{source_id}')") + # Check filters + if author_filter and work_author != author_filter: + print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' FILTERED (author '{work_author}' != '{author_filter}')") + continue + if work_filter and work_title != work_filter: + print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' FILTERED (work '{work_title}' != '{work_filter}')") + continue + + print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' 
KEPT (work='{work_title}')") + filtered_sections.append(section) sections_data = filtered_sections print(f"[HIERARCHICAL] After filtering: {len(sections_data)} sections remaining") @@ -438,7 +459,7 @@ def hierarchical_search( # For each section, search chunks using the section's summary text # This groups chunks under their relevant sections - chunk_collection = client.collections.get("Chunk") + chunk_collection = client.collections.get("Chunk_v2") # Build base filters (author/work only) base_filters: Optional[Any] = None @@ -464,8 +485,11 @@ def hierarchical_search( if base_filters: section_filters = base_filters & section_filters - chunks_result = chunk_collection.query.near_text( - query=section_query, + # Generate query vector with GPU embedder (Phase 5: manual vectorization) + section_query_vector = embedder.embed_single(section_query) + + chunks_result = chunk_collection.query.near_vector( + near_vector=section_query_vector.tolist(), limit=chunks_per_section, filters=section_filters, return_metadata=wvq.MetadataQuery(distance=True), @@ -600,14 +624,28 @@ def summary_only_search( if client is None: return [] - summaries = client.collections.get("Summary") + summaries = client.collections.get("Summary_v2") - # Note: Cannot filter by nested document properties directly in Weaviate v4 - # Must fetch all and filter in Python if author/work filters are present + # Build Work map for metadata lookup (Summary_v2 has workTitle, not document) + work_collection = client.collections.get("Work") + work_map = {} + for work in work_collection.iterator(include_vector=False): + work_props = work.properties + title = work_props.get("title") + if title: + work_map[title] = { + "author": work_props.get("author", "Unknown"), + "year": work_props.get("year", 0), + "sourceId": work_props.get("sourceId", ""), + } + + # Generate query vector with GPU embedder (Phase 5: manual vectorization) + embedder = get_gpu_embedder() + query_vector = embedder.embed_single(query) # Semantic 
search - results = summaries.query.near_text( - query=query, + results = summaries.query.near_vector( + near_vector=query_vector.tolist(), limit=limit * 3 if (author_filter or work_filter) else limit, # Fetch more if filtering return_metadata=wvq.MetadataQuery(distance=True) ) @@ -618,24 +656,34 @@ def summary_only_search( props = obj.properties similarity = 1 - obj.metadata.distance - # Apply filters (Python-side since nested properties) - if author_filter and props["document"].get("author", "") != author_filter: + # Get work metadata from workTitle + work_title = props.get("workTitle", "") + if not work_title or work_title not in work_map: continue - if work_filter and props["document"].get("title", "") != work_filter: + + work_info = work_map[work_title] + work_author = work_info["author"] + work_year = work_info["year"] + source_id = work_info["sourceId"] + + # Apply filters + if author_filter and work_author != author_filter: + continue + if work_filter and work_title != work_filter: continue # Determine document icon and name - doc_id = props["document"]["sourceId"].lower() - if "tiercelin" in doc_id: + doc_id_lower = source_id.lower() + if "tiercelin" in doc_id_lower: doc_icon = "🟡" doc_name = "Tiercelin" - elif "platon" in doc_id or "menon" in doc_id: + elif "platon" in doc_id_lower or "menon" in doc_id_lower: doc_icon = "🟢" doc_name = "Platon" - elif "haugeland" in doc_id: + elif "haugeland" in doc_id_lower: doc_icon = "🟣" doc_name = "Haugeland" - elif "logique" in doc_id: + elif "logique" in doc_id_lower: doc_icon = "🔵" doc_name = "Logique" else: @@ -647,19 +695,19 @@ def summary_only_search( "uuid": str(obj.uuid), "similarity": round(similarity * 100, 1), # Convert to percentage "text": props.get("text", ""), - "title": props["title"], + "title": props.get("title", ""), "concepts": props.get("concepts", []), "doc_icon": doc_icon, "doc_name": doc_name, - "author": props["document"].get("author", ""), - "year": props["document"].get("year", 0), + "author": 
work_author, + "year": work_year, "chunks_count": props.get("chunksCount", 0), "section_path": props.get("sectionPath", ""), "sectionPath": props.get("sectionPath", ""), # Alias for template compatibility # Add work info for template compatibility "work": { - "title": props["document"].get("title", ""), - "author": props["document"].get("author", ""), + "title": work_title, + "author": work_author, }, } @@ -969,7 +1017,7 @@ def rag_search( print("[RAG Search] Weaviate client unavailable") return [] - chunks = client.collections.get("Chunk") + chunks = client.collections.get("Chunk_v2") # Build work filter if selected_works is provided work_filter: Optional[Any] = None @@ -978,9 +1026,13 @@ def rag_search( work_filter = wvq.Filter.by_property("workTitle").contains_any(selected_works) print(f"[RAG Search] Applying work filter: {selected_works}") + # Generate query vector with GPU embedder (Phase 5: manual vectorization) + embedder = get_gpu_embedder() + query_vector = embedder.embed_single(query) + # Query with properties needed for RAG context - result = chunks.query.near_text( - query=query, + result = chunks.query.near_vector( + near_vector=query_vector.tolist(), limit=limit, filters=work_filter, return_metadata=wvq.MetadataQuery(distance=True), @@ -1444,33 +1496,30 @@ def api_get_works() -> Union[Response, tuple[Response, int]]: "message": "Cannot connect to Weaviate database" }), 500 - # Query Chunk collection to get all unique works with counts - chunks = client.collections.get("Chunk") + # Query Chunk_v2 collection to get all unique works with counts + chunks = client.collections.get("Chunk_v2") # Fetch all chunks to aggregate by work - # Using a larger limit to get all documents - # Note: Don't use return_properties with nested objects (causes gRPC error) - # Fetch all objects without specifying properties + # In v2: work is NOT a nested object, use workTitle and workAuthor properties all_chunks = chunks.query.fetch_objects(limit=10000) # Aggregate chunks by 
work (title + author) works_count: Dict[str, Dict[str, Any]] = {} for obj in all_chunks.objects: - work_obj = obj.properties.get("work") - if work_obj and isinstance(work_obj, dict): - title = work_obj.get("title", "") - author = work_obj.get("author", "") + props = obj.properties + title = props.get("workTitle", "") + author = props.get("workAuthor", "") - if title: # Only count if title exists - # Use title as key (assumes unique titles) - if title not in works_count: - works_count[title] = { - "title": title, - "author": author or "Unknown", - "chunks_count": 0 - } - works_count[title]["chunks_count"] += 1 + if title: # Only count if title exists + # Use title as key (assumes unique titles) + if title not in works_count: + works_count[title] = { + "title": title, + "author": author or "Unknown", + "chunks_count": 0 + } + works_count[title]["chunks_count"] += 1 # Convert to list and sort by author, then title works_list = list(works_count.values()) @@ -3082,45 +3131,60 @@ def documents() -> str: with get_weaviate_client() as client: if client is not None: - # Get chunk counts and authors - chunk_collection = client.collections.get("Chunk") + from typing import cast - for obj in chunk_collection.iterator(include_vector=False): - props = obj.properties - from typing import cast - doc_obj = cast(Dict[str, Any], props.get("document", {})) - work_obj = cast(Dict[str, Any], props.get("work", {})) - - if doc_obj: - source_id = doc_obj.get("sourceId", "") - if source_id: - if source_id not in documents_from_weaviate: - documents_from_weaviate[source_id] = { - "source_id": source_id, - "title": work_obj.get("title") if work_obj else "Unknown", - "author": work_obj.get("author") if work_obj else "Unknown", - "chunks_count": 0, - "summaries_count": 0, - "authors": set(), - } - documents_from_weaviate[source_id]["chunks_count"] += 1 - - # Track unique authors - author = work_obj.get("author") if work_obj else None - if author: - 
documents_from_weaviate[source_id]["authors"].add(author) - - # Get summary counts + # Get all Works (now with sourceId added in Phase 1 of migration) try: - summary_collection = client.collections.get("Summary") - for obj in summary_collection.iterator(include_vector=False): - props = obj.properties - doc_obj = cast(Dict[str, Any], props.get("document", {})) + work_collection = client.collections.get("Work") + chunk_collection = client.collections.get("Chunk_v2") - if doc_obj: - source_id = doc_obj.get("sourceId", "") - if source_id and source_id in documents_from_weaviate: - documents_from_weaviate[source_id]["summaries_count"] += 1 + # Build documents from Work collection + for work in work_collection.iterator(include_vector=False): + props = work.properties + source_id = props.get("sourceId") + + # Skip Works without sourceId (not documents) + if not source_id: + continue + + documents_from_weaviate[source_id] = { + "source_id": source_id, + "title": props.get("title", "Unknown"), + "author": props.get("author", "Unknown"), + "pages": props.get("pages", 0), + "edition": props.get("edition", ""), + "chunks_count": 0, + "summaries_count": 0, + "authors": set(), + } + + # Add author to set + if props.get("author") and props.get("author") != "Unknown": + documents_from_weaviate[source_id]["authors"].add(props.get("author")) + + # Count chunks per document (via workTitle) + for chunk in chunk_collection.iterator(include_vector=False): + work_title = chunk.properties.get("workTitle") + + # Find corresponding sourceId + for source_id, doc_data in documents_from_weaviate.items(): + if doc_data["title"] == work_title: + doc_data["chunks_count"] += 1 + break + except Exception as e: + print(f"Warning: Could not load Work collection: {e}") + + # Count summaries (if collection exists) + try: + summary_collection = client.collections.get("Summary_v2") + for summary in summary_collection.iterator(include_vector=False): + work_title = summary.properties.get("workTitle") + + # 
Find corresponding sourceId + for source_id, doc_data in documents_from_weaviate.items(): + if doc_data["title"] == work_title: + doc_data["summaries_count"] += 1 + break except Exception: # Summary collection may not exist pass @@ -3157,17 +3221,195 @@ def documents() -> str: "has_images": images_dir.exists() and any(images_dir.iterdir()) if images_dir.exists() else False, "image_count": len(list(images_dir.glob("*.png"))) if images_dir.exists() else 0, "metadata": metadata, + "pages": weaviate_data.get("pages", pages), # FROM WEAVIATE, fallback to file "summaries_count": weaviate_data["summaries_count"], # FROM WEAVIATE "authors_count": len(weaviate_data["authors"]), # FROM WEAVIATE "chunks_count": weaviate_data["chunks_count"], # FROM WEAVIATE "title": weaviate_data["title"], # FROM WEAVIATE "author": weaviate_data["author"], # FROM WEAVIATE + "edition": weaviate_data.get("edition", ""), # FROM WEAVIATE "toc": toc, }) return render_template("documents.html", documents=documents_list) +# ═══════════════════════════════════════════════════════════════════════════════ +# Memory Routes (Phase 5: Backend Integration) +# ═══════════════════════════════════════════════════════════════════════════════ + +def run_async(coro): + """Run async coroutine in sync Flask context.""" + import asyncio + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + return loop.run_until_complete(coro) + finally: + loop.close() + + +@app.route("/memories") +def memories() -> str: + """Render the Memory search page (Thoughts + Messages).""" + # Get memory statistics + with get_weaviate_client() as client: + if client is None: + flash("Cannot connect to Weaviate database", "error") + stats = {"thoughts": 0, "messages": 0, "conversations": 0} + else: + try: + thoughts = client.collections.get("Thought") + messages = client.collections.get("Message") + conversations = client.collections.get("Conversation") + + thoughts_count = 
thoughts.aggregate.over_all(total_count=True).total_count + messages_count = messages.aggregate.over_all(total_count=True).total_count + conversations_count = conversations.aggregate.over_all(total_count=True).total_count + + stats = { + "thoughts": thoughts_count or 0, + "messages": messages_count or 0, + "conversations": conversations_count or 0, + } + except Exception as e: + print(f"Error fetching memory stats: {e}") + stats = {"thoughts": 0, "messages": 0, "conversations": 0} + + return render_template("memories.html", stats=stats) + + +@app.route("/api/memories/search-thoughts", methods=["POST"]) +def api_search_thoughts(): + """API endpoint for thought semantic search.""" + try: + # Import Memory MCP tools locally + from memory.mcp import SearchThoughtsInput, search_thoughts_handler + + data = request.json + query = data.get("query", "") + limit = data.get("limit", 10) + thought_type_filter = data.get("thought_type_filter") + + input_data = SearchThoughtsInput( + query=query, + limit=limit, + thought_type_filter=thought_type_filter + ) + + result = run_async(search_thoughts_handler(input_data)) + return jsonify(result) + except Exception as e: + return jsonify({"success": False, "error": str(e)}), 500 + + +@app.route("/api/memories/search-messages", methods=["POST"]) +def api_search_messages(): + """API endpoint for message semantic search.""" + try: + from memory.mcp import SearchMessagesInput, search_messages_handler + + data = request.json + query = data.get("query", "") + limit = data.get("limit", 10) + conversation_id_filter = data.get("conversation_id_filter") + + input_data = SearchMessagesInput( + query=query, + limit=limit, + conversation_id_filter=conversation_id_filter + ) + + result = run_async(search_messages_handler(input_data)) + return jsonify(result) + except Exception as e: + return jsonify({"success": False, "error": str(e)}), 500 + + +@app.route("/conversations") +def conversations() -> str: + """Render the Conversations page.""" + try: + 
from memory.mcp import ListConversationsInput, list_conversations_handler
+
+        limit = request.args.get("limit", 20, type=int)
+        category_filter = request.args.get("category")
+
+        input_data = ListConversationsInput(
+            limit=limit,
+            category_filter=category_filter
+        )
+
+        result = run_async(list_conversations_handler(input_data))
+
+        if result.get("success"):
+            conversations_list = result.get("conversations", [])
+        else:
+            flash(f"Error loading conversations: {result.get('error')}", "error")
+            conversations_list = []
+
+        return render_template("conversations.html", conversations=conversations_list)
+    except Exception as e:
+        flash(f"Error loading conversations: {str(e)}", "error")
+        return render_template("conversations.html", conversations=[])
+
+
+@app.route("/conversation/<conversation_id>")
+def conversation_view(conversation_id: str) -> str:
+    """View a specific conversation with all its messages."""
+    try:
+        from memory.mcp import (
+            GetConversationInput, get_conversation_handler,
+            GetMessagesInput, get_messages_handler
+        )
+
+        # Get conversation metadata
+        conv_input = GetConversationInput(conversation_id=conversation_id)
+        conversation = run_async(get_conversation_handler(conv_input))
+
+        if not conversation.get("success"):
+            flash(f"Conversation not found: {conversation.get('error')}", "error")
+            return redirect(url_for("conversations"))
+
+        # Get all messages
+        msg_input = GetMessagesInput(conversation_id=conversation_id, limit=500)
+        messages_result = run_async(get_messages_handler(msg_input))
+
+        messages = messages_result.get("messages", []) if messages_result.get("success") else []
+
+        return render_template(
+            "conversation_view.html",
+            conversation=conversation,
+            messages=messages
+        )
+    except Exception as e:
+        flash(f"Error loading conversation: {str(e)}", "error")
+        return redirect(url_for("conversations"))
+
+
+@app.route("/api/conversations/search", methods=["POST"])
+def api_search_conversations():
+    """API endpoint for conversation semantic search."""
+    
try: + from memory.mcp import SearchConversationsInput, search_conversations_handler + + data = request.json + query = data.get("query", "") + limit = data.get("limit", 10) + category_filter = data.get("category_filter") + + input_data = SearchConversationsInput( + query=query, + limit=limit, + category_filter=category_filter + ) + + result = run_async(search_conversations_handler(input_data)) + return jsonify(result) + except Exception as e: + return jsonify({"success": False, "error": str(e)}), 500 + + # ═══════════════════════════════════════════════════════════════════════════════ # Main # ═══════════════════════════════════════════════════════════════════════════════ diff --git a/generations/library_rag/mcp_server.py b/generations/library_rag/mcp_server.py index cb1f26b..221390a 100644 --- a/generations/library_rag/mcp_server.py +++ b/generations/library_rag/mcp_server.py @@ -62,6 +62,31 @@ from mcp_tools import ( PDFProcessingError, ) +# Memory MCP Tools (added for unified Memory + Library system) +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) +from memory.mcp import ( + # Thought tools + AddThoughtInput, + SearchThoughtsInput, + add_thought_handler, + search_thoughts_handler, + get_thought_handler, + # Message tools + AddMessageInput, + GetMessagesInput, + SearchMessagesInput, + add_message_handler, + get_messages_handler, + search_messages_handler, + # Conversation tools + GetConversationInput, + SearchConversationsInput, + ListConversationsInput, + get_conversation_handler, + search_conversations_handler, + list_conversations_handler, +) + # ============================================================================= # Logging Configuration # ============================================================================= @@ -551,6 +576,264 @@ async def delete_document( return result.model_dump(mode='json') +# ============================================================================= +# Memory Tools (Thoughts, Messages, Conversations) +# 
============================================================================= + + +@mcp.tool() +async def add_thought( + content: str, + thought_type: str = "reflection", + trigger: str = "", + concepts: list[str] | None = None, + privacy_level: str = "private", +) -> Dict[str, Any]: + """ + Add a new thought to the Memory system. + + Args: + content: The thought content. + thought_type: Type (reflection, question, intuition, observation, etc.). + trigger: What triggered this thought (optional). + concepts: Related concepts/tags (optional). + privacy_level: Privacy level (private, shared, public). + + Returns: + Dictionary containing: + - success: Whether thought was added successfully + - uuid: UUID of the created thought + - content: Preview of the thought content + - thought_type: The thought type + """ + input_data = AddThoughtInput( + content=content, + thought_type=thought_type, + trigger=trigger, + concepts=concepts or [], + privacy_level=privacy_level, + ) + result = await add_thought_handler(input_data) + return result + + +@mcp.tool() +async def search_thoughts( + query: str, + limit: int = 10, + thought_type_filter: str | None = None, +) -> Dict[str, Any]: + """ + Search thoughts using semantic similarity. + + Args: + query: Search query text. + limit: Maximum number of results (1-100, default 10). + thought_type_filter: Filter by thought type (optional). + + Returns: + Dictionary containing: + - success: Whether search succeeded + - query: The original search query + - results: List of matching thoughts + - count: Number of results returned + """ + input_data = SearchThoughtsInput( + query=query, + limit=limit, + thought_type_filter=thought_type_filter, + ) + result = await search_thoughts_handler(input_data) + return result + + +@mcp.tool() +async def get_thought(uuid: str) -> Dict[str, Any]: + """ + Get a specific thought by UUID. + + Args: + uuid: Thought UUID. + + Returns: + Dictionary containing complete thought data or error message. 
+ """ + result = await get_thought_handler(uuid) + return result + + +@mcp.tool() +async def add_message( + content: str, + role: str, + conversation_id: str, + order_index: int = 0, +) -> Dict[str, Any]: + """ + Add a new message to a conversation. + + Args: + content: Message content. + role: Role (user, assistant, system). + conversation_id: Conversation identifier. + order_index: Position in conversation (default 0). + + Returns: + Dictionary containing: + - success: Whether message was added successfully + - uuid: UUID of the created message + - content: Preview of the message content + - role: The message role + - conversation_id: The conversation ID + """ + input_data = AddMessageInput( + content=content, + role=role, + conversation_id=conversation_id, + order_index=order_index, + ) + result = await add_message_handler(input_data) + return result + + +@mcp.tool() +async def get_messages( + conversation_id: str, + limit: int = 50, +) -> Dict[str, Any]: + """ + Get all messages from a conversation in order. + + Args: + conversation_id: Conversation identifier. + limit: Maximum messages to return (1-500, default 50). + + Returns: + Dictionary containing: + - success: Whether query succeeded + - conversation_id: The conversation ID + - messages: List of messages in order + - count: Number of messages returned + """ + input_data = GetMessagesInput( + conversation_id=conversation_id, + limit=limit, + ) + result = await get_messages_handler(input_data) + return result + + +@mcp.tool() +async def search_messages( + query: str, + limit: int = 10, + conversation_id_filter: str | None = None, +) -> Dict[str, Any]: + """ + Search messages using semantic similarity. + + Args: + query: Search query text. + limit: Maximum number of results (1-100, default 10). + conversation_id_filter: Filter by conversation ID (optional). 
+ + Returns: + Dictionary containing: + - success: Whether search succeeded + - query: The original search query + - results: List of matching messages + - count: Number of results returned + """ + input_data = SearchMessagesInput( + query=query, + limit=limit, + conversation_id_filter=conversation_id_filter, + ) + result = await search_messages_handler(input_data) + return result + + +@mcp.tool() +async def get_conversation(conversation_id: str) -> Dict[str, Any]: + """ + Get a specific conversation by ID. + + Args: + conversation_id: Conversation identifier. + + Returns: + Dictionary containing: + - success: Whether conversation was found + - conversation_id: The conversation ID + - category: Conversation category + - summary: Conversation summary + - timestamp_start: Start time + - timestamp_end: End time + - participants: List of participants + - tags: Semantic tags + - message_count: Number of messages + """ + input_data = GetConversationInput(conversation_id=conversation_id) + result = await get_conversation_handler(input_data) + return result + + +@mcp.tool() +async def search_conversations( + query: str, + limit: int = 10, + category_filter: str | None = None, +) -> Dict[str, Any]: + """ + Search conversations using semantic similarity. + + Args: + query: Search query text. + limit: Maximum number of results (1-50, default 10). + category_filter: Filter by category (optional). + + Returns: + Dictionary containing: + - success: Whether search succeeded + - query: The original search query + - results: List of matching conversations + - count: Number of results returned + """ + input_data = SearchConversationsInput( + query=query, + limit=limit, + category_filter=category_filter, + ) + result = await search_conversations_handler(input_data) + return result + + +@mcp.tool() +async def list_conversations( + limit: int = 20, + category_filter: str | None = None, +) -> Dict[str, Any]: + """ + List all conversations with optional filtering. 
+ + Args: + limit: Maximum conversations to return (1-100, default 20). + category_filter: Filter by category (optional). + + Returns: + Dictionary containing: + - success: Whether query succeeded + - conversations: List of conversations + - count: Number of conversations returned + """ + input_data = ListConversationsInput( + limit=limit, + category_filter=category_filter, + ) + result = await list_conversations_handler(input_data) + return result + + # ============================================================================= # Signal Handlers # ============================================================================= diff --git a/generations/library_rag/templates/base.html b/generations/library_rag/templates/base.html index 65f1618..78e5327 100644 --- a/generations/library_rag/templates/base.html +++ b/generations/library_rag/templates/base.html @@ -718,6 +718,15 @@ 📚 Documents +
+ + 🧠 + Memory (Ikario) + + + 💭 + Conversations + @@ -736,6 +745,7 @@ Conversation Parser PDF Documents + Memory