chore: Major cleanup - archive migration scripts and remove temp files
CLEANUP ACTIONS: - Archived 11 migration/optimization scripts to archive/migration_scripts/ - Archived 11 phase documentation files to archive/documentation/ - Moved backups/, docs/, scripts/ to archive/ - Deleted 30+ temporary debug/test/fix scripts - Cleaned Python cache (__pycache__/, *.pyc) - Cleaned log files (*.log) NEW FILES: - CHANGELOG.md: Consolidated project history and migration documentation - Updated .gitignore: Added *.log, *.pyc, archive/ exclusions FINAL ROOT STRUCTURE (19 items): - Core framework: agent.py, autonomous_agent_demo.py, client.py, security.py, progress.py, prompts.py - Config: requirements.txt, package.json, .gitignore - Docs: README.md, CHANGELOG.md, project_progress.md - Directories: archive/, generations/, memory/, prompts/, utils/ ARCHIVED SCRIPTS (in archive/migration_scripts/): 01-11: Migration & optimization scripts (migrate, schema, rechunk, vectorize, etc.) ARCHIVED DOCS (in archive/documentation/): PHASE_0-8: Detailed phase summaries MIGRATION_README.md, PLAN_MIGRATION_WEAVIATE_GPU.md Repository is now clean and production-ready with all important files preserved in archive/. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
11
.gitignore
vendored
11
.gitignore
vendored
@@ -2,12 +2,18 @@
|
|||||||
generations/*
|
generations/*
|
||||||
!generations/library_rag/
|
!generations/library_rag/
|
||||||
|
|
||||||
|
# Python cache and compiled files
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
*.pyo
|
||||||
|
*.pyd
|
||||||
|
|
||||||
# Log files
|
# Log files
|
||||||
logs/
|
logs/
|
||||||
|
*.log
|
||||||
|
|
||||||
.env
|
.env
|
||||||
venv
|
venv
|
||||||
__pycache__
|
|
||||||
|
|
||||||
# Node modules (if any)
|
# Node modules (if any)
|
||||||
node_modules/
|
node_modules/
|
||||||
@@ -19,3 +25,6 @@ restoration_log.txt
|
|||||||
restoration_remaining_log.txt
|
restoration_remaining_log.txt
|
||||||
summary_generation_progress.json
|
summary_generation_progress.json
|
||||||
nul
|
nul
|
||||||
|
|
||||||
|
# Archives (migration scripts moved here)
|
||||||
|
archive/
|
||||||
@@ -1,157 +0,0 @@
|
|||||||
"""Correctif: Ajouter workAuthor, year, language aux Summary_v2."""
|
|
||||||
|
|
||||||
import weaviate
|
|
||||||
import sys
|
|
||||||
|
|
||||||
if sys.stdout.encoding != 'utf-8':
|
|
||||||
sys.stdout.reconfigure(encoding='utf-8')
|
|
||||||
|
|
||||||
# Try to import tqdm
|
|
||||||
try:
|
|
||||||
from tqdm import tqdm
|
|
||||||
HAS_TQDM = True
|
|
||||||
except ImportError:
|
|
||||||
HAS_TQDM = False
|
|
||||||
|
|
||||||
client = weaviate.connect_to_local()
|
|
||||||
|
|
||||||
try:
|
|
||||||
print("=" * 80)
|
|
||||||
print("CORRECTIF: AJOUTER workAuthor, year, language À SUMMARY_V2")
|
|
||||||
print("=" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
summary_v2 = client.collections.get("Summary_v2")
|
|
||||||
work_collection = client.collections.get("Work")
|
|
||||||
|
|
||||||
# Build workTitle → Work metadata map
|
|
||||||
print("Étape 1: Mapping workTitle → Work metadata")
|
|
||||||
print("-" * 80)
|
|
||||||
|
|
||||||
work_map = {}
|
|
||||||
|
|
||||||
for work in work_collection.iterator(include_vector=False):
|
|
||||||
props = work.properties
|
|
||||||
title = props.get("title")
|
|
||||||
if title:
|
|
||||||
work_map[title] = {
|
|
||||||
"author": props.get("author", "Unknown"),
|
|
||||||
"year": props.get("year", 0),
|
|
||||||
"language": props.get("language", "en"),
|
|
||||||
}
|
|
||||||
|
|
||||||
print(f"✓ {len(work_map)} mappings workTitle → metadata")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Count total summaries
|
|
||||||
print("Étape 2: Comptage summaries")
|
|
||||||
print("-" * 80)
|
|
||||||
|
|
||||||
print("Comptage en cours...")
|
|
||||||
total_summaries = sum(1 for _ in summary_v2.iterator(include_vector=False))
|
|
||||||
|
|
||||||
print(f"✓ {total_summaries} summaries à corriger")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Update summaries
|
|
||||||
print("Étape 3: Mise à jour des propriétés")
|
|
||||||
print("-" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
updated = 0
|
|
||||||
skipped = 0
|
|
||||||
errors = []
|
|
||||||
|
|
||||||
# Create iterator with or without tqdm
|
|
||||||
if HAS_TQDM:
|
|
||||||
iterator = tqdm(
|
|
||||||
summary_v2.iterator(include_vector=False),
|
|
||||||
total=total_summaries,
|
|
||||||
desc="Mise à jour",
|
|
||||||
unit="summaries"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
iterator = summary_v2.iterator(include_vector=False)
|
|
||||||
print("Mise à jour en cours...")
|
|
||||||
|
|
||||||
for idx, summary in enumerate(iterator, 1):
|
|
||||||
props = summary.properties
|
|
||||||
|
|
||||||
try:
|
|
||||||
work_title = props.get("workTitle")
|
|
||||||
|
|
||||||
if not work_title:
|
|
||||||
errors.append(f"Summary {summary.uuid}: pas de workTitle")
|
|
||||||
skipped += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Get work metadata
|
|
||||||
work_metadata = work_map.get(work_title)
|
|
||||||
if not work_metadata:
|
|
||||||
errors.append(f"Summary {summary.uuid}: Work '{work_title}' introuvable")
|
|
||||||
skipped += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Check if already updated (workAuthor exists)
|
|
||||||
if props.get("workAuthor") is not None:
|
|
||||||
skipped += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Update properties
|
|
||||||
summary_v2.data.update(
|
|
||||||
uuid=summary.uuid,
|
|
||||||
properties={
|
|
||||||
"workAuthor": work_metadata["author"],
|
|
||||||
"year": work_metadata["year"],
|
|
||||||
"language": work_metadata["language"],
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
updated += 1
|
|
||||||
|
|
||||||
# Progress without tqdm
|
|
||||||
if not HAS_TQDM and idx % 10 == 0:
|
|
||||||
print(f" {idx}/{total_summaries} summaries traités...")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
errors.append(f"Summary {summary.uuid}: {e}")
|
|
||||||
|
|
||||||
print()
|
|
||||||
print("-" * 80)
|
|
||||||
print(f"✓ Total mis à jour: {updated}/{total_summaries}")
|
|
||||||
print(f" Déjà à jour: {skipped}")
|
|
||||||
|
|
||||||
if errors:
|
|
||||||
print(f"⚠️ Erreurs rencontrées: {len(errors)}")
|
|
||||||
print()
|
|
||||||
print("Premières erreurs:")
|
|
||||||
for err in errors[:10]:
|
|
||||||
print(f" - {err}")
|
|
||||||
if len(errors) > 10:
|
|
||||||
print(f" ... et {len(errors) - 10} autres")
|
|
||||||
|
|
||||||
print()
|
|
||||||
print("=" * 80)
|
|
||||||
print("CORRECTIF TERMINÉ")
|
|
||||||
print("=" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
if updated == total_summaries:
|
|
||||||
print("✅ Tous les summaries ont été mis à jour")
|
|
||||||
print()
|
|
||||||
print("Propriétés ajoutées:")
|
|
||||||
print(" ✓ workAuthor (auteur de l'œuvre)")
|
|
||||||
print(" ✓ year (année de publication)")
|
|
||||||
print(" ✓ language (langue du texte)")
|
|
||||||
print()
|
|
||||||
print("VÉRIFICATION:")
|
|
||||||
print(" python -c \"from verify_summaries import verify; verify()\"")
|
|
||||||
elif updated > 0:
|
|
||||||
print(f"⚠️ {updated}/{total_summaries} summaries mis à jour")
|
|
||||||
print(" Vérifier les erreurs")
|
|
||||||
else:
|
|
||||||
print("❌ Aucun summary mis à jour")
|
|
||||||
print(" Corriger les erreurs et relancer")
|
|
||||||
|
|
||||||
finally:
|
|
||||||
client.close()
|
|
||||||
@@ -1,267 +0,0 @@
|
|||||||
"""Script to re-chunk oversized chunks (> 2000 tokens) in Chunk_v2.
|
|
||||||
|
|
||||||
This script identifies chunks that are too large (> 2000 tokens) and splits them
|
|
||||||
into smaller chunks with overlap (max 1000 words, overlap 100 words).
|
|
||||||
|
|
||||||
Steps:
|
|
||||||
1. Identify all chunks > 2000 tokens in Chunk_v2
|
|
||||||
2. Re-chunk using simple_chunk_with_overlap (1000 words max, 100 overlap)
|
|
||||||
3. Delete the original oversized chunk
|
|
||||||
4. Insert new smaller chunks with preserved metadata
|
|
||||||
5. Update Summary_v2 chunksCount if needed
|
|
||||||
"""
|
|
||||||
|
|
||||||
import weaviate
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
# Add utils to path
|
|
||||||
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))
|
|
||||||
|
|
||||||
from utils.llm_chunker_improved import simple_chunk_with_overlap, estimate_tokens
|
|
||||||
|
|
||||||
if sys.stdout.encoding != 'utf-8':
|
|
||||||
sys.stdout.reconfigure(encoding='utf-8')
|
|
||||||
|
|
||||||
# Try to import tqdm
|
|
||||||
try:
|
|
||||||
from tqdm import tqdm
|
|
||||||
HAS_TQDM = True
|
|
||||||
except ImportError:
|
|
||||||
HAS_TQDM = False
|
|
||||||
|
|
||||||
# Constants
|
|
||||||
TOKEN_THRESHOLD = 2000 # Chunks > 2000 tokens will be re-chunked
|
|
||||||
MAX_WORDS = 1000
|
|
||||||
OVERLAP_WORDS = 100
|
|
||||||
|
|
||||||
client = weaviate.connect_to_local()
|
|
||||||
|
|
||||||
try:
|
|
||||||
print("=" * 80)
|
|
||||||
print("RE-CHUNKING DES CHUNKS SURDIMENSIONNÉS")
|
|
||||||
print("=" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
chunk_v2 = client.collections.get("Chunk_v2")
|
|
||||||
work_collection = client.collections.get("Work")
|
|
||||||
|
|
||||||
# ========== 1. IDENTIFIER LES CHUNKS PROBLÉMATIQUES ==========
|
|
||||||
print("1. IDENTIFICATION DES CHUNKS > 2000 TOKENS")
|
|
||||||
print("-" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
oversized_chunks = []
|
|
||||||
|
|
||||||
print("Analyse en cours...")
|
|
||||||
for chunk in chunk_v2.iterator(include_vector=False):
|
|
||||||
props = chunk.properties
|
|
||||||
text = props.get('text', '')
|
|
||||||
tokens = estimate_tokens(text)
|
|
||||||
|
|
||||||
if tokens > TOKEN_THRESHOLD:
|
|
||||||
oversized_chunks.append({
|
|
||||||
'uuid': str(chunk.uuid),
|
|
||||||
'tokens': tokens,
|
|
||||||
'chars': len(text),
|
|
||||||
'text': text,
|
|
||||||
'workTitle': props.get('workTitle', ''),
|
|
||||||
'workAuthor': props.get('workAuthor', ''),
|
|
||||||
'year': props.get('year', 0),
|
|
||||||
'language': props.get('language', 'en'),
|
|
||||||
'sectionPath': props.get('sectionPath', ''),
|
|
||||||
'chapterTitle': props.get('chapterTitle', ''),
|
|
||||||
'canonicalReference': props.get('canonicalReference', ''),
|
|
||||||
'unitType': props.get('unitType', 'main_content'),
|
|
||||||
'keywords': props.get('keywords', []),
|
|
||||||
'orderIndex': props.get('orderIndex', 0),
|
|
||||||
})
|
|
||||||
|
|
||||||
print(f"✓ {len(oversized_chunks)} chunks > {TOKEN_THRESHOLD} tokens trouvés")
|
|
||||||
print()
|
|
||||||
|
|
||||||
if not oversized_chunks:
|
|
||||||
print("✅ Aucun chunk surdimensionné à traiter")
|
|
||||||
print()
|
|
||||||
print("=" * 80)
|
|
||||||
print("SCRIPT TERMINÉ - RIEN À FAIRE")
|
|
||||||
print("=" * 80)
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
# Trier par taille
|
|
||||||
oversized_chunks.sort(key=lambda x: x['tokens'], reverse=True)
|
|
||||||
|
|
||||||
print("Top 5 plus gros chunks:")
|
|
||||||
for i, chunk in enumerate(oversized_chunks[:5], 1):
|
|
||||||
print(f"{i}. {chunk['tokens']:,} tokens ({chunk['chars']:,} chars)")
|
|
||||||
print(f" Œuvre: {chunk['workTitle']}")
|
|
||||||
print(f" Section: {chunk['sectionPath'][:60]}...")
|
|
||||||
print()
|
|
||||||
|
|
||||||
if len(oversized_chunks) > 5:
|
|
||||||
print(f"... et {len(oversized_chunks) - 5} autres")
|
|
||||||
|
|
||||||
print()
|
|
||||||
|
|
||||||
# ========== 2. RE-CHUNKING ==========
|
|
||||||
print("2. RE-CHUNKING AVEC OVERLAP")
|
|
||||||
print("-" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Build work_title -> work_uuid map for references
|
|
||||||
work_map = {}
|
|
||||||
for work in work_collection.iterator(include_vector=False):
|
|
||||||
props = work.properties
|
|
||||||
title = props.get("title")
|
|
||||||
if title:
|
|
||||||
work_map[title] = str(work.uuid)
|
|
||||||
|
|
||||||
print(f"✓ {len(work_map)} Works mappés")
|
|
||||||
print()
|
|
||||||
|
|
||||||
deleted_count = 0
|
|
||||||
inserted_count = 0
|
|
||||||
errors = []
|
|
||||||
|
|
||||||
# Create iterator with or without tqdm
|
|
||||||
if HAS_TQDM:
|
|
||||||
iterator = tqdm(
|
|
||||||
oversized_chunks,
|
|
||||||
desc="Re-chunking",
|
|
||||||
unit="chunks"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
iterator = oversized_chunks
|
|
||||||
print("Re-chunking en cours...")
|
|
||||||
|
|
||||||
for idx, old_chunk in enumerate(iterator, 1):
|
|
||||||
try:
|
|
||||||
# Re-chunk text
|
|
||||||
new_texts = simple_chunk_with_overlap(
|
|
||||||
old_chunk['text'],
|
|
||||||
max_words=MAX_WORDS,
|
|
||||||
overlap_words=OVERLAP_WORDS
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get work reference
|
|
||||||
work_uuid = work_map.get(old_chunk['workTitle'])
|
|
||||||
if not work_uuid:
|
|
||||||
errors.append(f"Chunk {old_chunk['uuid'][:8]}: Work '{old_chunk['workTitle']}' introuvable")
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Insert new chunks
|
|
||||||
for i, new_text in enumerate(new_texts):
|
|
||||||
# Sub-ordering: multiply base index by 100 and add part index
|
|
||||||
# Example: orderIndex=5 becomes 500, 501, 502, etc.
|
|
||||||
new_order_index = (old_chunk['orderIndex'] * 100) + i
|
|
||||||
|
|
||||||
new_props = {
|
|
||||||
"text": new_text,
|
|
||||||
"summary": "", # Empty summary for simple chunks
|
|
||||||
"keywords": old_chunk['keywords'],
|
|
||||||
"workTitle": old_chunk['workTitle'],
|
|
||||||
"workAuthor": old_chunk['workAuthor'],
|
|
||||||
"year": old_chunk['year'],
|
|
||||||
"language": old_chunk['language'],
|
|
||||||
"sectionPath": old_chunk['sectionPath'],
|
|
||||||
"chapterTitle": old_chunk['chapterTitle'],
|
|
||||||
"canonicalReference": old_chunk['canonicalReference'],
|
|
||||||
"unitType": old_chunk['unitType'],
|
|
||||||
"orderIndex": new_order_index,
|
|
||||||
}
|
|
||||||
|
|
||||||
chunk_v2.data.insert(
|
|
||||||
properties=new_props,
|
|
||||||
references={"work": work_uuid}
|
|
||||||
)
|
|
||||||
inserted_count += 1
|
|
||||||
|
|
||||||
# Delete old chunk
|
|
||||||
chunk_v2.data.delete_by_id(old_chunk['uuid'])
|
|
||||||
deleted_count += 1
|
|
||||||
|
|
||||||
# Progress without tqdm
|
|
||||||
if not HAS_TQDM and idx % 5 == 0:
|
|
||||||
print(f" {idx}/{len(oversized_chunks)} chunks traités...")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
errors.append(f"Chunk {old_chunk['uuid'][:8]}: {e}")
|
|
||||||
|
|
||||||
print()
|
|
||||||
print("-" * 80)
|
|
||||||
print(f"✓ Chunks supprimés: {deleted_count}")
|
|
||||||
print(f"✓ Nouveaux chunks créés: {inserted_count}")
|
|
||||||
if deleted_count > 0:
|
|
||||||
print(f" Expansion moyenne: {inserted_count / deleted_count:.1f}x")
|
|
||||||
else:
|
|
||||||
print(f" ⚠️ Aucun chunk supprimé - vérifier les erreurs")
|
|
||||||
|
|
||||||
if errors:
|
|
||||||
print()
|
|
||||||
print(f"⚠️ Erreurs rencontrées: {len(errors)}")
|
|
||||||
for err in errors[:10]:
|
|
||||||
print(f" - {err}")
|
|
||||||
if len(errors) > 10:
|
|
||||||
print(f" ... et {len(errors) - 10} autres")
|
|
||||||
|
|
||||||
print()
|
|
||||||
|
|
||||||
# ========== 3. VÉRIFICATION ==========
|
|
||||||
print("3. VÉRIFICATION POST-RECHUNKING")
|
|
||||||
print("-" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
print("Comptage des nouveaux chunks...")
|
|
||||||
remaining_oversized = 0
|
|
||||||
total_chunks = 0
|
|
||||||
|
|
||||||
for chunk in chunk_v2.iterator(include_vector=False):
|
|
||||||
total_chunks += 1
|
|
||||||
text = chunk.properties.get('text', '')
|
|
||||||
tokens = estimate_tokens(text)
|
|
||||||
if tokens > TOKEN_THRESHOLD:
|
|
||||||
remaining_oversized += 1
|
|
||||||
|
|
||||||
print(f"✓ Total chunks: {total_chunks:,}")
|
|
||||||
print(f"✓ Chunks > {TOKEN_THRESHOLD} tokens: {remaining_oversized}")
|
|
||||||
|
|
||||||
if remaining_oversized == 0:
|
|
||||||
print()
|
|
||||||
print("✅ Aucun chunk surdimensionné restant!")
|
|
||||||
else:
|
|
||||||
print()
|
|
||||||
print(f"⚠️ {remaining_oversized} chunks encore > {TOKEN_THRESHOLD} tokens")
|
|
||||||
print(" Relancer le script si nécessaire")
|
|
||||||
|
|
||||||
print()
|
|
||||||
print("=" * 80)
|
|
||||||
print("RE-CHUNKING TERMINÉ")
|
|
||||||
print("=" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
print("RÉSULTATS:")
|
|
||||||
print(f" • Chunks supprimés: {deleted_count}")
|
|
||||||
print(f" • Nouveaux chunks créés: {inserted_count}")
|
|
||||||
if deleted_count > 0:
|
|
||||||
print(f" • Expansion: {inserted_count / deleted_count:.1f}x")
|
|
||||||
print(f" • Chunks restants > {TOKEN_THRESHOLD} tokens: {remaining_oversized}")
|
|
||||||
print()
|
|
||||||
|
|
||||||
if remaining_oversized == 0 and deleted_count > 0:
|
|
||||||
print("✅ RE-CHUNKING RÉUSSI")
|
|
||||||
print()
|
|
||||||
print("AMÉLIORATIONS:")
|
|
||||||
print(f" • {deleted_count} chunks géants éliminés")
|
|
||||||
print(f" • {inserted_count} chunks optimaux créés")
|
|
||||||
print(f" • Taille max: {MAX_WORDS} mots (~{MAX_WORDS * 2.5:.0f} tokens)")
|
|
||||||
print(f" • Overlap: {OVERLAP_WORDS} mots (contexte préservé)")
|
|
||||||
print()
|
|
||||||
print("PROCHAINES ÉTAPES:")
|
|
||||||
print(" 1. Tester la recherche sémantique")
|
|
||||||
print(" 2. Vérifier la qualité des vecteurs")
|
|
||||||
print(" 3. Optionnel: Mettre à jour Summary_v2.chunksCount si nécessaire")
|
|
||||||
elif deleted_count == 0:
|
|
||||||
print("ℹ️ Aucun chunk n'a nécessité de re-chunking")
|
|
||||||
|
|
||||||
finally:
|
|
||||||
client.close()
|
|
||||||
@@ -1,402 +0,0 @@
|
|||||||
"""Test search quality with re-chunked data.
|
|
||||||
|
|
||||||
This script tests semantic search to verify that the re-chunking improved
|
|
||||||
search quality and relevance.
|
|
||||||
|
|
||||||
Tests:
|
|
||||||
1. Chunk size distribution after re-chunking
|
|
||||||
2. Overlap verification between consecutive chunks
|
|
||||||
3. Semantic search quality on various queries
|
|
||||||
4. Comparison of results from giant chunks vs optimized chunks
|
|
||||||
"""
|
|
||||||
|
|
||||||
import weaviate
|
|
||||||
import sys
|
|
||||||
import requests
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
# Add utils to path
|
|
||||||
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))
|
|
||||||
|
|
||||||
from utils.llm_chunker_improved import estimate_tokens
|
|
||||||
|
|
||||||
if sys.stdout.encoding != 'utf-8':
|
|
||||||
sys.stdout.reconfigure(encoding='utf-8')
|
|
||||||
|
|
||||||
# Vectorizer URL (same as in 11_vectorize_missing_chunks.py)
|
|
||||||
VECTORIZER_URL = "http://localhost:8090/vectors"
|
|
||||||
|
|
||||||
def vectorize_query(query: str) -> list[float]:
|
|
||||||
"""Manually vectorize a query using text2vec-transformers service.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
query: Query text to vectorize
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Vector as list of floats (1024 dimensions for BGE-M3)
|
|
||||||
"""
|
|
||||||
response = requests.post(
|
|
||||||
VECTORIZER_URL,
|
|
||||||
json={"text": query},
|
|
||||||
headers={"Content-Type": "application/json"},
|
|
||||||
timeout=30
|
|
||||||
)
|
|
||||||
if response.status_code != 200:
|
|
||||||
raise RuntimeError(f"Vectorization failed: HTTP {response.status_code}")
|
|
||||||
|
|
||||||
result = response.json()
|
|
||||||
vector = result.get('vector')
|
|
||||||
if not vector:
|
|
||||||
raise RuntimeError("No vector in response")
|
|
||||||
|
|
||||||
return vector
|
|
||||||
|
|
||||||
client = weaviate.connect_to_local()
|
|
||||||
|
|
||||||
try:
|
|
||||||
print("=" * 80)
|
|
||||||
print("TEST DE LA QUALITÉ DE RECHERCHE APRÈS RE-CHUNKING")
|
|
||||||
print("=" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
chunk_v2 = client.collections.get("Chunk_v2")
|
|
||||||
|
|
||||||
# ========== 1. DISTRIBUTION DES TAILLES ==========
|
|
||||||
print("1. DISTRIBUTION DES TAILLES DE CHUNKS")
|
|
||||||
print("-" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
print("Analyse en cours...")
|
|
||||||
|
|
||||||
sizes = []
|
|
||||||
for chunk in chunk_v2.iterator(include_vector=False):
|
|
||||||
text = chunk.properties.get('text', '')
|
|
||||||
tokens = estimate_tokens(text)
|
|
||||||
sizes.append(tokens)
|
|
||||||
|
|
||||||
total = len(sizes)
|
|
||||||
avg = sum(sizes) / total
|
|
||||||
max_size = max(sizes)
|
|
||||||
min_size = min(sizes)
|
|
||||||
|
|
||||||
print(f"Total chunks: {total:,}")
|
|
||||||
print(f"Taille moyenne: {avg:.0f} tokens")
|
|
||||||
print(f"Min: {min_size} tokens")
|
|
||||||
print(f"Max: {max_size} tokens")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Distribution par tranches
|
|
||||||
ranges = [
|
|
||||||
(0, 500, "Très petits"),
|
|
||||||
(500, 1000, "Petits"),
|
|
||||||
(1000, 1500, "Moyens"),
|
|
||||||
(1500, 2000, "Grands"),
|
|
||||||
(2000, 3000, "Très grands"),
|
|
||||||
(3000, 10000, "ÉNORMES"),
|
|
||||||
]
|
|
||||||
|
|
||||||
print("Distribution par tranches:")
|
|
||||||
for min_tok, max_tok, label in ranges:
|
|
||||||
count = sum(1 for s in sizes if min_tok <= s < max_tok)
|
|
||||||
percentage = count / total * 100
|
|
||||||
bar = "█" * int(percentage / 2)
|
|
||||||
print(f" {min_tok:>5}-{max_tok:>5} tokens ({label:15}): {count:>5} ({percentage:>5.1f}%) {bar}")
|
|
||||||
|
|
||||||
print()
|
|
||||||
|
|
||||||
# ========== 2. VÉRIFICATION OVERLAP ==========
|
|
||||||
print("2. VÉRIFICATION DE L'OVERLAP ENTRE CHUNKS CONSÉCUTIFS")
|
|
||||||
print("-" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Prendre une œuvre pour vérifier l'overlap
|
|
||||||
print("Analyse de l'overlap dans 'Between Past and Future'...")
|
|
||||||
|
|
||||||
arendt_chunks = []
|
|
||||||
for chunk in chunk_v2.iterator(include_vector=False):
|
|
||||||
props = chunk.properties
|
|
||||||
if props.get('workTitle') == 'Between Past and Future':
|
|
||||||
arendt_chunks.append({
|
|
||||||
'orderIndex': props.get('orderIndex', 0),
|
|
||||||
'text': props.get('text', ''),
|
|
||||||
'sectionPath': props.get('sectionPath', '')
|
|
||||||
})
|
|
||||||
|
|
||||||
# Trier par orderIndex
|
|
||||||
arendt_chunks.sort(key=lambda x: x['orderIndex'])
|
|
||||||
|
|
||||||
print(f"Chunks trouvés: {len(arendt_chunks)}")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Vérifier overlap entre chunks consécutifs de même section
|
|
||||||
overlaps_found = 0
|
|
||||||
overlaps_checked = 0
|
|
||||||
|
|
||||||
for i in range(len(arendt_chunks) - 1):
|
|
||||||
current = arendt_chunks[i]
|
|
||||||
next_chunk = arendt_chunks[i + 1]
|
|
||||||
|
|
||||||
# Vérifier si même section (potentiellement des chunks split)
|
|
||||||
if current['sectionPath'] == next_chunk['sectionPath']:
|
|
||||||
# Extraire les derniers 200 caractères du chunk actuel
|
|
||||||
current_end = current['text'][-200:].strip()
|
|
||||||
# Extraire les premiers 200 caractères du chunk suivant
|
|
||||||
next_start = next_chunk['text'][:200].strip()
|
|
||||||
|
|
||||||
# Chercher overlap
|
|
||||||
overlap_found = False
|
|
||||||
for length in range(50, 201, 10): # Tester différentes longueurs
|
|
||||||
if len(current_end) < length or len(next_start) < length:
|
|
||||||
continue
|
|
||||||
|
|
||||||
test_end = current_end[-length:]
|
|
||||||
if test_end in next_start:
|
|
||||||
overlap_found = True
|
|
||||||
overlaps_found += 1
|
|
||||||
break
|
|
||||||
|
|
||||||
overlaps_checked += 1
|
|
||||||
|
|
||||||
if overlaps_checked > 0:
|
|
||||||
print(f"Chunks consécutifs vérifiés: {overlaps_checked}")
|
|
||||||
print(f"Overlaps détectés: {overlaps_found} ({overlaps_found/overlaps_checked*100:.1f}%)")
|
|
||||||
else:
|
|
||||||
print("Aucun chunk consécutif dans la même section (pas de split détecté)")
|
|
||||||
|
|
||||||
print()
|
|
||||||
|
|
||||||
# ========== 3. TESTS DE RECHERCHE SÉMANTIQUE ==========
|
|
||||||
print("3. TESTS DE RECHERCHE SÉMANTIQUE")
|
|
||||||
print("-" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
test_queries = [
|
|
||||||
{
|
|
||||||
"query": "What is the nature of representation in cognitive science?",
|
|
||||||
"expected_work": "Mind Design III",
|
|
||||||
"description": "Requête philosophique complexe"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"query": "Comment définit-on la vertu selon Platon?",
|
|
||||||
"expected_work": "Platon - Ménon",
|
|
||||||
"description": "Requête en français sur un concept spécifique"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"query": "pragmatism and belief fixation",
|
|
||||||
"expected_work": "Collected papers",
|
|
||||||
"description": "Concepts multiples (test de granularité)"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"query": "Entre la logique des termes et la grammaire spéculative",
|
|
||||||
"expected_work": "La pensée-signe",
|
|
||||||
"description": "Requête technique académique"
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
for i, test in enumerate(test_queries, 1):
|
|
||||||
print(f"Test {i}: {test['description']}")
|
|
||||||
print(f"Query: \"{test['query']}\"")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Vectorize query and search with near_vector
|
|
||||||
# (Chunk_v2 has no vectorizer, so we must manually vectorize queries)
|
|
||||||
query_vector = vectorize_query(test['query'])
|
|
||||||
result = chunk_v2.query.near_vector(
|
|
||||||
near_vector=query_vector,
|
|
||||||
limit=5,
|
|
||||||
return_properties=[
|
|
||||||
'text', 'workTitle', 'workAuthor',
|
|
||||||
'sectionPath', 'chapterTitle'
|
|
||||||
],
|
|
||||||
return_metadata=['distance']
|
|
||||||
)
|
|
||||||
|
|
||||||
if not result.objects:
|
|
||||||
print(" ❌ Aucun résultat trouvé")
|
|
||||||
print()
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Analyser les résultats
|
|
||||||
print(f" Résultats: {len(result.objects)}")
|
|
||||||
print()
|
|
||||||
|
|
||||||
for j, obj in enumerate(result.objects, 1):
|
|
||||||
props = obj.properties
|
|
||||||
work_title = props.get('workTitle', 'N/A')
|
|
||||||
text = props.get('text', '')
|
|
||||||
tokens = estimate_tokens(text)
|
|
||||||
|
|
||||||
# Distance (si disponible)
|
|
||||||
distance = getattr(obj.metadata, 'distance', None) if hasattr(obj, 'metadata') else None
|
|
||||||
distance_str = f" (distance: {distance:.4f})" if distance else ""
|
|
||||||
|
|
||||||
# Marquer si c'est l'œuvre attendue
|
|
||||||
match_icon = "✓" if test['expected_work'] in work_title else " "
|
|
||||||
|
|
||||||
print(f" [{match_icon}] {j}. {work_title}{distance_str}")
|
|
||||||
print(f" Taille: {tokens} tokens")
|
|
||||||
print(f" Section: {props.get('sectionPath', 'N/A')[:60]}...")
|
|
||||||
print(f" Extrait: {text[:120]}...")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Vérifier si l'œuvre attendue est dans les résultats
|
|
||||||
found_expected = any(
|
|
||||||
test['expected_work'] in obj.properties.get('workTitle', '')
|
|
||||||
for obj in result.objects
|
|
||||||
)
|
|
||||||
|
|
||||||
if found_expected:
|
|
||||||
rank = next(
|
|
||||||
i for i, obj in enumerate(result.objects, 1)
|
|
||||||
if test['expected_work'] in obj.properties.get('workTitle', '')
|
|
||||||
)
|
|
||||||
print(f" ✅ Œuvre attendue trouvée (rang {rank}/5)")
|
|
||||||
else:
|
|
||||||
print(f" ⚠️ Œuvre attendue '{test['expected_work']}' non trouvée dans le top 5")
|
|
||||||
|
|
||||||
print()
|
|
||||||
print("-" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
# ========== 4. STATISTIQUES GLOBALES ==========
|
|
||||||
print("4. STATISTIQUES GLOBALES DE RECHERCHE")
|
|
||||||
print("-" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Tester une requête large
|
|
||||||
broad_query = "philosophy and logic"
|
|
||||||
print(f"Requête large: \"{broad_query}\"")
|
|
||||||
print()
|
|
||||||
|
|
||||||
query_vector = vectorize_query(broad_query)
|
|
||||||
result = chunk_v2.query.near_vector(
|
|
||||||
near_vector=query_vector,
|
|
||||||
limit=20,
|
|
||||||
return_properties=['workTitle', 'text']
|
|
||||||
)
|
|
||||||
|
|
||||||
# Compter par œuvre
|
|
||||||
work_distribution = {}
|
|
||||||
chunk_sizes_in_results = []
|
|
||||||
|
|
||||||
for obj in result.objects:
|
|
||||||
props = obj.properties
|
|
||||||
work = props.get('workTitle', 'Unknown')
|
|
||||||
work_distribution[work] = work_distribution.get(work, 0) + 1
|
|
||||||
|
|
||||||
text = props.get('text', '')
|
|
||||||
tokens = estimate_tokens(text)
|
|
||||||
chunk_sizes_in_results.append(tokens)
|
|
||||||
|
|
||||||
print(f"Résultats par œuvre (top 20):")
|
|
||||||
for work, count in sorted(work_distribution.items(), key=lambda x: x[1], reverse=True):
|
|
||||||
print(f" • {work}: {count} chunks")
|
|
||||||
|
|
||||||
print()
|
|
||||||
|
|
||||||
if chunk_sizes_in_results:
|
|
||||||
avg_result_size = sum(chunk_sizes_in_results) / len(chunk_sizes_in_results)
|
|
||||||
max_result_size = max(chunk_sizes_in_results)
|
|
||||||
print(f"Taille moyenne des chunks retournés: {avg_result_size:.0f} tokens")
|
|
||||||
print(f"Taille max des chunks retournés: {max_result_size} tokens")
|
|
||||||
|
|
||||||
print()
|
|
||||||
|
|
||||||
# ========== 5. SCORE DE QUALITÉ ==========
|
|
||||||
print("5. SCORE DE QUALITÉ DE LA RECHERCHE")
|
|
||||||
print("-" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
quality_checks = []
|
|
||||||
|
|
||||||
# Check 1: Aucun chunk > 2000 tokens
|
|
||||||
oversized = sum(1 for s in sizes if s > 2000)
|
|
||||||
quality_checks.append({
|
|
||||||
'name': 'Taille des chunks',
|
|
||||||
'passed': oversized == 0,
|
|
||||||
'detail': f'{oversized} chunks > 2000 tokens'
|
|
||||||
})
|
|
||||||
|
|
||||||
# Check 2: Distribution équilibrée
|
|
||||||
optimal_range = sum(1 for s in sizes if 200 <= s <= 1500)
|
|
||||||
optimal_percentage = optimal_range / total * 100
|
|
||||||
quality_checks.append({
|
|
||||||
'name': 'Distribution optimale',
|
|
||||||
'passed': optimal_percentage >= 80,
|
|
||||||
'detail': f'{optimal_percentage:.1f}% dans range optimal (200-1500 tokens)'
|
|
||||||
})
|
|
||||||
|
|
||||||
# Check 3: Résultats variés
|
|
||||||
unique_works = len(work_distribution)
|
|
||||||
quality_checks.append({
|
|
||||||
'name': 'Diversité des résultats',
|
|
||||||
'passed': unique_works >= 3,
|
|
||||||
'detail': f'{unique_works} œuvres différentes dans top 20'
|
|
||||||
})
|
|
||||||
|
|
||||||
# Check 4: Overlap présent
|
|
||||||
quality_checks.append({
|
|
||||||
'name': 'Overlap entre chunks',
|
|
||||||
'passed': overlaps_found > 0 if overlaps_checked > 0 else None,
|
|
||||||
'detail': f'{overlaps_found}/{overlaps_checked} overlaps détectés' if overlaps_checked > 0 else 'N/A'
|
|
||||||
})
|
|
||||||
|
|
||||||
# Afficher les résultats
|
|
||||||
passed = sum(1 for c in quality_checks if c['passed'] is True)
|
|
||||||
total_checks = sum(1 for c in quality_checks if c['passed'] is not None)
|
|
||||||
|
|
||||||
for check in quality_checks:
|
|
||||||
if check['passed'] is None:
|
|
||||||
icon = "⚠️"
|
|
||||||
status = "N/A"
|
|
||||||
elif check['passed']:
|
|
||||||
icon = "✅"
|
|
||||||
status = "OK"
|
|
||||||
else:
|
|
||||||
icon = "❌"
|
|
||||||
status = "FAIL"
|
|
||||||
|
|
||||||
print(f"{icon} {check['name']}: {status}")
|
|
||||||
print(f" {check['detail']}")
|
|
||||||
|
|
||||||
print()
|
|
||||||
print(f"Score: {passed}/{total_checks} ({passed/total_checks*100:.0f}%)")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# ========== 6. RÉSUMÉ ==========
|
|
||||||
print("=" * 80)
|
|
||||||
print("RÉSUMÉ DU TEST")
|
|
||||||
print("=" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
if passed >= total_checks * 0.8:
|
|
||||||
print("✅ QUALITÉ DE RECHERCHE: EXCELLENTE")
|
|
||||||
print()
|
|
||||||
print("Les chunks re-chunkés ont amélioré la recherche:")
|
|
||||||
print(f" • {total:,} chunks optimisés")
|
|
||||||
print(f" • Taille moyenne: {avg:.0f} tokens (optimal)")
|
|
||||||
print(f" • {optimal_percentage:.1f}% dans la plage optimale")
|
|
||||||
print(f" • Max: {max_size} tokens (< 2500)")
|
|
||||||
print(f" • Overlap détecté: {overlaps_found > 0 if overlaps_checked > 0 else 'N/A'}")
|
|
||||||
print()
|
|
||||||
print("Recommandations:")
|
|
||||||
print(" ✓ La recherche sémantique fonctionne correctement")
|
|
||||||
print(" ✓ Les chunks sont de taille optimale pour BGE-M3")
|
|
||||||
print(" ✓ Le système est prêt pour la production")
|
|
||||||
elif passed >= total_checks * 0.6:
|
|
||||||
print("⚠️ QUALITÉ DE RECHERCHE: BONNE")
|
|
||||||
print()
|
|
||||||
print("Quelques améliorations possibles:")
|
|
||||||
for check in quality_checks:
|
|
||||||
if not check['passed'] and check['passed'] is not None:
|
|
||||||
print(f" • {check['name']}: {check['detail']}")
|
|
||||||
else:
|
|
||||||
print("❌ QUALITÉ DE RECHERCHE: À AMÉLIORER")
|
|
||||||
print()
|
|
||||||
print("Problèmes détectés:")
|
|
||||||
for check in quality_checks:
|
|
||||||
if not check['passed'] and check['passed'] is not None:
|
|
||||||
print(f" • {check['name']}: {check['detail']}")
|
|
||||||
|
|
||||||
finally:
|
|
||||||
client.close()
|
|
||||||
@@ -1,217 +0,0 @@
|
|||||||
"""Vectorize chunks that don't have vectors.
|
|
||||||
|
|
||||||
After re-chunking, new chunks were created without vectors because Chunk_v2
|
|
||||||
collection has no vectorizer configured. This script manually vectorizes
|
|
||||||
these chunks using the text2vec-transformers service.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import weaviate
|
|
||||||
import sys
|
|
||||||
import requests
|
|
||||||
|
|
||||||
if sys.stdout.encoding != 'utf-8':
|
|
||||||
sys.stdout.reconfigure(encoding='utf-8')
|
|
||||||
|
|
||||||
# Try to import tqdm
|
|
||||||
try:
|
|
||||||
from tqdm import tqdm
|
|
||||||
HAS_TQDM = True
|
|
||||||
except ImportError:
|
|
||||||
HAS_TQDM = False
|
|
||||||
|
|
||||||
# Text2vec-transformers service URL (from docker-compose.yml)
|
|
||||||
VECTORIZER_URL = "http://localhost:8090/vectors"
|
|
||||||
|
|
||||||
client = weaviate.connect_to_local()
|
|
||||||
|
|
||||||
try:
|
|
||||||
print("=" * 80)
|
|
||||||
print("VECTORISATION DES CHUNKS SANS VECTEUR")
|
|
||||||
print("=" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
chunk_v2 = client.collections.get("Chunk_v2")
|
|
||||||
|
|
||||||
# ========== 1. IDENTIFIER LES CHUNKS SANS VECTEUR ==========
|
|
||||||
print("1. IDENTIFICATION DES CHUNKS SANS VECTEUR")
|
|
||||||
print("-" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
print("Analyse en cours...")
|
|
||||||
|
|
||||||
chunks_to_vectorize = []
|
|
||||||
|
|
||||||
for chunk in chunk_v2.iterator(include_vector=True):
|
|
||||||
if not chunk.vector or not chunk.vector.get('default'):
|
|
||||||
props = chunk.properties
|
|
||||||
chunks_to_vectorize.append({
|
|
||||||
'uuid': chunk.uuid,
|
|
||||||
'text': props.get('text', ''),
|
|
||||||
'summary': props.get('summary', ''),
|
|
||||||
'keywords': props.get('keywords', []),
|
|
||||||
'workTitle': props.get('workTitle', 'N/A')
|
|
||||||
})
|
|
||||||
|
|
||||||
print(f"✓ {len(chunks_to_vectorize)} chunks sans vecteur trouvés")
|
|
||||||
print()
|
|
||||||
|
|
||||||
if not chunks_to_vectorize:
|
|
||||||
print("✅ Aucun chunk à vectoriser")
|
|
||||||
print()
|
|
||||||
print("=" * 80)
|
|
||||||
print("SCRIPT TERMINÉ - RIEN À FAIRE")
|
|
||||||
print("=" * 80)
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
# ========== 2. VECTORISATION ==========
|
|
||||||
print("2. VECTORISATION DES CHUNKS")
|
|
||||||
print("-" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
print(f"Service vectorizer: {VECTORIZER_URL}")
|
|
||||||
print()
|
|
||||||
|
|
||||||
vectorized_count = 0
|
|
||||||
errors = []
|
|
||||||
|
|
||||||
# Create iterator with or without tqdm
|
|
||||||
if HAS_TQDM:
|
|
||||||
iterator = tqdm(
|
|
||||||
chunks_to_vectorize,
|
|
||||||
desc="Vectorisation",
|
|
||||||
unit="chunks"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
iterator = chunks_to_vectorize
|
|
||||||
print("Vectorisation en cours...")
|
|
||||||
|
|
||||||
for idx, chunk_data in enumerate(iterator, 1):
|
|
||||||
try:
|
|
||||||
# Prepare text for vectorization
|
|
||||||
# Combine text, summary, and keywords as per original Chunk schema
|
|
||||||
text_parts = [chunk_data['text']]
|
|
||||||
|
|
||||||
if chunk_data['summary']:
|
|
||||||
text_parts.append(chunk_data['summary'])
|
|
||||||
|
|
||||||
if chunk_data['keywords']:
|
|
||||||
text_parts.append(' '.join(chunk_data['keywords']))
|
|
||||||
|
|
||||||
combined_text = ' '.join(text_parts)
|
|
||||||
|
|
||||||
# Call text2vec-transformers service
|
|
||||||
response = requests.post(
|
|
||||||
VECTORIZER_URL,
|
|
||||||
json={"text": combined_text},
|
|
||||||
headers={"Content-Type": "application/json"},
|
|
||||||
timeout=30
|
|
||||||
)
|
|
||||||
|
|
||||||
if response.status_code != 200:
|
|
||||||
errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: HTTP {response.status_code}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
result = response.json()
|
|
||||||
vector = result.get('vector')
|
|
||||||
|
|
||||||
if not vector:
|
|
||||||
errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: Pas de vecteur dans la réponse")
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Update chunk with vector
|
|
||||||
chunk_v2.data.update(
|
|
||||||
uuid=chunk_data['uuid'],
|
|
||||||
vector=vector
|
|
||||||
)
|
|
||||||
|
|
||||||
vectorized_count += 1
|
|
||||||
|
|
||||||
# Progress without tqdm
|
|
||||||
if not HAS_TQDM and idx % 10 == 0:
|
|
||||||
print(f" {idx}/{len(chunks_to_vectorize)} chunks vectorisés...")
|
|
||||||
|
|
||||||
except requests.exceptions.RequestException as e:
|
|
||||||
errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: Erreur réseau - {e}")
|
|
||||||
except Exception as e:
|
|
||||||
errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: {e}")
|
|
||||||
|
|
||||||
print()
|
|
||||||
print("-" * 80)
|
|
||||||
print(f"✓ Chunks vectorisés: {vectorized_count}/{len(chunks_to_vectorize)}")
|
|
||||||
|
|
||||||
if errors:
|
|
||||||
print()
|
|
||||||
print(f"⚠️ Erreurs rencontrées: {len(errors)}")
|
|
||||||
for err in errors[:10]:
|
|
||||||
print(f" - {err}")
|
|
||||||
if len(errors) > 10:
|
|
||||||
print(f" ... et {len(errors) - 10} autres")
|
|
||||||
|
|
||||||
print()
|
|
||||||
|
|
||||||
# ========== 3. VÉRIFICATION ==========
|
|
||||||
print("3. VÉRIFICATION POST-VECTORISATION")
|
|
||||||
print("-" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
print("Recomptage...")
|
|
||||||
remaining_without_vector = 0
|
|
||||||
total_chunks = 0
|
|
||||||
|
|
||||||
for chunk in chunk_v2.iterator(include_vector=True):
|
|
||||||
total_chunks += 1
|
|
||||||
if not chunk.vector or not chunk.vector.get('default'):
|
|
||||||
remaining_without_vector += 1
|
|
||||||
|
|
||||||
chunks_with_vector = total_chunks - remaining_without_vector
|
|
||||||
|
|
||||||
print(f"✓ Total chunks: {total_chunks:,}")
|
|
||||||
print(f"✓ Avec vecteur: {chunks_with_vector:,} ({chunks_with_vector/total_chunks*100:.1f}%)")
|
|
||||||
print(f"✓ Sans vecteur: {remaining_without_vector:,}")
|
|
||||||
|
|
||||||
print()
|
|
||||||
|
|
||||||
if remaining_without_vector == 0:
|
|
||||||
print("✅ Tous les chunks ont été vectorisés!")
|
|
||||||
else:
|
|
||||||
print(f"⚠️ {remaining_without_vector} chunks encore sans vecteur")
|
|
||||||
print(" Relancer le script ou vérifier les erreurs")
|
|
||||||
|
|
||||||
print()
|
|
||||||
print("=" * 80)
|
|
||||||
print("VECTORISATION TERMINÉE")
|
|
||||||
print("=" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
if remaining_without_vector == 0:
|
|
||||||
print("✅ VECTORISATION RÉUSSIE")
|
|
||||||
print()
|
|
||||||
print("RÉSULTATS:")
|
|
||||||
print(f" • {vectorized_count} nouveaux vecteurs créés")
|
|
||||||
print(f" • {total_chunks:,} chunks totaux")
|
|
||||||
print(f" • 100% des chunks ont des vecteurs")
|
|
||||||
print()
|
|
||||||
print("PROCHAINES ÉTAPES:")
|
|
||||||
print(" 1. Relancer le test de recherche: python 10_test_search_quality.py")
|
|
||||||
print(" 2. Tester l'application Flask")
|
|
||||||
print()
|
|
||||||
print("NOTE: Chunk_v2 n'a toujours pas de vectorizer configuré.")
|
|
||||||
print("Les futurs nouveaux chunks devront être vectorisés manuellement")
|
|
||||||
print("OU la collection devra être recréée avec un vectorizer.")
|
|
||||||
elif vectorized_count > 0:
|
|
||||||
print("⚠️ VECTORISATION PARTIELLE")
|
|
||||||
print()
|
|
||||||
print(f" • {vectorized_count} chunks vectorisés")
|
|
||||||
print(f" • {remaining_without_vector} chunks restants")
|
|
||||||
print(" • Vérifier les erreurs et relancer")
|
|
||||||
else:
|
|
||||||
print("❌ VECTORISATION ÉCHOUÉE")
|
|
||||||
print()
|
|
||||||
print("Aucun chunk n'a pu être vectorisé.")
|
|
||||||
print("Vérifications:")
|
|
||||||
print(f" 1. Service text2vec-transformers actif: {VECTORIZER_URL}")
|
|
||||||
print(" 2. Docker containers en cours d'exécution")
|
|
||||||
print(" 3. Logs des erreurs ci-dessus")
|
|
||||||
|
|
||||||
finally:
|
|
||||||
client.close()
|
|
||||||
136
CHANGELOG.md
Normal file
136
CHANGELOG.md
Normal file
@@ -0,0 +1,136 @@
|
|||||||
|
# Changelog - Library RAG Project
|
||||||
|
|
||||||
|
## 2026-01-08 - Chunking Optimization & Vectorization
|
||||||
|
|
||||||
|
### Chunking Improvements
|
||||||
|
- **Strict chunk size limits**: Max 1000 words (down from 1500-2000)
|
||||||
|
- **Overlap implementation**: 100-word overlap between consecutive chunks
|
||||||
|
- **Triple fallback system**: Ensures robust chunking even on LLM failures
|
||||||
|
- **New module**: `llm_chunker_improved.py` with overlap functionality
|
||||||
|
|
||||||
|
### Re-chunking Results
|
||||||
|
- Identified 31 oversized chunks (>2000 tokens, max 7,158)
|
||||||
|
- Split into 92 optimally-sized chunks
|
||||||
|
- **Result**: 0 chunks > 2000 tokens (100% within BGE-M3 limits)
|
||||||
|
- Preserved all metadata during split (workTitle, workAuthor, sectionPath, orderIndex)
|
||||||
|
|
||||||
|
### Vectorization
|
||||||
|
- Created manual vectorization system for Chunk_v2 (no vectorizer configured)
|
||||||
|
- Successfully vectorized 92 new chunks via text2vec-transformers API
|
||||||
|
- **Result**: 5,304/5,304 chunks with vectors (100% coverage)
|
||||||
|
|
||||||
|
### Docker Configuration
|
||||||
|
- Exposed text2vec-transformers port (8090:8080) for external vectorization
|
||||||
|
- Added cluster configuration to fix "No private IP address found" error
|
||||||
|
- Increased WORKER_TIMEOUT to 600s for very large chunks
|
||||||
|
|
||||||
|
### Search Quality
|
||||||
|
- Created comprehensive test suite (`10_test_search_quality.py`)
|
||||||
|
- Tests: distribution, overlap detection, semantic search (4 queries)
|
||||||
|
- Search now uses `near_vector()` with manual query vectorization
|
||||||
|
- **Issue identified**: Collected papers dominates results (95.8% of chunks)
|
||||||
|
|
||||||
|
### Database Stats (Post-Optimization)
|
||||||
|
- Total chunks: 5,304
|
||||||
|
- Average size: 289 tokens (optimal for BGE-M3)
|
||||||
|
- Distribution: 84.6% < 500 tokens, 11.5% 500-1000, 3.0% 1000-1500
|
||||||
|
- Works: 8 (Collected papers: 5,080 chunks, Mind Design III: 61, Platon Ménon: 56, etc.)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2025-01 - Weaviate v2 Migration & GPU Integration
|
||||||
|
|
||||||
|
### Phase 1-3: Schema Migration (Complete)
|
||||||
|
- Migrated from Chunk/Summary/Document to Chunk_v2/Summary_v2/Work
|
||||||
|
- Removed nested `document` object, added direct properties (workTitle, workAuthor, year, language)
|
||||||
|
- Work collection with sourceId for documents
|
||||||
|
- Fixed 114 summaries missing properties
|
||||||
|
- Deleted vL-jepa chunks (17), fixed null workTitles
|
||||||
|
|
||||||
|
### Phase 4: Memory System (Complete)
|
||||||
|
- Added Thought/Message/Conversation collections to Weaviate
|
||||||
|
- 9 MCP tools for memory management (add_thought, search_thoughts, etc.)
|
||||||
|
- GPU embeddings integration (BAAI/bge-m3, RTX 4070)
|
||||||
|
- Data: 102 Thoughts, 377 Messages, 12 Conversations
|
||||||
|
|
||||||
|
### Phase 5: Backend Integration (Complete)
|
||||||
|
- Integrated GPU embedder into Flask app (singleton pattern)
|
||||||
|
- All search routes now use manual vectorization with `near_vector()`
|
||||||
|
- Updated all routes: simple_search, hierarchical_search, summary_only_search, rag_search
|
||||||
|
- Fixed Work → Chunk/Summary property mapping (v2 schema)
|
||||||
|
|
||||||
|
### Phase 6-7: Testing & Optimization
|
||||||
|
- Comprehensive testing of search routes
|
||||||
|
- MCP tools validation
|
||||||
|
- Performance optimization with GPU embeddings
|
||||||
|
- Documentation updates (README.md, CLAUDE.md)
|
||||||
|
|
||||||
|
### Phase 8: Documentation Cleanup
|
||||||
|
- Consolidated all phase documentation
|
||||||
|
- Updated README with Memory MCP tools section
|
||||||
|
- Cleaned up temporary files and scripts
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Archive Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
archive/
|
||||||
|
├── migration_scripts/ # Migration & optimization scripts (01-11)
|
||||||
|
│ ├── 01_migrate_document_to_work.py
|
||||||
|
│ ├── 02_create_schema_v2.py
|
||||||
|
│ ├── 03_migrate_chunks_v2.py
|
||||||
|
│ ├── 04_migrate_summaries_v2.py
|
||||||
|
│ ├── 05_validate_migration.py
|
||||||
|
│ ├── 07_cleanup.py
|
||||||
|
│ ├── 08_fix_summaries_properties.py
|
||||||
|
│ ├── 09_rechunk_oversized.py
|
||||||
|
│ ├── 10_test_search_quality.py
|
||||||
|
│ ├── 11_vectorize_missing_chunks.py
|
||||||
|
│ └── old_scripts/ # ChromaDB migration scripts
|
||||||
|
├── migration_docs/ # Detailed migration documentation
|
||||||
|
│ ├── PLAN_MIGRATION_V2_SANS_DOCUMENT.md
|
||||||
|
│ ├── PHASE5_BACKEND_INTEGRATION.md
|
||||||
|
│ └── WEAVIATE_RETRIEVAL_ARCHITECTURE.md
|
||||||
|
├── documentation/ # Phase summaries
|
||||||
|
│ ├── PHASE_0_PYTORCH_CUDA.md
|
||||||
|
│ ├── PHASE_2_MIGRATION_SUMMARY.md
|
||||||
|
│ ├── PHASE_3_CONVERSATIONS_SUMMARY.md
|
||||||
|
│ ├── PHASE_4_MIGRATION_CHROMADB.md
|
||||||
|
│ ├── PHASE_5_MCP_TOOLS.md
|
||||||
|
│ ├── PHASE_6_TESTS_OPTIMISATION.md
|
||||||
|
│ ├── PHASE_7_INTEGRATION_BACKEND.md
|
||||||
|
│ ├── PHASE_8_DOCUMENTATION_CLEANUP.md
|
||||||
|
│ └── MIGRATION_README.md
|
||||||
|
└── backups/ # Pre-migration data backups
|
||||||
|
└── pre_migration_20260108_152033/
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Technology Stack
|
||||||
|
|
||||||
|
**Vector Database**: Weaviate 1.34.4 with BAAI/bge-m3 embeddings (1024-dim)
|
||||||
|
**Embedder**: PyTorch 2.6.0+cu124, GPU RTX 4070
|
||||||
|
**Backend**: Flask 3.0 with Server-Sent Events
|
||||||
|
**MCP Integration**: 9 memory tools + 6 RAG tools for Claude Desktop
|
||||||
|
**OCR**: Mistral OCR API
|
||||||
|
**LLM**: Ollama (local) or Mistral API
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Known Issues
|
||||||
|
|
||||||
|
1. **Chunk_v2 has no vectorizer**: All new chunks require manual vectorization via `11_vectorize_missing_chunks.py`
|
||||||
|
2. **Data imbalance**: Collected papers represents 95.8% of chunks, dominating search results
|
||||||
|
3. **Mind Design III underrepresented**: Only 61 chunks (1.2%) vs 5,080 for Collected papers
|
||||||
|
|
||||||
|
## Recommendations
|
||||||
|
|
||||||
|
1. Add more diverse works to balance corpus
|
||||||
|
2. Consider re-ranking with per-work boosting for diversity
|
||||||
|
3. Recreate Chunk_v2 with text2vec-transformers vectorizer for auto-vectorization (requires full data reload)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
For detailed implementation notes, see `.claude/CLAUDE.md` and `archive/` directories.
|
||||||
@@ -1,174 +0,0 @@
|
|||||||
"""
|
|
||||||
Script pour vérifier l'état actuel des issues Linear du projet library_rag.
|
|
||||||
|
|
||||||
Affiche :
|
|
||||||
- Nombre total d'issues
|
|
||||||
- Nombre d'issues par statut (Todo, In Progress, Done)
|
|
||||||
- Liste des issues In Progress (si présentes)
|
|
||||||
- Liste des issues Todo avec priorité 1 ou 2
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import json
|
|
||||||
import requests
|
|
||||||
from pathlib import Path
|
|
||||||
from dotenv import load_dotenv
|
|
||||||
|
|
||||||
# Load environment variables
|
|
||||||
load_dotenv()
|
|
||||||
|
|
||||||
LINEAR_API_KEY = os.environ.get("LINEAR_API_KEY")
|
|
||||||
if not LINEAR_API_KEY:
|
|
||||||
print("❌ LINEAR_API_KEY not found in .env file")
|
|
||||||
exit(1)
|
|
||||||
|
|
||||||
# Read project info
|
|
||||||
project_file = Path("generations/library_rag/.linear_project.json")
|
|
||||||
if not project_file.exists():
|
|
||||||
print(f"❌ Project file not found: {project_file}")
|
|
||||||
exit(1)
|
|
||||||
|
|
||||||
with open(project_file) as f:
|
|
||||||
project_info = json.load(f)
|
|
||||||
|
|
||||||
project_id = project_info.get("project_id")
|
|
||||||
team_id = project_info.get("team_id")
|
|
||||||
total_issues_created = project_info.get("total_issues", 0)
|
|
||||||
|
|
||||||
print("=" * 80)
|
|
||||||
print(f"LINEAR STATUS CHECK - Project: {project_info.get('project_name')}")
|
|
||||||
print(f"URL: {project_info.get('project_url')}")
|
|
||||||
print(f"Total issues created historically: {total_issues_created}")
|
|
||||||
print("=" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
# GraphQL query to list all issues in the project
|
|
||||||
query = """
|
|
||||||
query($projectId: String!) {
|
|
||||||
project(id: $projectId) {
|
|
||||||
issues(first: 200) {
|
|
||||||
nodes {
|
|
||||||
id
|
|
||||||
identifier
|
|
||||||
title
|
|
||||||
priority
|
|
||||||
state {
|
|
||||||
name
|
|
||||||
}
|
|
||||||
createdAt
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"""
|
|
||||||
|
|
||||||
headers = {
|
|
||||||
"Authorization": LINEAR_API_KEY,
|
|
||||||
"Content-Type": "application/json"
|
|
||||||
}
|
|
||||||
|
|
||||||
response = requests.post(
|
|
||||||
"https://api.linear.app/graphql",
|
|
||||||
headers=headers,
|
|
||||||
json={"query": query, "variables": {"projectId": project_id}}
|
|
||||||
)
|
|
||||||
|
|
||||||
if response.status_code != 200:
|
|
||||||
print(f"❌ Linear API error: {response.status_code}")
|
|
||||||
print(response.text)
|
|
||||||
exit(1)
|
|
||||||
|
|
||||||
data = response.json()
|
|
||||||
|
|
||||||
if "errors" in data:
|
|
||||||
print(f"❌ GraphQL errors: {data['errors']}")
|
|
||||||
exit(1)
|
|
||||||
|
|
||||||
issues = data["data"]["project"]["issues"]["nodes"]
|
|
||||||
|
|
||||||
# Count by status
|
|
||||||
status_counts = {
|
|
||||||
"Todo": 0,
|
|
||||||
"In Progress": 0,
|
|
||||||
"Done": 0,
|
|
||||||
"Other": 0
|
|
||||||
}
|
|
||||||
|
|
||||||
issues_by_status = {
|
|
||||||
"Todo": [],
|
|
||||||
"In Progress": [],
|
|
||||||
"Done": []
|
|
||||||
}
|
|
||||||
|
|
||||||
for issue in issues:
|
|
||||||
state_name = issue["state"]["name"]
|
|
||||||
if state_name in status_counts:
|
|
||||||
status_counts[state_name] += 1
|
|
||||||
issues_by_status[state_name].append(issue)
|
|
||||||
else:
|
|
||||||
status_counts["Other"] += 1
|
|
||||||
|
|
||||||
# Display summary
|
|
||||||
print(f"STATUS SUMMARY:")
|
|
||||||
print(f" Done: {status_counts['Done']}")
|
|
||||||
print(f" In Progress: {status_counts['In Progress']}")
|
|
||||||
print(f" Todo: {status_counts['Todo']}")
|
|
||||||
print(f" Other: {status_counts['Other']}")
|
|
||||||
print(f" TOTAL: {len(issues)}")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Check for issues In Progress (potential blocker)
|
|
||||||
if status_counts["In Progress"] > 0:
|
|
||||||
print("WARNING: There are 'In Progress' issues:")
|
|
||||||
print()
|
|
||||||
for issue in issues_by_status["In Progress"]:
|
|
||||||
priority = issue.get("priority", "N/A")
|
|
||||||
print(f" [IN PROGRESS] {issue['identifier']} - Priority {priority}")
|
|
||||||
print(f" {issue['title']}")
|
|
||||||
print()
|
|
||||||
print("! The agent will resume these issues first!")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# List high-priority Todo issues
|
|
||||||
high_priority_todo = [
|
|
||||||
issue for issue in issues_by_status["Todo"]
|
|
||||||
if issue.get("priority") in [1, 2]
|
|
||||||
]
|
|
||||||
|
|
||||||
if high_priority_todo:
|
|
||||||
print(f"HIGH PRIORITY TODO (Priority 1-2): {len(high_priority_todo)}")
|
|
||||||
print()
|
|
||||||
for issue in sorted(high_priority_todo, key=lambda x: x.get("priority", 99)):
|
|
||||||
priority = issue.get("priority", "N/A")
|
|
||||||
print(f" [TODO] {issue['identifier']} - Priority {priority}")
|
|
||||||
print(f" {issue['title'][:80]}")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# List all Todo issues (for reference)
|
|
||||||
if status_counts["Todo"] > 0:
|
|
||||||
print(f"ALL TODO ISSUES: {status_counts['Todo']}")
|
|
||||||
print()
|
|
||||||
for issue in sorted(issues_by_status["Todo"], key=lambda x: x.get("priority", 99)):
|
|
||||||
priority = issue.get("priority", "N/A")
|
|
||||||
title = issue['title'][:60] + "..." if len(issue['title']) > 60 else issue['title']
|
|
||||||
print(f" {issue['identifier']} [P{priority}] {title}")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Recommendation
|
|
||||||
print("=" * 80)
|
|
||||||
if status_counts["In Progress"] > 0:
|
|
||||||
print("RECOMMENDATION:")
|
|
||||||
print(" - There are 'In Progress' issues that should be finished first")
|
|
||||||
print(" - Before adding new issues, check if these should be:")
|
|
||||||
print(" 1. Completed")
|
|
||||||
print(" 2. Cancelled (moved back to Todo)")
|
|
||||||
print(" 3. Deleted")
|
|
||||||
elif status_counts["Todo"] > 10:
|
|
||||||
print("RECOMMENDATION:")
|
|
||||||
print(f" - There are {status_counts['Todo']} Todo issues pending")
|
|
||||||
print(" - Consider finishing them before adding new ones")
|
|
||||||
else:
|
|
||||||
print("RECOMMENDATION:")
|
|
||||||
print(" - Project is in good state to add new issues")
|
|
||||||
print(" - You can proceed with --new-spec")
|
|
||||||
print("=" * 80)
|
|
||||||
@@ -1,82 +0,0 @@
|
|||||||
"""
|
|
||||||
Vérifier si le META issue existe toujours dans Linear.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import json
|
|
||||||
import requests
|
|
||||||
from pathlib import Path
|
|
||||||
from dotenv import load_dotenv
|
|
||||||
|
|
||||||
load_dotenv()
|
|
||||||
|
|
||||||
LINEAR_API_KEY = os.environ.get("LINEAR_API_KEY")
|
|
||||||
if not LINEAR_API_KEY:
|
|
||||||
print("ERROR: LINEAR_API_KEY not found")
|
|
||||||
exit(1)
|
|
||||||
|
|
||||||
# Read project info
|
|
||||||
project_file = Path("generations/library_rag/.linear_project.json")
|
|
||||||
with open(project_file) as f:
|
|
||||||
project_info = json.load(f)
|
|
||||||
|
|
||||||
meta_issue_id = project_info.get("meta_issue_id")
|
|
||||||
project_id = project_info.get("project_id")
|
|
||||||
|
|
||||||
print("=" * 80)
|
|
||||||
print("Checking META issue existence...")
|
|
||||||
print(f"META issue ID from .linear_project.json: {meta_issue_id}")
|
|
||||||
print("=" * 80)
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Try to fetch the META issue
|
|
||||||
query = """
|
|
||||||
query($issueId: String!) {
|
|
||||||
issue(id: $issueId) {
|
|
||||||
id
|
|
||||||
identifier
|
|
||||||
title
|
|
||||||
state {
|
|
||||||
name
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"""
|
|
||||||
|
|
||||||
headers = {
|
|
||||||
"Authorization": LINEAR_API_KEY,
|
|
||||||
"Content-Type": "application/json"
|
|
||||||
}
|
|
||||||
|
|
||||||
response = requests.post(
|
|
||||||
"https://api.linear.app/graphql",
|
|
||||||
headers=headers,
|
|
||||||
json={"query": query, "variables": {"issueId": meta_issue_id}}
|
|
||||||
)
|
|
||||||
|
|
||||||
if response.status_code != 200:
|
|
||||||
print(f"ERROR: Linear API error: {response.status_code}")
|
|
||||||
exit(1)
|
|
||||||
|
|
||||||
data = response.json()
|
|
||||||
|
|
||||||
if "errors" in data:
|
|
||||||
print("META ISSUE NOT FOUND (was deleted)")
|
|
||||||
print()
|
|
||||||
print("SOLUTION: Need to recreate META issue or reset .linear_project.json")
|
|
||||||
exit(1)
|
|
||||||
|
|
||||||
issue = data["data"]["issue"]
|
|
||||||
if issue is None:
|
|
||||||
print("META ISSUE NOT FOUND (was deleted)")
|
|
||||||
print()
|
|
||||||
print("SOLUTION: Need to recreate META issue or reset .linear_project.json")
|
|
||||||
exit(1)
|
|
||||||
|
|
||||||
print(f"META issue EXISTS:")
|
|
||||||
print(f" ID: {issue['id']}")
|
|
||||||
print(f" Identifier: {issue['identifier']}")
|
|
||||||
print(f" Title: {issue['title']}")
|
|
||||||
print(f" State: {issue['state']['name']}")
|
|
||||||
print()
|
|
||||||
print("OK - Can proceed with agent")
|
|
||||||
@@ -1,93 +0,0 @@
|
|||||||
"""
|
|
||||||
Dockerization helper for ikario_body
|
|
||||||
===================================
|
|
||||||
|
|
||||||
Ce script crée les fichiers Docker nécessaires pour exécuter l'application
|
|
||||||
`generations/ikario_body` (frontend + serveur + base SQLite) dans Docker,
|
|
||||||
SANS modifier aucun fichier existant.
|
|
||||||
|
|
||||||
Il génère un fichier de composition :
|
|
||||||
- docker-compose.ikario_body.yml (à la racine du repo)
|
|
||||||
|
|
||||||
Ce fichier utilise l'image officielle Node et monte le code existant
|
|
||||||
ainsi que la base SQLite dans les conteneurs (mode développement).
|
|
||||||
|
|
||||||
Utilisation :
|
|
||||||
1) Depuis la racine du repo :
|
|
||||||
python dockerize_ikario_body.py
|
|
||||||
2) Puis pour lancer l'appli dans Docker :
|
|
||||||
docker compose -f docker-compose.ikario_body.yml up
|
|
||||||
ou, selon votre installation :
|
|
||||||
docker-compose -f docker-compose.ikario_body.yml up
|
|
||||||
|
|
||||||
- Frontend accessible sur: http://localhost:3000
|
|
||||||
- API backend (server) sur : http://localhost:3001
|
|
||||||
"""
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
def generate_docker_compose(root: Path) -> None:
|
|
||||||
"""Génère le fichier docker-compose.ikario_body.yml sans toucher au code existant."""
|
|
||||||
project_dir = root / "generations" / "ikario_body"
|
|
||||||
|
|
||||||
if not project_dir.exists():
|
|
||||||
raise SystemExit(f"Project directory not found: {project_dir}")
|
|
||||||
|
|
||||||
compose_path = root / "docker-compose.ikario_body.yml"
|
|
||||||
|
|
||||||
# On utilise les scripts npm déjà définis :
|
|
||||||
# - frontend: npm run dev (Vite) en écoutant sur 0.0.0.0:3000 (dans le conteneur)
|
|
||||||
# - server: npm start dans ./server sur 3001 (dans le conteneur)
|
|
||||||
#
|
|
||||||
# Pour éviter les conflits de ports courants (3000/3001) sur la machine hôte,
|
|
||||||
# on mappe vers des ports plus élevés côté host :
|
|
||||||
# - frontend : host 4300 -> container 3000
|
|
||||||
# - backend : host 4301 -> container 3001
|
|
||||||
#
|
|
||||||
# Le volume ./generations/ikario_body est monté dans /app,
|
|
||||||
# ce qui inclut aussi la base SQLite dans server/data/claude-clone.db.
|
|
||||||
compose_content = f"""services:
|
|
||||||
ikario_body_frontend:
|
|
||||||
image: node:20
|
|
||||||
working_dir: /app
|
|
||||||
volumes:
|
|
||||||
- ./generations/ikario_body:/app
|
|
||||||
# Eviter de réutiliser les node_modules Windows dans le conteneur Linux
|
|
||||||
- /app/node_modules
|
|
||||||
command: ["sh", "-c", "npm install && npm run dev -- --host 0.0.0.0 --port 3000"]
|
|
||||||
ports:
|
|
||||||
- "4300:3000"
|
|
||||||
environment:
|
|
||||||
- NODE_ENV=development
|
|
||||||
|
|
||||||
ikario_body_server:
|
|
||||||
image: node:20
|
|
||||||
working_dir: /app/server
|
|
||||||
volumes:
|
|
||||||
- ./generations/ikario_body:/app
|
|
||||||
# Eviter de réutiliser les node_modules Windows dans le conteneur Linux
|
|
||||||
- /app/server/node_modules
|
|
||||||
command: ["sh", "-c", "npm install && npm start"]
|
|
||||||
ports:
|
|
||||||
- "4301:3001"
|
|
||||||
environment:
|
|
||||||
- NODE_ENV=development
|
|
||||||
depends_on:
|
|
||||||
- ikario_body_frontend
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
compose_path.write_text(compose_content, encoding="utf-8")
|
|
||||||
print(f"Created {compose_path.relative_to(root)}")
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
|
||||||
repo_root = Path(__file__).resolve().parent
|
|
||||||
generate_docker_compose(repo_root)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
|
|
||||||
|
|
||||||
@@ -451,7 +451,101 @@ filter_by_author(author="Platon")
|
|||||||
delete_document(source_id="platon-menon", confirm=true)
|
delete_document(source_id="platon-menon", confirm=true)
|
||||||
```
|
```
|
||||||
|
|
||||||
Pour plus de détails, voir la documentation complète dans `.claude/CLAUDE.md`.
|
### Outils MCP Memory (9 outils intégrés - Phase 4)
|
||||||
|
|
||||||
|
**Système de Mémoire Unifié** : Le serveur MCP intègre désormais 9 outils pour gérer un système de mémoire (Thoughts, Messages, Conversations) utilisant Weaviate + GPU embeddings. Ces outils permettent à Claude Desktop de créer, rechercher et gérer des pensées, messages et conversations de manière persistante.
|
||||||
|
|
||||||
|
**Architecture Memory** :
|
||||||
|
- **Backend** : Weaviate 1.34.4 (collections Thought, Message, Conversation)
|
||||||
|
- **Embeddings** : BAAI/bge-m3 GPU (1024-dim, RTX 4070, PyTorch 2.6.0+cu124)
|
||||||
|
- **Handlers** : `memory/mcp/` (thought_tools, message_tools, conversation_tools)
|
||||||
|
- **Données** : 102 Thoughts, 377 Messages, 12 Conversations (au 2025-01-08)
|
||||||
|
|
||||||
|
#### Thought Tools (3)
|
||||||
|
|
||||||
|
**1. add_thought** - Ajouter une pensée au système
|
||||||
|
```
|
||||||
|
add_thought(
|
||||||
|
content="Exploring vector databases for semantic search",
|
||||||
|
thought_type="observation", # reflection, question, intuition, observation
|
||||||
|
trigger="Research session",
|
||||||
|
concepts=["weaviate", "embeddings", "gpu"],
|
||||||
|
privacy_level="private" # private, shared, public
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**2. search_thoughts** - Recherche sémantique dans les pensées
|
||||||
|
```
|
||||||
|
search_thoughts(
|
||||||
|
query="vector databases GPU",
|
||||||
|
limit=10,
|
||||||
|
thought_type_filter="observation" # optionnel
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**3. get_thought** - Récupérer une pensée par UUID
|
||||||
|
```
|
||||||
|
get_thought(uuid="730c1a8e-b09f-4889-bbe9-4867d0ee7f1a")
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Message Tools (3)
|
||||||
|
|
||||||
|
**4. add_message** - Ajouter un message à une conversation
|
||||||
|
```
|
||||||
|
add_message(
|
||||||
|
content="Explain transformers in AI",
|
||||||
|
role="user", # user, assistant, system
|
||||||
|
conversation_id="chat_2025_01_08",
|
||||||
|
order_index=0
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**5. get_messages** - Récupérer tous les messages d'une conversation
|
||||||
|
```
|
||||||
|
get_messages(
|
||||||
|
conversation_id="chat_2025_01_08",
|
||||||
|
limit=50
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**6. search_messages** - Recherche sémantique dans les messages
|
||||||
|
```
|
||||||
|
search_messages(
|
||||||
|
query="transformers AI",
|
||||||
|
limit=10,
|
||||||
|
conversation_id_filter="chat_2025_01_08" # optionnel
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Conversation Tools (3)
|
||||||
|
|
||||||
|
**7. get_conversation** - Récupérer une conversation par ID
|
||||||
|
```
|
||||||
|
get_conversation(conversation_id="ikario_derniere_pensee")
|
||||||
|
```
|
||||||
|
|
||||||
|
**8. search_conversations** - Recherche sémantique dans les conversations
|
||||||
|
```
|
||||||
|
search_conversations(
|
||||||
|
query="philosophical discussion",
|
||||||
|
limit=10,
|
||||||
|
category_filter="philosophy" # optionnel
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**9. list_conversations** - Lister toutes les conversations
|
||||||
|
```
|
||||||
|
list_conversations(
|
||||||
|
limit=20,
|
||||||
|
category_filter="testing" # optionnel
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Tests** : Tous les outils Memory ont été testés avec succès (voir `test_memory_mcp_tools.py`)
|
||||||
|
|
||||||
|
**Documentation complète** : Voir `memory/README_MCP_TOOLS.md` pour l'architecture détaillée, les schémas de données et les exemples d'utilisation.
|
||||||
|
|
||||||
|
Pour plus de détails sur les outils Library RAG, voir la documentation complète dans `.claude/CLAUDE.md`.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
@@ -89,8 +89,23 @@ from utils.types import (
|
|||||||
SSEEvent,
|
SSEEvent,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# GPU Embedder for manual vectorization (Phase 5: Backend Integration)
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||||
|
from memory.core import get_embedder
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
# Initialize GPU embedder singleton
|
||||||
|
_embedder = None
|
||||||
|
|
||||||
|
def get_gpu_embedder():
|
||||||
|
"""Get or create GPU embedder singleton."""
|
||||||
|
global _embedder
|
||||||
|
if _embedder is None:
|
||||||
|
_embedder = get_embedder()
|
||||||
|
return _embedder
|
||||||
|
|
||||||
# Configuration Flask
|
# Configuration Flask
|
||||||
app.config["SECRET_KEY"] = os.environ.get("SECRET_KEY", "dev-secret-key-change-in-production")
|
app.config["SECRET_KEY"] = os.environ.get("SECRET_KEY", "dev-secret-key-change-in-production")
|
||||||
|
|
||||||
@@ -152,26 +167,25 @@ def get_collection_stats() -> Optional[CollectionStats]:
|
|||||||
stats: CollectionStats = {}
|
stats: CollectionStats = {}
|
||||||
|
|
||||||
# Chunk stats (renamed from Passage)
|
# Chunk stats (renamed from Passage)
|
||||||
passages = client.collections.get("Chunk")
|
passages = client.collections.get("Chunk_v2")
|
||||||
passage_count = passages.aggregate.over_all(total_count=True)
|
passage_count = passages.aggregate.over_all(total_count=True)
|
||||||
stats["passages"] = passage_count.total_count or 0
|
stats["passages"] = passage_count.total_count or 0
|
||||||
|
|
||||||
# Get unique authors and works (from nested objects)
|
# Get unique authors and works (from direct properties in v2)
|
||||||
all_passages = passages.query.fetch_objects(limit=1000)
|
all_passages = passages.query.fetch_objects(limit=10000)
|
||||||
authors: set[str] = set()
|
authors: set[str] = set()
|
||||||
works: set[str] = set()
|
works: set[str] = set()
|
||||||
languages: set[str] = set()
|
languages: set[str] = set()
|
||||||
|
|
||||||
for obj in all_passages.objects:
|
for obj in all_passages.objects:
|
||||||
# Work is now a nested object with {title, author}
|
props = obj.properties
|
||||||
work_obj = obj.properties.get("work")
|
# In v2: workAuthor and workTitle are direct properties
|
||||||
if work_obj and isinstance(work_obj, dict):
|
if props.get("workAuthor"):
|
||||||
if work_obj.get("author"):
|
authors.add(str(props["workAuthor"]))
|
||||||
authors.add(str(work_obj["author"]))
|
if props.get("workTitle"):
|
||||||
if work_obj.get("title"):
|
works.add(str(props["workTitle"]))
|
||||||
works.add(str(work_obj["title"]))
|
if props.get("language"):
|
||||||
if obj.properties.get("language"):
|
languages.add(str(props["language"]))
|
||||||
languages.add(str(obj.properties["language"]))
|
|
||||||
|
|
||||||
stats["authors"] = len(authors)
|
stats["authors"] = len(authors)
|
||||||
stats["works"] = len(works)
|
stats["works"] = len(works)
|
||||||
@@ -208,13 +222,13 @@ def get_all_passages(
|
|||||||
if client is None:
|
if client is None:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
chunks = client.collections.get("Chunk")
|
chunks = client.collections.get("Chunk_v2")
|
||||||
|
|
||||||
result = chunks.query.fetch_objects(
|
result = chunks.query.fetch_objects(
|
||||||
limit=limit,
|
limit=limit,
|
||||||
offset=offset,
|
offset=offset,
|
||||||
return_properties=[
|
return_properties=[
|
||||||
"text", "sectionPath", "sectionLevel", "chapterTitle",
|
"text", "sectionPath", "chapterTitle",
|
||||||
"canonicalReference", "unitType", "keywords", "orderIndex", "language"
|
"canonicalReference", "unitType", "keywords", "orderIndex", "language"
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@@ -253,7 +267,7 @@ def simple_search(
|
|||||||
if client is None:
|
if client is None:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
chunks = client.collections.get("Chunk")
|
chunks = client.collections.get("Chunk_v2")
|
||||||
|
|
||||||
# Build filters using top-level properties (workAuthor, workTitle)
|
# Build filters using top-level properties (workAuthor, workTitle)
|
||||||
filters: Optional[Any] = None
|
filters: Optional[Any] = None
|
||||||
@@ -263,13 +277,17 @@ def simple_search(
|
|||||||
work_filter_obj = wvq.Filter.by_property("workTitle").equal(work_filter)
|
work_filter_obj = wvq.Filter.by_property("workTitle").equal(work_filter)
|
||||||
filters = filters & work_filter_obj if filters else work_filter_obj
|
filters = filters & work_filter_obj if filters else work_filter_obj
|
||||||
|
|
||||||
result = chunks.query.near_text(
|
# Generate query vector with GPU embedder (Phase 5: manual vectorization)
|
||||||
query=query,
|
embedder = get_gpu_embedder()
|
||||||
|
query_vector = embedder.embed_single(query)
|
||||||
|
|
||||||
|
result = chunks.query.near_vector(
|
||||||
|
near_vector=query_vector.tolist(),
|
||||||
limit=limit,
|
limit=limit,
|
||||||
filters=filters,
|
filters=filters,
|
||||||
return_metadata=wvq.MetadataQuery(distance=True),
|
return_metadata=wvq.MetadataQuery(distance=True),
|
||||||
return_properties=[
|
return_properties=[
|
||||||
"text", "sectionPath", "sectionLevel", "chapterTitle",
|
"text", "sectionPath", "chapterTitle",
|
||||||
"canonicalReference", "unitType", "keywords", "orderIndex", "language"
|
"canonicalReference", "unitType", "keywords", "orderIndex", "language"
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@@ -333,10 +351,14 @@ def hierarchical_search(
|
|||||||
# STAGE 1: Search Summary collection for relevant sections
|
# STAGE 1: Search Summary collection for relevant sections
|
||||||
# ═══════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
summary_collection = client.collections.get("Summary")
|
summary_collection = client.collections.get("Summary_v2")
|
||||||
|
|
||||||
summaries_result = summary_collection.query.near_text(
|
# Generate query vector with GPU embedder (Phase 5: manual vectorization)
|
||||||
query=query,
|
embedder = get_gpu_embedder()
|
||||||
|
query_vector = embedder.embed_single(query)
|
||||||
|
|
||||||
|
summaries_result = summary_collection.query.near_vector(
|
||||||
|
near_vector=query_vector.tolist(),
|
||||||
limit=sections_limit,
|
limit=sections_limit,
|
||||||
return_metadata=wvq.MetadataQuery(distance=True),
|
return_metadata=wvq.MetadataQuery(distance=True),
|
||||||
# Note: Don't specify return_properties - let Weaviate return all properties
|
# Note: Don't specify return_properties - let Weaviate return all properties
|
||||||
@@ -358,49 +380,52 @@ def hierarchical_search(
|
|||||||
for summary_obj in summaries_result.objects:
|
for summary_obj in summaries_result.objects:
|
||||||
props = summary_obj.properties
|
props = summary_obj.properties
|
||||||
|
|
||||||
# Try to get document.sourceId if available (nested object might still be returned)
|
# In v2: Summary has workTitle property, need to get sourceId from Work
|
||||||
doc_obj = props.get("document")
|
work_title = props.get("workTitle", "")
|
||||||
source_id = ""
|
|
||||||
if doc_obj and isinstance(doc_obj, dict):
|
|
||||||
source_id = doc_obj.get("sourceId", "")
|
|
||||||
|
|
||||||
|
# We'll get sourceId later by matching workTitle with Work.sourceId
|
||||||
|
# For now, use workTitle as identifier
|
||||||
sections_data.append({
|
sections_data.append({
|
||||||
"section_path": props.get("sectionPath", ""),
|
"section_path": props.get("sectionPath", ""),
|
||||||
"title": props.get("title", ""),
|
"title": props.get("title", ""),
|
||||||
"summary_text": props.get("text", ""),
|
"summary_text": props.get("text", ""),
|
||||||
"level": props.get("level", 1),
|
"level": props.get("level", 1),
|
||||||
"concepts": props.get("concepts", []),
|
"concepts": props.get("concepts", []),
|
||||||
"document_source_id": source_id,
|
"document_source_id": "", # Will be populated during filtering
|
||||||
"summary_uuid": str(summary_obj.uuid), # Keep UUID for later retrieval if needed
|
"work_title": work_title, # Add workTitle for filtering
|
||||||
|
"summary_uuid": str(summary_obj.uuid),
|
||||||
"similarity": round((1 - summary_obj.metadata.distance) * 100, 1) if summary_obj.metadata and summary_obj.metadata.distance else 0,
|
"similarity": round((1 - summary_obj.metadata.distance) * 100, 1) if summary_obj.metadata and summary_obj.metadata.distance else 0,
|
||||||
})
|
})
|
||||||
|
|
||||||
# Post-filter sections by author/work (Summary doesn't have work nested object)
|
# Post-filter sections by author/work (Summary_v2 has workTitle property)
|
||||||
if author_filter or work_filter:
|
if author_filter or work_filter:
|
||||||
print(f"[HIERARCHICAL] Post-filtering {len(sections_data)} sections by work='{work_filter}'")
|
print(f"[HIERARCHICAL] Post-filtering {len(sections_data)} sections by work='{work_filter}'")
|
||||||
doc_collection = client.collections.get("Document")
|
|
||||||
filtered_sections = []
|
|
||||||
|
|
||||||
|
# Build Work title -> author map for filtering
|
||||||
|
work_collection = client.collections.get("Work")
|
||||||
|
work_map = {}
|
||||||
|
for work in work_collection.iterator(include_vector=False):
|
||||||
|
props = work.properties
|
||||||
|
title = props.get("title")
|
||||||
|
if title:
|
||||||
|
work_map[title] = {
|
||||||
|
"author": props.get("author", "Unknown"),
|
||||||
|
"sourceId": props.get("sourceId", "")
|
||||||
|
}
|
||||||
|
|
||||||
|
filtered_sections = []
|
||||||
for section in sections_data:
|
for section in sections_data:
|
||||||
source_id = section["document_source_id"]
|
work_title = section.get("work_title", "")
|
||||||
if not source_id:
|
|
||||||
print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (no sourceId)")
|
if not work_title or work_title not in work_map:
|
||||||
|
print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (no work mapping)")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Query Document to get work metadata
|
work_author = work_map[work_title]["author"]
|
||||||
# Note: 'work' is a nested object, so we don't specify it in return_properties
|
section["document_source_id"] = work_map[work_title]["sourceId"] # Populate sourceId
|
||||||
# Weaviate should return it automatically
|
|
||||||
doc_result = doc_collection.query.fetch_objects(
|
print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' work={work_title}, author={work_author}")
|
||||||
filters=wvq.Filter.by_property("sourceId").equal(source_id),
|
|
||||||
limit=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
if doc_result.objects:
|
|
||||||
doc_work = doc_result.objects[0].properties.get("work", {})
|
|
||||||
print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' doc_work type={type(doc_work)}, value={doc_work}")
|
|
||||||
if isinstance(doc_work, dict):
|
|
||||||
work_title = doc_work.get("title", "N/A")
|
|
||||||
work_author = doc_work.get("author", "N/A")
|
|
||||||
# Check filters
|
# Check filters
|
||||||
if author_filter and work_author != author_filter:
|
if author_filter and work_author != author_filter:
|
||||||
print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' FILTERED (author '{work_author}' != '{author_filter}')")
|
print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' FILTERED (author '{work_author}' != '{author_filter}')")
|
||||||
@@ -411,10 +436,6 @@ def hierarchical_search(
|
|||||||
|
|
||||||
print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' KEPT (work='{work_title}')")
|
print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' KEPT (work='{work_title}')")
|
||||||
filtered_sections.append(section)
|
filtered_sections.append(section)
|
||||||
else:
|
|
||||||
print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (doc_work not a dict)")
|
|
||||||
else:
|
|
||||||
print(f"[HIERARCHICAL] Section '{section['section_path'][:40]}...' SKIPPED (no doc found for sourceId='{source_id}')")
|
|
||||||
|
|
||||||
sections_data = filtered_sections
|
sections_data = filtered_sections
|
||||||
print(f"[HIERARCHICAL] After filtering: {len(sections_data)} sections remaining")
|
print(f"[HIERARCHICAL] After filtering: {len(sections_data)} sections remaining")
|
||||||
@@ -438,7 +459,7 @@ def hierarchical_search(
|
|||||||
# For each section, search chunks using the section's summary text
|
# For each section, search chunks using the section's summary text
|
||||||
# This groups chunks under their relevant sections
|
# This groups chunks under their relevant sections
|
||||||
|
|
||||||
chunk_collection = client.collections.get("Chunk")
|
chunk_collection = client.collections.get("Chunk_v2")
|
||||||
|
|
||||||
# Build base filters (author/work only)
|
# Build base filters (author/work only)
|
||||||
base_filters: Optional[Any] = None
|
base_filters: Optional[Any] = None
|
||||||
@@ -464,8 +485,11 @@ def hierarchical_search(
|
|||||||
if base_filters:
|
if base_filters:
|
||||||
section_filters = base_filters & section_filters
|
section_filters = base_filters & section_filters
|
||||||
|
|
||||||
chunks_result = chunk_collection.query.near_text(
|
# Generate query vector with GPU embedder (Phase 5: manual vectorization)
|
||||||
query=section_query,
|
section_query_vector = embedder.embed_single(section_query)
|
||||||
|
|
||||||
|
chunks_result = chunk_collection.query.near_vector(
|
||||||
|
near_vector=section_query_vector.tolist(),
|
||||||
limit=chunks_per_section,
|
limit=chunks_per_section,
|
||||||
filters=section_filters,
|
filters=section_filters,
|
||||||
return_metadata=wvq.MetadataQuery(distance=True),
|
return_metadata=wvq.MetadataQuery(distance=True),
|
||||||
@@ -600,14 +624,28 @@ def summary_only_search(
|
|||||||
if client is None:
|
if client is None:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
summaries = client.collections.get("Summary")
|
summaries = client.collections.get("Summary_v2")
|
||||||
|
|
||||||
# Note: Cannot filter by nested document properties directly in Weaviate v4
|
# Build Work map for metadata lookup (Summary_v2 has workTitle, not document)
|
||||||
# Must fetch all and filter in Python if author/work filters are present
|
work_collection = client.collections.get("Work")
|
||||||
|
work_map = {}
|
||||||
|
for work in work_collection.iterator(include_vector=False):
|
||||||
|
work_props = work.properties
|
||||||
|
title = work_props.get("title")
|
||||||
|
if title:
|
||||||
|
work_map[title] = {
|
||||||
|
"author": work_props.get("author", "Unknown"),
|
||||||
|
"year": work_props.get("year", 0),
|
||||||
|
"sourceId": work_props.get("sourceId", ""),
|
||||||
|
}
|
||||||
|
|
||||||
|
# Generate query vector with GPU embedder (Phase 5: manual vectorization)
|
||||||
|
embedder = get_gpu_embedder()
|
||||||
|
query_vector = embedder.embed_single(query)
|
||||||
|
|
||||||
# Semantic search
|
# Semantic search
|
||||||
results = summaries.query.near_text(
|
results = summaries.query.near_vector(
|
||||||
query=query,
|
near_vector=query_vector.tolist(),
|
||||||
limit=limit * 3 if (author_filter or work_filter) else limit, # Fetch more if filtering
|
limit=limit * 3 if (author_filter or work_filter) else limit, # Fetch more if filtering
|
||||||
return_metadata=wvq.MetadataQuery(distance=True)
|
return_metadata=wvq.MetadataQuery(distance=True)
|
||||||
)
|
)
|
||||||
@@ -618,24 +656,34 @@ def summary_only_search(
|
|||||||
props = obj.properties
|
props = obj.properties
|
||||||
similarity = 1 - obj.metadata.distance
|
similarity = 1 - obj.metadata.distance
|
||||||
|
|
||||||
# Apply filters (Python-side since nested properties)
|
# Get work metadata from workTitle
|
||||||
if author_filter and props["document"].get("author", "") != author_filter:
|
work_title = props.get("workTitle", "")
|
||||||
|
if not work_title or work_title not in work_map:
|
||||||
continue
|
continue
|
||||||
if work_filter and props["document"].get("title", "") != work_filter:
|
|
||||||
|
work_info = work_map[work_title]
|
||||||
|
work_author = work_info["author"]
|
||||||
|
work_year = work_info["year"]
|
||||||
|
source_id = work_info["sourceId"]
|
||||||
|
|
||||||
|
# Apply filters
|
||||||
|
if author_filter and work_author != author_filter:
|
||||||
|
continue
|
||||||
|
if work_filter and work_title != work_filter:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Determine document icon and name
|
# Determine document icon and name
|
||||||
doc_id = props["document"]["sourceId"].lower()
|
doc_id_lower = source_id.lower()
|
||||||
if "tiercelin" in doc_id:
|
if "tiercelin" in doc_id_lower:
|
||||||
doc_icon = "🟡"
|
doc_icon = "🟡"
|
||||||
doc_name = "Tiercelin"
|
doc_name = "Tiercelin"
|
||||||
elif "platon" in doc_id or "menon" in doc_id:
|
elif "platon" in doc_id_lower or "menon" in doc_id_lower:
|
||||||
doc_icon = "🟢"
|
doc_icon = "🟢"
|
||||||
doc_name = "Platon"
|
doc_name = "Platon"
|
||||||
elif "haugeland" in doc_id:
|
elif "haugeland" in doc_id_lower:
|
||||||
doc_icon = "🟣"
|
doc_icon = "🟣"
|
||||||
doc_name = "Haugeland"
|
doc_name = "Haugeland"
|
||||||
elif "logique" in doc_id:
|
elif "logique" in doc_id_lower:
|
||||||
doc_icon = "🔵"
|
doc_icon = "🔵"
|
||||||
doc_name = "Logique"
|
doc_name = "Logique"
|
||||||
else:
|
else:
|
||||||
@@ -647,19 +695,19 @@ def summary_only_search(
|
|||||||
"uuid": str(obj.uuid),
|
"uuid": str(obj.uuid),
|
||||||
"similarity": round(similarity * 100, 1), # Convert to percentage
|
"similarity": round(similarity * 100, 1), # Convert to percentage
|
||||||
"text": props.get("text", ""),
|
"text": props.get("text", ""),
|
||||||
"title": props["title"],
|
"title": props.get("title", ""),
|
||||||
"concepts": props.get("concepts", []),
|
"concepts": props.get("concepts", []),
|
||||||
"doc_icon": doc_icon,
|
"doc_icon": doc_icon,
|
||||||
"doc_name": doc_name,
|
"doc_name": doc_name,
|
||||||
"author": props["document"].get("author", ""),
|
"author": work_author,
|
||||||
"year": props["document"].get("year", 0),
|
"year": work_year,
|
||||||
"chunks_count": props.get("chunksCount", 0),
|
"chunks_count": props.get("chunksCount", 0),
|
||||||
"section_path": props.get("sectionPath", ""),
|
"section_path": props.get("sectionPath", ""),
|
||||||
"sectionPath": props.get("sectionPath", ""), # Alias for template compatibility
|
"sectionPath": props.get("sectionPath", ""), # Alias for template compatibility
|
||||||
# Add work info for template compatibility
|
# Add work info for template compatibility
|
||||||
"work": {
|
"work": {
|
||||||
"title": props["document"].get("title", ""),
|
"title": work_title,
|
||||||
"author": props["document"].get("author", ""),
|
"author": work_author,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -969,7 +1017,7 @@ def rag_search(
|
|||||||
print("[RAG Search] Weaviate client unavailable")
|
print("[RAG Search] Weaviate client unavailable")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
chunks = client.collections.get("Chunk")
|
chunks = client.collections.get("Chunk_v2")
|
||||||
|
|
||||||
# Build work filter if selected_works is provided
|
# Build work filter if selected_works is provided
|
||||||
work_filter: Optional[Any] = None
|
work_filter: Optional[Any] = None
|
||||||
@@ -978,9 +1026,13 @@ def rag_search(
|
|||||||
work_filter = wvq.Filter.by_property("workTitle").contains_any(selected_works)
|
work_filter = wvq.Filter.by_property("workTitle").contains_any(selected_works)
|
||||||
print(f"[RAG Search] Applying work filter: {selected_works}")
|
print(f"[RAG Search] Applying work filter: {selected_works}")
|
||||||
|
|
||||||
|
# Generate query vector with GPU embedder (Phase 5: manual vectorization)
|
||||||
|
embedder = get_gpu_embedder()
|
||||||
|
query_vector = embedder.embed_single(query)
|
||||||
|
|
||||||
# Query with properties needed for RAG context
|
# Query with properties needed for RAG context
|
||||||
result = chunks.query.near_text(
|
result = chunks.query.near_vector(
|
||||||
query=query,
|
near_vector=query_vector.tolist(),
|
||||||
limit=limit,
|
limit=limit,
|
||||||
filters=work_filter,
|
filters=work_filter,
|
||||||
return_metadata=wvq.MetadataQuery(distance=True),
|
return_metadata=wvq.MetadataQuery(distance=True),
|
||||||
@@ -1444,23 +1496,20 @@ def api_get_works() -> Union[Response, tuple[Response, int]]:
|
|||||||
"message": "Cannot connect to Weaviate database"
|
"message": "Cannot connect to Weaviate database"
|
||||||
}), 500
|
}), 500
|
||||||
|
|
||||||
# Query Chunk collection to get all unique works with counts
|
# Query Chunk_v2 collection to get all unique works with counts
|
||||||
chunks = client.collections.get("Chunk")
|
chunks = client.collections.get("Chunk_v2")
|
||||||
|
|
||||||
# Fetch all chunks to aggregate by work
|
# Fetch all chunks to aggregate by work
|
||||||
# Using a larger limit to get all documents
|
# In v2: work is NOT a nested object, use workTitle and workAuthor properties
|
||||||
# Note: Don't use return_properties with nested objects (causes gRPC error)
|
|
||||||
# Fetch all objects without specifying properties
|
|
||||||
all_chunks = chunks.query.fetch_objects(limit=10000)
|
all_chunks = chunks.query.fetch_objects(limit=10000)
|
||||||
|
|
||||||
# Aggregate chunks by work (title + author)
|
# Aggregate chunks by work (title + author)
|
||||||
works_count: Dict[str, Dict[str, Any]] = {}
|
works_count: Dict[str, Dict[str, Any]] = {}
|
||||||
|
|
||||||
for obj in all_chunks.objects:
|
for obj in all_chunks.objects:
|
||||||
work_obj = obj.properties.get("work")
|
props = obj.properties
|
||||||
if work_obj and isinstance(work_obj, dict):
|
title = props.get("workTitle", "")
|
||||||
title = work_obj.get("title", "")
|
author = props.get("workAuthor", "")
|
||||||
author = work_obj.get("author", "")
|
|
||||||
|
|
||||||
if title: # Only count if title exists
|
if title: # Only count if title exists
|
||||||
# Use title as key (assumes unique titles)
|
# Use title as key (assumes unique titles)
|
||||||
@@ -3082,45 +3131,60 @@ def documents() -> str:
|
|||||||
|
|
||||||
with get_weaviate_client() as client:
|
with get_weaviate_client() as client:
|
||||||
if client is not None:
|
if client is not None:
|
||||||
# Get chunk counts and authors
|
|
||||||
chunk_collection = client.collections.get("Chunk")
|
|
||||||
|
|
||||||
for obj in chunk_collection.iterator(include_vector=False):
|
|
||||||
props = obj.properties
|
|
||||||
from typing import cast
|
from typing import cast
|
||||||
doc_obj = cast(Dict[str, Any], props.get("document", {}))
|
|
||||||
work_obj = cast(Dict[str, Any], props.get("work", {}))
|
|
||||||
|
|
||||||
if doc_obj:
|
# Get all Works (now with sourceId added in Phase 1 of migration)
|
||||||
source_id = doc_obj.get("sourceId", "")
|
try:
|
||||||
if source_id:
|
work_collection = client.collections.get("Work")
|
||||||
if source_id not in documents_from_weaviate:
|
chunk_collection = client.collections.get("Chunk_v2")
|
||||||
|
|
||||||
|
# Build documents from Work collection
|
||||||
|
for work in work_collection.iterator(include_vector=False):
|
||||||
|
props = work.properties
|
||||||
|
source_id = props.get("sourceId")
|
||||||
|
|
||||||
|
# Skip Works without sourceId (not documents)
|
||||||
|
if not source_id:
|
||||||
|
continue
|
||||||
|
|
||||||
documents_from_weaviate[source_id] = {
|
documents_from_weaviate[source_id] = {
|
||||||
"source_id": source_id,
|
"source_id": source_id,
|
||||||
"title": work_obj.get("title") if work_obj else "Unknown",
|
"title": props.get("title", "Unknown"),
|
||||||
"author": work_obj.get("author") if work_obj else "Unknown",
|
"author": props.get("author", "Unknown"),
|
||||||
|
"pages": props.get("pages", 0),
|
||||||
|
"edition": props.get("edition", ""),
|
||||||
"chunks_count": 0,
|
"chunks_count": 0,
|
||||||
"summaries_count": 0,
|
"summaries_count": 0,
|
||||||
"authors": set(),
|
"authors": set(),
|
||||||
}
|
}
|
||||||
documents_from_weaviate[source_id]["chunks_count"] += 1
|
|
||||||
|
|
||||||
# Track unique authors
|
# Add author to set
|
||||||
author = work_obj.get("author") if work_obj else None
|
if props.get("author") and props.get("author") != "Unknown":
|
||||||
if author:
|
documents_from_weaviate[source_id]["authors"].add(props.get("author"))
|
||||||
documents_from_weaviate[source_id]["authors"].add(author)
|
|
||||||
|
|
||||||
# Get summary counts
|
# Count chunks per document (via workTitle)
|
||||||
|
for chunk in chunk_collection.iterator(include_vector=False):
|
||||||
|
work_title = chunk.properties.get("workTitle")
|
||||||
|
|
||||||
|
# Find corresponding sourceId
|
||||||
|
for source_id, doc_data in documents_from_weaviate.items():
|
||||||
|
if doc_data["title"] == work_title:
|
||||||
|
doc_data["chunks_count"] += 1
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Warning: Could not load Work collection: {e}")
|
||||||
|
|
||||||
|
# Count summaries (if collection exists)
|
||||||
try:
|
try:
|
||||||
summary_collection = client.collections.get("Summary")
|
summary_collection = client.collections.get("Summary_v2")
|
||||||
for obj in summary_collection.iterator(include_vector=False):
|
for summary in summary_collection.iterator(include_vector=False):
|
||||||
props = obj.properties
|
work_title = summary.properties.get("workTitle")
|
||||||
doc_obj = cast(Dict[str, Any], props.get("document", {}))
|
|
||||||
|
|
||||||
if doc_obj:
|
# Find corresponding sourceId
|
||||||
source_id = doc_obj.get("sourceId", "")
|
for source_id, doc_data in documents_from_weaviate.items():
|
||||||
if source_id and source_id in documents_from_weaviate:
|
if doc_data["title"] == work_title:
|
||||||
documents_from_weaviate[source_id]["summaries_count"] += 1
|
doc_data["summaries_count"] += 1
|
||||||
|
break
|
||||||
except Exception:
|
except Exception:
|
||||||
# Summary collection may not exist
|
# Summary collection may not exist
|
||||||
pass
|
pass
|
||||||
@@ -3157,17 +3221,195 @@ def documents() -> str:
|
|||||||
"has_images": images_dir.exists() and any(images_dir.iterdir()) if images_dir.exists() else False,
|
"has_images": images_dir.exists() and any(images_dir.iterdir()) if images_dir.exists() else False,
|
||||||
"image_count": len(list(images_dir.glob("*.png"))) if images_dir.exists() else 0,
|
"image_count": len(list(images_dir.glob("*.png"))) if images_dir.exists() else 0,
|
||||||
"metadata": metadata,
|
"metadata": metadata,
|
||||||
|
"pages": weaviate_data.get("pages", pages), # FROM WEAVIATE, fallback to file
|
||||||
"summaries_count": weaviate_data["summaries_count"], # FROM WEAVIATE
|
"summaries_count": weaviate_data["summaries_count"], # FROM WEAVIATE
|
||||||
"authors_count": len(weaviate_data["authors"]), # FROM WEAVIATE
|
"authors_count": len(weaviate_data["authors"]), # FROM WEAVIATE
|
||||||
"chunks_count": weaviate_data["chunks_count"], # FROM WEAVIATE
|
"chunks_count": weaviate_data["chunks_count"], # FROM WEAVIATE
|
||||||
"title": weaviate_data["title"], # FROM WEAVIATE
|
"title": weaviate_data["title"], # FROM WEAVIATE
|
||||||
"author": weaviate_data["author"], # FROM WEAVIATE
|
"author": weaviate_data["author"], # FROM WEAVIATE
|
||||||
|
"edition": weaviate_data.get("edition", ""), # FROM WEAVIATE
|
||||||
"toc": toc,
|
"toc": toc,
|
||||||
})
|
})
|
||||||
|
|
||||||
return render_template("documents.html", documents=documents_list)
|
return render_template("documents.html", documents=documents_list)
|
||||||
|
|
||||||
|
|
||||||
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
|
# Memory Routes (Phase 5: Backend Integration)
|
||||||
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
def run_async(coro):
|
||||||
|
"""Run async coroutine in sync Flask context."""
|
||||||
|
import asyncio
|
||||||
|
loop = asyncio.new_event_loop()
|
||||||
|
asyncio.set_event_loop(loop)
|
||||||
|
try:
|
||||||
|
return loop.run_until_complete(coro)
|
||||||
|
finally:
|
||||||
|
loop.close()
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/memories")
|
||||||
|
def memories() -> str:
|
||||||
|
"""Render the Memory search page (Thoughts + Messages)."""
|
||||||
|
# Get memory statistics
|
||||||
|
with get_weaviate_client() as client:
|
||||||
|
if client is None:
|
||||||
|
flash("Cannot connect to Weaviate database", "error")
|
||||||
|
stats = {"thoughts": 0, "messages": 0, "conversations": 0}
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
thoughts = client.collections.get("Thought")
|
||||||
|
messages = client.collections.get("Message")
|
||||||
|
conversations = client.collections.get("Conversation")
|
||||||
|
|
||||||
|
thoughts_count = thoughts.aggregate.over_all(total_count=True).total_count
|
||||||
|
messages_count = messages.aggregate.over_all(total_count=True).total_count
|
||||||
|
conversations_count = conversations.aggregate.over_all(total_count=True).total_count
|
||||||
|
|
||||||
|
stats = {
|
||||||
|
"thoughts": thoughts_count or 0,
|
||||||
|
"messages": messages_count or 0,
|
||||||
|
"conversations": conversations_count or 0,
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error fetching memory stats: {e}")
|
||||||
|
stats = {"thoughts": 0, "messages": 0, "conversations": 0}
|
||||||
|
|
||||||
|
return render_template("memories.html", stats=stats)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/memories/search-thoughts", methods=["POST"])
|
||||||
|
def api_search_thoughts():
|
||||||
|
"""API endpoint for thought semantic search."""
|
||||||
|
try:
|
||||||
|
# Import Memory MCP tools locally
|
||||||
|
from memory.mcp import SearchThoughtsInput, search_thoughts_handler
|
||||||
|
|
||||||
|
data = request.json
|
||||||
|
query = data.get("query", "")
|
||||||
|
limit = data.get("limit", 10)
|
||||||
|
thought_type_filter = data.get("thought_type_filter")
|
||||||
|
|
||||||
|
input_data = SearchThoughtsInput(
|
||||||
|
query=query,
|
||||||
|
limit=limit,
|
||||||
|
thought_type_filter=thought_type_filter
|
||||||
|
)
|
||||||
|
|
||||||
|
result = run_async(search_thoughts_handler(input_data))
|
||||||
|
return jsonify(result)
|
||||||
|
except Exception as e:
|
||||||
|
return jsonify({"success": False, "error": str(e)}), 500
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/memories/search-messages", methods=["POST"])
|
||||||
|
def api_search_messages():
|
||||||
|
"""API endpoint for message semantic search."""
|
||||||
|
try:
|
||||||
|
from memory.mcp import SearchMessagesInput, search_messages_handler
|
||||||
|
|
||||||
|
data = request.json
|
||||||
|
query = data.get("query", "")
|
||||||
|
limit = data.get("limit", 10)
|
||||||
|
conversation_id_filter = data.get("conversation_id_filter")
|
||||||
|
|
||||||
|
input_data = SearchMessagesInput(
|
||||||
|
query=query,
|
||||||
|
limit=limit,
|
||||||
|
conversation_id_filter=conversation_id_filter
|
||||||
|
)
|
||||||
|
|
||||||
|
result = run_async(search_messages_handler(input_data))
|
||||||
|
return jsonify(result)
|
||||||
|
except Exception as e:
|
||||||
|
return jsonify({"success": False, "error": str(e)}), 500
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/conversations")
|
||||||
|
def conversations() -> str:
|
||||||
|
"""Render the Conversations page."""
|
||||||
|
try:
|
||||||
|
from memory.mcp import ListConversationsInput, list_conversations_handler
|
||||||
|
|
||||||
|
limit = request.args.get("limit", 20, type=int)
|
||||||
|
category_filter = request.args.get("category")
|
||||||
|
|
||||||
|
input_data = ListConversationsInput(
|
||||||
|
limit=limit,
|
||||||
|
category_filter=category_filter
|
||||||
|
)
|
||||||
|
|
||||||
|
result = run_async(list_conversations_handler(input_data))
|
||||||
|
|
||||||
|
if result.get("success"):
|
||||||
|
conversations_list = result.get("conversations", [])
|
||||||
|
else:
|
||||||
|
flash(f"Error loading conversations: {result.get('error')}", "error")
|
||||||
|
conversations_list = []
|
||||||
|
|
||||||
|
return render_template("conversations.html", conversations=conversations_list)
|
||||||
|
except Exception as e:
|
||||||
|
flash(f"Error loading conversations: {str(e)}", "error")
|
||||||
|
return render_template("conversations.html", conversations=[])
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/conversation/<conversation_id>")
def conversation_view(conversation_id: str) -> str:
    """View a specific conversation with all its messages.

    Args:
        conversation_id: Identifier of the conversation to display.

    Redirects back to the conversation list (with a flashed error) when the
    conversation cannot be loaded.
    """
    try:
        from memory.mcp import (
            GetConversationInput, get_conversation_handler,
            GetMessagesInput, get_messages_handler
        )

        # Fetch conversation metadata first; bail out to the list page if absent.
        conversation = run_async(
            get_conversation_handler(GetConversationInput(conversation_id=conversation_id))
        )
        if not conversation.get("success"):
            flash(f"Conversation not found: {conversation.get('error')}", "error")
            return redirect(url_for("conversations"))

        # Pull up to 500 messages for the conversation body.
        fetched = run_async(
            get_messages_handler(GetMessagesInput(conversation_id=conversation_id, limit=500))
        )
        message_list = fetched.get("messages", []) if fetched.get("success") else []

        return render_template(
            "conversation_view.html",
            conversation=conversation,
            messages=message_list,
        )
    except Exception as e:
        flash(f"Error loading conversation: {str(e)}", "error")
        return redirect(url_for("conversations"))
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/conversations/search", methods=["POST"])
def api_search_conversations():
    """API endpoint for conversation semantic search.

    Expects a JSON body with:
        query (str): Search query text (default "").
        limit (int): Maximum number of results (default 10).
        category_filter (str, optional): Restrict results to one category.

    Returns:
        JSON result from ``search_conversations_handler``, or a
        ``{"success": False, "error": ...}`` payload with HTTP 500 on failure.
    """
    try:
        from memory.mcp import SearchConversationsInput, search_conversations_handler

        # request.json is None (or raises) when the body is missing or not JSON;
        # get_json(silent=True) avoids an AttributeError masquerading as a 500.
        data = request.get_json(silent=True) or {}

        input_data = SearchConversationsInput(
            query=data.get("query", ""),
            limit=data.get("limit", 10),
            category_filter=data.get("category_filter"),
        )

        result = run_async(search_conversations_handler(input_data))
        return jsonify(result)
    except Exception as e:
        return jsonify({"success": False, "error": str(e)}), 500
|
||||||
|
|
||||||
|
|
||||||
# ═══════════════════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
# Main
|
# Main
|
||||||
# ═══════════════════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
|
|||||||
@@ -62,6 +62,31 @@ from mcp_tools import (
|
|||||||
PDFProcessingError,
|
PDFProcessingError,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Memory MCP Tools (added for unified Memory + Library system)
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||||
|
from memory.mcp import (
|
||||||
|
# Thought tools
|
||||||
|
AddThoughtInput,
|
||||||
|
SearchThoughtsInput,
|
||||||
|
add_thought_handler,
|
||||||
|
search_thoughts_handler,
|
||||||
|
get_thought_handler,
|
||||||
|
# Message tools
|
||||||
|
AddMessageInput,
|
||||||
|
GetMessagesInput,
|
||||||
|
SearchMessagesInput,
|
||||||
|
add_message_handler,
|
||||||
|
get_messages_handler,
|
||||||
|
search_messages_handler,
|
||||||
|
# Conversation tools
|
||||||
|
GetConversationInput,
|
||||||
|
SearchConversationsInput,
|
||||||
|
ListConversationsInput,
|
||||||
|
get_conversation_handler,
|
||||||
|
search_conversations_handler,
|
||||||
|
list_conversations_handler,
|
||||||
|
)
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Logging Configuration
|
# Logging Configuration
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -551,6 +576,264 @@ async def delete_document(
|
|||||||
return result.model_dump(mode='json')
|
return result.model_dump(mode='json')
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Memory Tools (Thoughts, Messages, Conversations)
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@mcp.tool()
async def add_thought(
    content: str,
    thought_type: str = "reflection",
    trigger: str = "",
    concepts: list[str] | None = None,
    privacy_level: str = "private",
) -> Dict[str, Any]:
    """
    Store a new thought in the Memory system.

    Args:
        content: The thought content.
        thought_type: Type (reflection, question, intuition, observation, etc.).
        trigger: What triggered this thought (optional).
        concepts: Related concepts/tags (optional).
        privacy_level: Privacy level (private, shared, public).

    Returns:
        Dictionary containing:
        - success: Whether thought was added successfully
        - uuid: UUID of the created thought
        - content: Preview of the thought content
        - thought_type: The thought type
    """
    payload = AddThoughtInput(
        content=content,
        thought_type=thought_type,
        trigger=trigger,
        concepts=concepts or [],
        privacy_level=privacy_level,
    )
    return await add_thought_handler(payload)
|
||||||
|
|
||||||
|
|
||||||
|
@mcp.tool()
async def search_thoughts(
    query: str,
    limit: int = 10,
    thought_type_filter: str | None = None,
) -> Dict[str, Any]:
    """
    Run a semantic-similarity search over stored thoughts.

    Args:
        query: Search query text.
        limit: Maximum number of results (1-100, default 10).
        thought_type_filter: Filter by thought type (optional).

    Returns:
        Dictionary containing:
        - success: Whether search succeeded
        - query: The original search query
        - results: List of matching thoughts
        - count: Number of results returned
    """
    payload = SearchThoughtsInput(
        query=query,
        limit=limit,
        thought_type_filter=thought_type_filter,
    )
    return await search_thoughts_handler(payload)
|
||||||
|
|
||||||
|
|
||||||
|
@mcp.tool()
async def get_thought(uuid: str) -> Dict[str, Any]:
    """
    Retrieve a single thought by its UUID.

    Args:
        uuid: Thought UUID.

    Returns:
        Dictionary containing complete thought data or error message.
    """
    # The handler takes the raw UUID string directly (no input model).
    return await get_thought_handler(uuid)
|
||||||
|
|
||||||
|
|
||||||
|
@mcp.tool()
async def add_message(
    content: str,
    role: str,
    conversation_id: str,
    order_index: int = 0,
) -> Dict[str, Any]:
    """
    Append a new message to a conversation.

    Args:
        content: Message content.
        role: Role (user, assistant, system).
        conversation_id: Conversation identifier.
        order_index: Position in conversation (default 0).

    Returns:
        Dictionary containing:
        - success: Whether message was added successfully
        - uuid: UUID of the created message
        - content: Preview of the message content
        - role: The message role
        - conversation_id: The conversation ID
    """
    payload = AddMessageInput(
        content=content,
        role=role,
        conversation_id=conversation_id,
        order_index=order_index,
    )
    return await add_message_handler(payload)
|
||||||
|
|
||||||
|
|
||||||
|
@mcp.tool()
async def get_messages(
    conversation_id: str,
    limit: int = 50,
) -> Dict[str, Any]:
    """
    Fetch the ordered messages of a conversation.

    Args:
        conversation_id: Conversation identifier.
        limit: Maximum messages to return (1-500, default 50).

    Returns:
        Dictionary containing:
        - success: Whether query succeeded
        - conversation_id: The conversation ID
        - messages: List of messages in order
        - count: Number of messages returned
    """
    payload = GetMessagesInput(
        conversation_id=conversation_id,
        limit=limit,
    )
    return await get_messages_handler(payload)
|
||||||
|
|
||||||
|
|
||||||
|
@mcp.tool()
async def search_messages(
    query: str,
    limit: int = 10,
    conversation_id_filter: str | None = None,
) -> Dict[str, Any]:
    """
    Run a semantic-similarity search over stored messages.

    Args:
        query: Search query text.
        limit: Maximum number of results (1-100, default 10).
        conversation_id_filter: Filter by conversation ID (optional).

    Returns:
        Dictionary containing:
        - success: Whether search succeeded
        - query: The original search query
        - results: List of matching messages
        - count: Number of results returned
    """
    payload = SearchMessagesInput(
        query=query,
        limit=limit,
        conversation_id_filter=conversation_id_filter,
    )
    return await search_messages_handler(payload)
|
||||||
|
|
||||||
|
|
||||||
|
@mcp.tool()
async def get_conversation(conversation_id: str) -> Dict[str, Any]:
    """
    Retrieve a single conversation by its identifier.

    Args:
        conversation_id: Conversation identifier.

    Returns:
        Dictionary containing:
        - success: Whether conversation was found
        - conversation_id: The conversation ID
        - category: Conversation category
        - summary: Conversation summary
        - timestamp_start: Start time
        - timestamp_end: End time
        - participants: List of participants
        - tags: Semantic tags
        - message_count: Number of messages
    """
    payload = GetConversationInput(conversation_id=conversation_id)
    return await get_conversation_handler(payload)
|
||||||
|
|
||||||
|
|
||||||
|
@mcp.tool()
async def search_conversations(
    query: str,
    limit: int = 10,
    category_filter: str | None = None,
) -> Dict[str, Any]:
    """
    Run a semantic-similarity search over stored conversations.

    Args:
        query: Search query text.
        limit: Maximum number of results (1-50, default 10).
        category_filter: Filter by category (optional).

    Returns:
        Dictionary containing:
        - success: Whether search succeeded
        - query: The original search query
        - results: List of matching conversations
        - count: Number of results returned
    """
    payload = SearchConversationsInput(
        query=query,
        limit=limit,
        category_filter=category_filter,
    )
    return await search_conversations_handler(payload)
|
||||||
|
|
||||||
|
|
||||||
|
@mcp.tool()
async def list_conversations(
    limit: int = 20,
    category_filter: str | None = None,
) -> Dict[str, Any]:
    """
    Enumerate stored conversations, optionally filtered by category.

    Args:
        limit: Maximum conversations to return (1-100, default 20).
        category_filter: Filter by category (optional).

    Returns:
        Dictionary containing:
        - success: Whether query succeeded
        - conversations: List of conversations
        - count: Number of conversations returned
    """
    payload = ListConversationsInput(
        limit=limit,
        category_filter=category_filter,
    )
    return await list_conversations_handler(payload)
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Signal Handlers
|
# Signal Handlers
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|||||||
@@ -718,6 +718,15 @@
|
|||||||
<span class="icon">📚</span>
|
<span class="icon">📚</span>
|
||||||
<span>Documents</span>
|
<span>Documents</span>
|
||||||
</a>
|
</a>
|
||||||
|
<div style="margin: 1rem 0; border-top: 1px solid rgba(255,255,255,0.1);"></div>
|
||||||
|
<a href="/memories" class="{{ 'active' if request.endpoint == 'memories' else '' }}">
|
||||||
|
<span class="icon">🧠</span>
|
||||||
|
<span>Memory (Ikario)</span>
|
||||||
|
</a>
|
||||||
|
<a href="/conversations" class="{{ 'active' if request.endpoint == 'conversations' else '' }}">
|
||||||
|
<span class="icon">💭</span>
|
||||||
|
<span>Conversations</span>
|
||||||
|
</a>
|
||||||
</div>
|
</div>
|
||||||
</nav>
|
</nav>
|
||||||
|
|
||||||
@@ -736,6 +745,7 @@
|
|||||||
<a href="/chat" class="{{ 'active' if request.endpoint == 'chat' else '' }}">Conversation</a>
|
<a href="/chat" class="{{ 'active' if request.endpoint == 'chat' else '' }}">Conversation</a>
|
||||||
<a href="/upload" class="{{ 'active' if request.endpoint == 'upload' else '' }}">Parser PDF</a>
|
<a href="/upload" class="{{ 'active' if request.endpoint == 'upload' else '' }}">Parser PDF</a>
|
||||||
<a href="/documents" class="{{ 'active' if request.endpoint == 'documents' else '' }}">Documents</a>
|
<a href="/documents" class="{{ 'active' if request.endpoint == 'documents' else '' }}">Documents</a>
|
||||||
|
<a href="/memories" class="{{ 'active' if request.endpoint == 'memories' else '' }}">Memory</a>
|
||||||
</nav>
|
</nav>
|
||||||
</div>
|
</div>
|
||||||
</header>
|
</header>
|
||||||
|
|||||||
Reference in New Issue
Block a user