feat: Add vectorized summary field and migration tools
- Add 'summary' field to Chunk collection (vectorized with text2vec)
- Migrate from Dynamic index to HNSW + RQ for both Chunk and Summary
- Add LLM summarizer module (utils/llm_summarizer.py)
- Add migration scripts (migrate_add_summary.py, restore_*.py)
- Add summary generation utilities and progress tracking
- Add testing and cleaning tools (outils_test_and_cleaning/)
- Add comprehensive documentation (ANALYSE_*.md, guides)
- Remove obsolete files (linear_config.py, old test files)
- Update .gitignore to exclude backups and temp files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
67
generations/library_rag/check_progress.py
Normal file
67
generations/library_rag/check_progress.py
Normal file
@@ -0,0 +1,67 @@
|
||||
"""Script pour vérifier la progression de la génération de résumés."""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import weaviate
|
||||
|
||||
# Force UTF-8 output on Windows so the emoji and accented characters printed
# below do not fail on a legacy console code page.
if sys.platform == 'win32' and hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(encoding='utf-8')

# Progress file looked up relative to the current working directory.
PROGRESS_FILE = Path("summary_generation_progress.json")

# Rough per-chunk generation time, in seconds, used only for the ETA estimate.
SECONDS_PER_CHUNK = 50


def percentage(part, total):
    """Return ``part`` as a percentage of ``total``.

    Returns 0.0 when ``total`` is zero instead of raising ZeroDivisionError,
    so an empty collection can still be reported.
    """
    if not total:
        return 0.0
    return part / total * 100


def eta_hours(remaining, seconds_per_chunk=SECONDS_PER_CHUNK):
    """Return the estimated hours needed to process ``remaining`` chunks."""
    return (remaining * seconds_per_chunk) / 3600


def count_chunks(client):
    """Count the objects of the "Chunk" collection.

    Args:
        client: A connected Weaviate client.

    Returns:
        A ``(total, without_summary)`` tuple, where ``without_summary`` is the
        number of objects whose "summary" property is missing or empty.
    """
    chunk_collection = client.collections.get("Chunk")
    total = 0
    without_summary = 0
    # Walk the whole collection with the cursor-based iterator: a single
    # fetch_objects(limit=10000) call silently truncates larger collections
    # and would make every figure below wrong.
    for obj in chunk_collection.iterator():
        total += 1
        if not obj.properties.get("summary", ""):
            without_summary += 1
    return total, without_summary


def report_coverage(total, without_summary):
    """Print summary coverage, the remaining work and a rough ETA."""
    with_summary = total - without_summary

    print(f"\n📈 Total chunks : {total}")
    if total == 0:
        # Nothing to summarize; do not claim the job is finished.
        print("\n⚠ Aucun chunk dans la collection")
        return

    print(f"✓ Avec résumé : {with_summary} ({percentage(with_summary, total):.1f}%)")
    print(f"⏳ Sans résumé : {without_summary} ({percentage(without_summary, total):.1f}%)")

    if without_summary > 0:
        print(f"\n🎯 Progression estimée : {with_summary}/{total} chunks")
        print(f" Reste à traiter : {without_summary} chunks")
        print(f" ETA (~50s/chunk) : {eta_hours(without_summary):.1f} heures")
    else:
        print("\n✅ TERMINÉ ! Tous les chunks ont un résumé !")


def main():
    """Print the saved progress, then the live summary coverage from Weaviate.

    Exits with status 0 right after the hint message when no progress file
    exists. Weaviate errors are reported but never abort the script.
    """
    print("=" * 80)
    print("PROGRESSION GÉNÉRATION DE RÉSUMÉS")
    print("=" * 80)

    # Read the saved progress.
    if not PROGRESS_FILE.exists():
        print("\n⚠ Aucune progression sauvegardée")
        print(" → Lancez resume_summaries.bat pour démarrer")
        sys.exit(0)

    with open(PROGRESS_FILE, "r", encoding="utf-8") as f:
        progress = json.load(f)

    # Both fields are display-only, so a missing key is shown as "N/A"
    # rather than crashing with KeyError.
    processed = progress.get("total_processed", "N/A")
    last_update = progress.get("last_update", "N/A")

    print(f"\n📊 Chunks traités : {processed}")
    print(f"🕒 Dernière MAJ : {last_update}")

    # Query Weaviate for the real totals. This is a best-effort status check,
    # so any failure is reported and the script carries on to the footer.
    try:
        client = weaviate.connect_to_local(host="localhost", port=8080, grpc_port=50051)
        try:
            total, without_summary = count_chunks(client)
        finally:
            # Always release the connection, even when counting fails.
            client.close()
    except Exception as e:
        print(f"\n⚠ Erreur connexion Weaviate: {e}")
    else:
        report_coverage(total, without_summary)

    print("\n" + "=" * 80)
    print("Pour relancer la génération : resume_summaries.bat")
    print("=" * 80)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user