feat: Add vectorized summary field and migration tools
- Add 'summary' field to Chunk collection (vectorized with text2vec) - Migrate from Dynamic index to HNSW + RQ for both Chunk and Summary - Add LLM summarizer module (utils/llm_summarizer.py) - Add migration scripts (migrate_add_summary.py, restore_*.py) - Add summary generation utilities and progress tracking - Add testing and cleaning tools (outils_test_and_cleaning/) - Add comprehensive documentation (ANALYSE_*.md, guides) - Remove obsolete files (linear_config.py, old test files) - Update .gitignore to exclude backups and temp files 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
229
generations/library_rag/restore_remaining_chunks.py
Normal file
229
generations/library_rag/restore_remaining_chunks.py
Normal file
@@ -0,0 +1,229 @@
|
||||
"""Script pour restaurer uniquement les chunks manquants.
|
||||
|
||||
Ce script:
|
||||
1. Récupère tous les chunks déjà présents dans Weaviate
|
||||
2. Compare avec le backup pour identifier les chunks manquants
|
||||
3. Importe uniquement les chunks manquants
|
||||
|
||||
Usage:
|
||||
python restore_remaining_chunks.py backup_migration_20260105_174349
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Set
|
||||
|
||||
import weaviate
|
||||
|
||||
# Configuration logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="[%(asctime)s] %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def fix_date_format(value):
|
||||
"""Convertit les dates ISO8601 en RFC3339 (remplace espace par T)."""
|
||||
if isinstance(value, str) and re.match(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', value):
|
||||
return value.replace(' ', 'T', 1)
|
||||
return value
|
||||
|
||||
|
||||
def fix_dates_in_object(obj):
|
||||
"""Parcourt récursivement un objet et fixe les formats de date."""
|
||||
if isinstance(obj, dict):
|
||||
return {k: fix_dates_in_object(v) for k, v in obj.items()}
|
||||
elif isinstance(obj, list):
|
||||
return [fix_dates_in_object(item) for item in obj]
|
||||
else:
|
||||
return fix_date_format(obj)
|
||||
|
||||
|
||||
def get_existing_chunk_texts(client: weaviate.WeaviateClient) -> Set[str]:
|
||||
"""Récupère les textes de tous les chunks existants pour comparaison.
|
||||
|
||||
On utilise les premiers 100 caractères du texte comme clé unique.
|
||||
"""
|
||||
logger.info("Récupération des chunks existants...")
|
||||
|
||||
chunk_collection = client.collections.get("Chunk")
|
||||
existing_texts = set()
|
||||
|
||||
cursor = None
|
||||
batch_size = 1000
|
||||
|
||||
while True:
|
||||
if cursor:
|
||||
response = chunk_collection.query.fetch_objects(
|
||||
limit=batch_size,
|
||||
after=cursor
|
||||
)
|
||||
else:
|
||||
response = chunk_collection.query.fetch_objects(limit=batch_size)
|
||||
|
||||
if not response.objects:
|
||||
break
|
||||
|
||||
for obj in response.objects:
|
||||
text = obj.properties.get("text", "")
|
||||
# Utiliser les 100 premiers caractères comme clé unique
|
||||
text_key = text[:100] if text else ""
|
||||
existing_texts.add(text_key)
|
||||
|
||||
if len(response.objects) < batch_size:
|
||||
break
|
||||
|
||||
cursor = response.objects[-1].uuid
|
||||
|
||||
logger.info(f" ✓ {len(existing_texts)} chunks existants récupérés")
|
||||
return existing_texts
|
||||
|
||||
|
||||
def import_missing_chunks(
|
||||
client: weaviate.WeaviateClient,
|
||||
backup_file: Path,
|
||||
existing_texts: Set[str]
|
||||
) -> int:
|
||||
"""Importe uniquement les chunks manquants."""
|
||||
|
||||
logger.info(f"Chargement du backup depuis {backup_file}...")
|
||||
|
||||
if not backup_file.exists():
|
||||
logger.error(f" ✗ Fichier {backup_file} introuvable")
|
||||
return 0
|
||||
|
||||
try:
|
||||
with open(backup_file, "r", encoding="utf-8") as f:
|
||||
objects = json.load(f)
|
||||
|
||||
logger.info(f" ✓ {len(objects)} chunks dans le backup")
|
||||
|
||||
# Filtrer les chunks manquants
|
||||
missing_chunks = []
|
||||
for obj in objects:
|
||||
text = obj["properties"].get("text", "")
|
||||
text_key = text[:100] if text else ""
|
||||
|
||||
if text_key not in existing_texts:
|
||||
missing_chunks.append(obj)
|
||||
|
||||
logger.info(f" → {len(missing_chunks)} chunks manquants à restaurer")
|
||||
|
||||
if not missing_chunks:
|
||||
logger.info(" ✓ Aucun chunk manquant !")
|
||||
return 0
|
||||
|
||||
# Préparer les objets pour l'insertion
|
||||
collection = client.collections.get("Chunk")
|
||||
objects_to_insert = []
|
||||
|
||||
for obj in missing_chunks:
|
||||
props = obj["properties"]
|
||||
|
||||
# Ajouter le champ summary vide
|
||||
props["summary"] = ""
|
||||
|
||||
# Fixer les formats de date
|
||||
props = fix_dates_in_object(props)
|
||||
|
||||
objects_to_insert.append(props)
|
||||
|
||||
# Insertion par batch
|
||||
batch_size = 20 # Petit batch pour éviter OOM
|
||||
total_inserted = 0
|
||||
|
||||
logger.info("\nInsertion des chunks manquants...")
|
||||
for i in range(0, len(objects_to_insert), batch_size):
|
||||
batch = objects_to_insert[i:i + batch_size]
|
||||
|
||||
try:
|
||||
collection.data.insert_many(batch)
|
||||
total_inserted += len(batch)
|
||||
|
||||
if (i // batch_size + 1) % 10 == 0:
|
||||
logger.info(f" → {total_inserted}/{len(objects_to_insert)} objets insérés...")
|
||||
|
||||
# Pause entre batches pour éviter surcharge mémoire
|
||||
time.sleep(0.1)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f" ✗ Erreur batch {i//batch_size + 1}: {e}")
|
||||
|
||||
# En cas d'erreur, attendre plus longtemps et continuer
|
||||
time.sleep(5)
|
||||
|
||||
logger.info(f"\n ✓ {total_inserted} chunks manquants importés")
|
||||
return total_inserted
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f" ✗ Erreur lors de l'import: {e}")
|
||||
return 0
|
||||
|
||||
|
||||
def main():
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: python restore_remaining_chunks.py <backup_directory>")
|
||||
sys.exit(1)
|
||||
|
||||
backup_dir = Path(sys.argv[1])
|
||||
|
||||
if not backup_dir.exists():
|
||||
logger.error(f"Backup directory '{backup_dir}' does not exist")
|
||||
sys.exit(1)
|
||||
|
||||
logger.info("=" * 80)
|
||||
logger.info(f"RESTORATION DES CHUNKS MANQUANTS DEPUIS {backup_dir}")
|
||||
logger.info("=" * 80)
|
||||
|
||||
# Connexion à Weaviate
|
||||
logger.info("\nConnexion à Weaviate...")
|
||||
try:
|
||||
client = weaviate.connect_to_local(
|
||||
host="localhost",
|
||||
port=8080,
|
||||
grpc_port=50051,
|
||||
)
|
||||
logger.info(" ✓ Connexion établie")
|
||||
except Exception as e:
|
||||
logger.error(f" ✗ Erreur de connexion: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
# Étape 1: Récupérer les chunks existants
|
||||
existing_texts = get_existing_chunk_texts(client)
|
||||
|
||||
# Étape 2: Importer les chunks manquants
|
||||
backup_file = backup_dir / "chunk_backup.json"
|
||||
total_imported = import_missing_chunks(client, backup_file, existing_texts)
|
||||
|
||||
# Étape 3: Vérification finale
|
||||
logger.info("\nVérification finale...")
|
||||
chunk_collection = client.collections.get("Chunk")
|
||||
result = chunk_collection.aggregate.over_all()
|
||||
final_count = result.total_count
|
||||
|
||||
logger.info(f" ✓ Total de chunks dans Weaviate: {final_count}")
|
||||
|
||||
logger.info("\n" + "=" * 80)
|
||||
logger.info("RESTORATION DES CHUNKS MANQUANTS TERMINÉE !")
|
||||
logger.info("=" * 80)
|
||||
logger.info(f"✓ Chunks importés: {total_imported}")
|
||||
logger.info(f"✓ Total final: {final_count}/5246")
|
||||
logger.info("=" * 80)
|
||||
|
||||
finally:
|
||||
client.close()
|
||||
logger.info("\n✓ Connexion fermée")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Fix encoding for Windows
|
||||
if sys.platform == "win32" and hasattr(sys.stdout, 'reconfigure'):
|
||||
sys.stdout.reconfigure(encoding='utf-8')
|
||||
|
||||
main()
|
||||
Reference in New Issue
Block a user