- david_profile_declared.json: David's declared profile values from questionnaire - scripts/embed_david.py: Python script to generate embeddings using BGE-M3 model - questionnaire_david.md: Questionnaire template for profile values Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
374 lines
12 KiB
Python
374 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Restauration de collections Weaviate depuis un backup.
|
|
|
|
Usage:
|
|
python weaviate_restore.py backup.json
|
|
python weaviate_restore.py backup.json --collections Thought,Conversation
|
|
python weaviate_restore.py backup.json --dry-run
|
|
python weaviate_restore.py backup.json --clear-existing
|
|
|
|
ATTENTION: Ce script peut supprimer des données existantes!
|
|
Utilisez --dry-run pour prévisualiser les actions.
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import requests
|
|
|
|
# Default configuration: base URL of the Weaviate server, overridable through
# the WEAVIATE_URL environment variable.
WEAVIATE_URL = os.environ.get("WEAVIATE_URL", "http://localhost:8080")
|
|
|
|
|
|
def check_weaviate_ready() -> bool:
    """Report whether the Weaviate readiness endpoint answers with HTTP 200.

    Any ``requests.RequestException`` raised while contacting the server is
    treated as "not ready" and yields ``False``.
    """
    ready_url = f"{WEAVIATE_URL}/v1/.well-known/ready"
    try:
        status = requests.get(ready_url, timeout=5).status_code
    except requests.RequestException:
        return False
    return status == 200
|
|
|
|
|
|
def get_existing_classes() -> list[str]:
    """Return the names of the classes currently present in the Weaviate schema.

    Returns:
        The value of the ``class`` key of every entry under ``classes`` in
        the ``/v1/schema`` response (empty list when there are none).

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps the restore from hanging forever on an unresponsive server.
    response = requests.get(f"{WEAVIATE_URL}/v1/schema", timeout=30)
    response.raise_for_status()
    schema = response.json()
    # "classes" may be absent or null; both are treated as "no classes".
    return [c["class"] for c in schema.get("classes") or []]
|
|
|
|
|
|
def delete_class(class_name: str) -> bool:
    """Delete a class (and all of its objects) from the Weaviate schema.

    Args:
        class_name: Name of the class to delete.

    Returns:
        True when the server answers HTTP 200, False on any other status or
        when the request itself fails (network error, timeout).
    """
    try:
        # A timeout keeps the call from blocking forever on a stalled server.
        response = requests.delete(
            f"{WEAVIATE_URL}/v1/schema/{class_name}",
            timeout=30,
        )
    except requests.RequestException as exc:
        # Honour the boolean contract instead of crashing the caller, which
        # already reports a False result as an error.
        print(f" Erreur réseau: {exc}")
        return False
    return response.status_code == 200
|
|
|
|
|
|
def create_class(class_schema: dict) -> bool:
    """Create a class in Weaviate from its schema definition.

    Args:
        class_schema: Class definition, sent as the JSON body of
            ``POST /v1/schema``.

    Returns:
        True when the server answers HTTP 200, False on any other status or
        when the request itself fails (network error, timeout).
    """
    try:
        # A timeout keeps the call from blocking forever on a stalled server.
        response = requests.post(
            f"{WEAVIATE_URL}/v1/schema",
            json=class_schema,
            headers={"Content-Type": "application/json"},
            timeout=30,
        )
    except requests.RequestException as exc:
        # Honour the boolean contract instead of crashing the caller, which
        # already reports a False result as an error.
        print(f" Erreur réseau: {exc}")
        return False
    return response.status_code == 200
|
|
|
|
|
|
def insert_object(class_name: str, obj: dict) -> bool:
    """Insert a single object into a class.

    Args:
        class_name: Name of the target class.
        obj: Object taken from the backup. Its ``properties`` are sent
            (empty dict when missing); ``id`` and ``vector`` are forwarded
            only when present so the original identifier and embedding are
            preserved.

    Returns:
        True when the server answers HTTP 200 or 201, False on any other
        status or when the request itself fails (network error, timeout).
    """
    data = {
        "class": class_name,
        "properties": obj.get("properties", {}),
    }

    # Keep the original ID when the backup provides one
    if "id" in obj:
        data["id"] = obj["id"]

    # Include the vector when the backup provides one
    if "vector" in obj:
        data["vector"] = obj["vector"]

    try:
        # A timeout keeps the call from blocking forever on a stalled server.
        response = requests.post(
            f"{WEAVIATE_URL}/v1/objects",
            json=data,
            headers={"Content-Type": "application/json"},
            timeout=30,
        )
    except requests.RequestException as exc:
        # Honour the boolean contract instead of raising to the caller.
        print(f" Erreur réseau: {exc}")
        return False

    return response.status_code in (200, 201)
|
|
|
|
|
|
def batch_insert_objects(class_name: str, objects: list[dict], batch_size: int = 100) -> tuple[int, int]:
    """Insert objects into a class through the batch endpoint.

    Objects are sent to ``/v1/batch/objects`` in slices of ``batch_size``.
    A batch whose request fails (network error, timeout, unreadable JSON or
    a non-200 status) is counted entirely as failures and processing moves
    on to the next batch.

    Args:
        class_name: Name of the target class.
        objects: Objects from the backup; ``properties`` is always sent,
            ``id`` and ``vector`` only when present.
        batch_size: Number of objects per request; must be at least 1.

    Returns:
        A ``(successes, failures)`` tuple.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently insert nothing;
    # zero already raised ValueError. Reject both explicitly.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    success = 0
    failures = 0
    total = len(objects)

    for start in range(0, total, batch_size):
        batch = objects[start:start + batch_size]

        batch_data = {
            "objects": [
                {
                    "class": class_name,
                    "properties": obj.get("properties", {}),
                    **({"id": obj["id"]} if "id" in obj else {}),
                    **({"vector": obj["vector"]} if "vector" in obj else {}),
                }
                for obj in batch
            ]
        }

        try:
            response = requests.post(
                f"{WEAVIATE_URL}/v1/batch/objects",
                json=batch_data,
                headers={"Content-Type": "application/json"},
                timeout=120,
            )
            # Decode inside the try: a malformed body must fail this batch
            # only, not abort the whole restore.
            results = response.json() if response.status_code == 200 else None
        except (requests.RequestException, ValueError) as exc:
            failures += len(batch)
            print(f" Erreur batch: {exc}")
        else:
            if results is None:
                failures += len(batch)
                print(f" Erreur batch: {response.status_code}")
            else:
                for item in results:
                    # "result" may be missing or null for an item.
                    outcome = item.get("result") or {}
                    if outcome.get("status") == "SUCCESS":
                        success += 1
                    else:
                        failures += 1
                        error = outcome.get("errors", {})
                        if error:
                            print(f" Erreur: {error}")

        # Progress line, overwritten in place thanks to the carriage return
        progress = min(start + batch_size, total)
        print(f" {class_name}: {progress}/{total} objets traités...", end="\r")

    # Trailing spaces erase what is left of the progress line
    print(f" {class_name}: {success} succès, {failures} échecs" + " " * 20)
    return success, failures
|
|
|
|
|
|
def restore_weaviate(
    backup_path: Path,
    collections: list[str] | None = None,
    clear_existing: bool = False,
    dry_run: bool = False
) -> dict:
    """Restore collections from a JSON backup file into Weaviate.

    The backup is read as a JSON document with the top-level keys
    ``metadata``, ``schema`` (holding a ``classes`` list) and
    ``collections`` (class name -> list of objects). The restore runs in
    four printed steps: load the backup, compare with the classes already on
    the server, prepare the schema (optional deletion, then creation), and
    batch-insert the objects.

    Collections that already exist on the server are skipped unless
    ``clear_existing`` is set. A collection whose deletion or creation fails
    is not populated: all of its objects are reported as failures, so data is
    never merged into a stale class or into a class without the backed-up
    schema.

    Args:
        backup_path: Path of the backup file.
        collections: Collections to restore (None or empty = all collections
            in the backup). Duplicates are ignored; names missing from the
            backup are reported and skipped.
        clear_existing: Delete conflicting collections before restoring them.
        dry_run: Print the planned actions without modifying Weaviate.

    Returns:
        A dict with ``success`` and ``failures`` totals plus a ``by_class``
        breakdown, or an empty dict when there is nothing to restore.

    Raises:
        SystemExit: with code 1 when Weaviate is not reachable.
    """
    print("=" * 60)
    print("RESTAURATION WEAVIATE")
    if dry_run:
        print("*** MODE DRY-RUN - Aucune modification ***")
    print("=" * 60)
    print(f"URL: {WEAVIATE_URL}")
    print(f"Backup: {backup_path}")
    print(f"Clear existing: {clear_existing}")
    print("-" * 60)

    # Check the connection
    if not check_weaviate_ready():
        print("ERREUR: Weaviate n'est pas accessible")
        print(f"Vérifiez que le serveur tourne sur {WEAVIATE_URL}")
        sys.exit(1)

    print("Weaviate connecté ✓")

    # Load the backup
    print("\n[1/4] Chargement du backup...")
    with open(backup_path, "r", encoding="utf-8") as f:
        backup_data = json.load(f)

    metadata = backup_data.get("metadata", {})
    print(f" Timestamp: {metadata.get('timestamp', 'N/A')}")
    print(f" Source: {metadata.get('weaviate_url', 'N/A')}")
    print(f" Vectors inclus: {metadata.get('include_vectors', False)}")

    schema = backup_data.get("schema", {})
    backup_collections = backup_data.get("collections", {})

    # Work out which collections to restore
    missing_from_backup: list[str] = []
    if collections:
        # dict.fromkeys drops duplicate names while keeping the given order,
        # so a collection is never created or filled twice.
        requested = list(dict.fromkeys(collections))
        classes_to_restore = [c for c in requested if c in backup_collections]
        missing_from_backup = [c for c in requested if c not in backup_collections]
    else:
        classes_to_restore = list(backup_collections.keys())

    print(f"\n Collections à restaurer: {', '.join(classes_to_restore)}")
    if missing_from_backup:
        # Make typos in --collections visible instead of dropping them silently
        print(f" Collections absentes du backup (ignorées): {', '.join(missing_from_backup)}")

    # Compare with the collections already on the server
    print("\n[2/4] Vérification des collections existantes...")
    existing_classes = get_existing_classes()
    print(f" Collections existantes: {', '.join(existing_classes) or '(aucune)'}")

    conflicts = [c for c in classes_to_restore if c in existing_classes]
    if conflicts:
        print(f" Conflits détectés: {', '.join(conflicts)}")
        if clear_existing:
            print(" → Seront supprimées (--clear-existing)")
        else:
            print(" → Seront ignorées (utilisez --clear-existing pour les remplacer)")
            classes_to_restore = [c for c in classes_to_restore if c not in conflicts]

    if not classes_to_restore:
        print("\nAucune collection à restaurer.")
        return {}

    # Prepare the schema
    print("\n[3/4] Préparation du schéma...")
    schema_classes = {c["class"]: c for c in schema.get("classes", [])}

    # Classes whose deletion or creation failed; they must not receive objects
    unprepared: set[str] = set()

    # Delete the existing collections when requested
    if clear_existing and conflicts:
        print("\n Suppression des collections existantes...")
        for class_name in conflicts:
            if dry_run:
                print(f" [DRY-RUN] Suppression de {class_name}")
            elif delete_class(class_name):
                print(f" Supprimé: {class_name}")
            else:
                print(f" ERREUR suppression: {class_name}")
                unprepared.add(class_name)

    # Create the classes
    print("\n Création des classes...")
    # Fetched once after the deletions and kept up to date locally, instead
    # of one schema request per class.
    current_classes = set() if dry_run else set(get_existing_classes())
    for class_name in classes_to_restore:
        if class_name not in schema_classes:
            print(f" Schéma manquant pour: {class_name}")
            continue
        if dry_run:
            print(f" [DRY-RUN] Création de {class_name}")
            continue
        if class_name in current_classes:
            print(f" Existe déjà: {class_name}")
            continue
        if create_class(schema_classes[class_name]):
            print(f" Créé: {class_name}")
            current_classes.add(class_name)
        else:
            print(f" ERREUR création: {class_name}")
            unprepared.add(class_name)

    # Insert the objects
    print("\n[4/4] Insertion des objets...")
    stats = {"success": 0, "failures": 0, "by_class": {}}

    for class_name in classes_to_restore:
        objects = backup_collections.get(class_name, [])

        if class_name in unprepared:
            # Inserting here would merge into a class that could not be
            # cleared, or into one lacking the backed-up schema.
            skipped = len(objects)
            print(f" {class_name}: classe non préparée, {skipped} objets non insérés")
            stats["by_class"][class_name] = {"success": 0, "failures": skipped}
            stats["failures"] += skipped
            continue

        if not objects:
            print(f" {class_name}: 0 objets")
            continue

        if dry_run:
            print(f" [DRY-RUN] {class_name}: {len(objects)} objets à insérer")
            stats["by_class"][class_name] = {"success": len(objects), "failures": 0}
            stats["success"] += len(objects)
        else:
            success, failures = batch_insert_objects(class_name, objects)
            stats["by_class"][class_name] = {"success": success, "failures": failures}
            stats["success"] += success
            stats["failures"] += failures

    # Summary
    print("\n" + "=" * 60)
    print("RESTAURATION TERMINÉE" + (" (DRY-RUN)" if dry_run else ""))
    print("=" * 60)
    print("\nStatistiques par collection:")
    for class_name, class_stats in stats["by_class"].items():
        print(f" - {class_name}: {class_stats['success']} succès, {class_stats['failures']} échecs")

    print(f"\nTotal: {stats['success']} succès, {stats['failures']} échecs")

    return stats
|
|
|
|
|
|
def main():
    """Command-line entry point: parse the arguments and run the restore.

    Exits with code 1 when the backup path is not an existing file, with
    argparse's usage error when ``--collections`` contains no collection
    name, and with code 0 when the user declines the ``--clear-existing``
    confirmation (or when no answer can be read from stdin).
    """
    global WEAVIATE_URL  # --url overrides the module-level default

    parser = argparse.ArgumentParser(
        description="Restauration de Weaviate depuis un backup",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Exemples:
  python weaviate_restore.py backup.json
  python weaviate_restore.py backup.json --dry-run
  python weaviate_restore.py backup.json --collections Thought,Conversation
  python weaviate_restore.py backup.json --clear-existing

ATTENTION: --clear-existing supprime les donnees existantes!
"""
    )

    parser.add_argument(
        "backup",
        type=Path,
        help="Chemin du fichier de backup"
    )

    parser.add_argument(
        "--collections", "-c",
        type=str,
        default=None,
        help="Collections à restaurer (séparées par des virgules)"
    )

    parser.add_argument(
        "--clear-existing",
        action="store_true",
        help="Supprimer les collections existantes avant restauration"
    )

    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Prévisualiser les actions sans les exécuter"
    )

    parser.add_argument(
        "--url",
        type=str,
        default=None,
        help=f"URL Weaviate (défaut: {WEAVIATE_URL})"
    )

    args = parser.parse_args()

    # The backup must be an existing regular file; exists() alone would let a
    # directory through and fail later in open().
    if not args.backup.is_file():
        print(f"ERREUR: Fichier non trouvé: {args.backup}")
        sys.exit(1)

    # Weaviate URL
    if args.url:
        WEAVIATE_URL = args.url

    # Collections
    collections = None
    if args.collections:
        # Drop empty names produced by stray commas ("a,,b" or "a,")
        collections = [
            name.strip() for name in args.collections.split(",") if name.strip()
        ]
        if not collections:
            # An empty list would otherwise mean "restore everything"
            parser.error("--collections ne contient aucun nom de collection")

    # Ask for confirmation before a destructive run
    if args.clear_existing and not args.dry_run:
        print("⚠️ ATTENTION: --clear-existing va SUPPRIMER des données!")
        print(" Utilisez --dry-run pour prévisualiser.")
        try:
            response = input(" Continuer? [y/N] ")
        except EOFError:
            # No interactive stdin: take the safe default and do not delete
            response = ""
        if response.strip().lower() != "y":
            print("Annulé.")
            sys.exit(0)

    # Run the restore
    restore_weaviate(
        backup_path=args.backup,
        collections=collections,
        clear_existing=args.clear_existing,
        dry_run=args.dry_run
    )
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|