Add ikario_processual with David profile and embedding script
- david_profile_declared.json: David's declared profile values from questionnaire - scripts/embed_david.py: Python script to generate embeddings using BGE-M3 model - questionnaire_david.md: Questionnaire template for profile values Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
373
ikario_processual/scripts/weaviate_restore.py
Normal file
373
ikario_processual/scripts/weaviate_restore.py
Normal file
@@ -0,0 +1,373 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Restauration de collections Weaviate depuis un backup.
|
||||
|
||||
Usage:
|
||||
python weaviate_restore.py backup.json
|
||||
python weaviate_restore.py backup.json --collections Thought,Conversation
|
||||
python weaviate_restore.py backup.json --dry-run
|
||||
python weaviate_restore.py backup.json --clear-existing
|
||||
|
||||
ATTENTION: Ce script peut supprimer des données existantes!
|
||||
Utilisez --dry-run pour prévisualiser les actions.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
|
||||
# Default configuration: the Weaviate base URL is read from the WEAVIATE_URL
# environment variable and falls back to a local instance on port 8080.
WEAVIATE_URL = os.environ.get("WEAVIATE_URL", "http://localhost:8080")
|
||||
|
||||
|
||||
def check_weaviate_ready() -> bool:
    """Return True when the Weaviate readiness endpoint answers HTTP 200.

    Any ``requests`` error (connection refused, timeout, ...) is reported as
    "not ready" (False) instead of being raised.
    """
    ready_url = f"{WEAVIATE_URL}/v1/.well-known/ready"
    try:
        probe = requests.get(ready_url, timeout=5)
    except requests.RequestException:
        return False
    return probe.status_code == 200
|
||||
|
||||
|
||||
def get_existing_classes(timeout: float = 30.0) -> list[str]:
    """Return the names of the classes currently present in the Weaviate schema.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` would block indefinitely on a stalled server.

    Returns:
        The ``class`` value of every entry listed under ``classes`` in the
        ``/v1/schema`` response; an empty list when that key is missing or null.

    Raises:
        requests.RequestException: On connection failure or timeout, and on an
            HTTP error status (raised by ``raise_for_status``).
    """
    response = requests.get(f"{WEAVIATE_URL}/v1/schema", timeout=timeout)
    response.raise_for_status()
    schema = response.json()
    # "or []" also covers a server answering {"classes": null}.
    return [c["class"] for c in schema.get("classes") or []]
|
||||
|
||||
|
||||
def delete_class(class_name: str, timeout: float = 120.0) -> bool:
    """Delete a class (and, per the Weaviate schema API, all of its objects).

    Args:
        class_name: Name of the class to delete.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` would block indefinitely on a stalled server.

    Returns:
        True when the server answered HTTP 200, False for any other status.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.delete(
        f"{WEAVIATE_URL}/v1/schema/{class_name}",
        timeout=timeout,
    )
    return response.status_code == 200
|
||||
|
||||
|
||||
def create_class(class_schema: dict, timeout: float = 120.0) -> bool:
    """Create a class from its schema definition.

    Args:
        class_schema: Class definition, sent as the JSON body of
            ``POST /v1/schema``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` would block indefinitely on a stalled server.

    Returns:
        True when the server answered HTTP 200, False for any other status.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.post(
        f"{WEAVIATE_URL}/v1/schema",
        json=class_schema,
        headers={"Content-Type": "application/json"},
        timeout=timeout,
    )
    return response.status_code == 200
|
||||
|
||||
|
||||
def insert_object(class_name: str, obj: dict, timeout: float = 120.0) -> bool:
    """Insert a single object into a class.

    Args:
        class_name: Name of the target class.
        obj: Object as stored in the backup. Its ``properties`` are always
            sent (empty dict when absent); ``id`` and ``vector`` are forwarded
            only when present so the original identifier and embedding are kept.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` would block indefinitely on a stalled server.

    Returns:
        True when the server answered HTTP 200 or 201, False otherwise.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    data = {
        "class": class_name,
        "properties": obj.get("properties", {}),
    }

    # Keep the original ID when the backup carries one.
    if "id" in obj:
        data["id"] = obj["id"]

    # Send the stored vector when the backup carries one.
    if "vector" in obj:
        data["vector"] = obj["vector"]

    response = requests.post(
        f"{WEAVIATE_URL}/v1/objects",
        json=data,
        headers={"Content-Type": "application/json"},
        timeout=timeout,
    )

    return response.status_code in [200, 201]
|
||||
|
||||
|
||||
def batch_insert_objects(
    class_name: str,
    objects: list[dict],
    batch_size: int = 100,
    timeout: float = 300.0,
) -> tuple[int, int]:
    """Insert objects into a class through the batch endpoint.

    Objects are sent to ``POST /v1/batch/objects`` in slices of ``batch_size``.
    For each object, ``properties`` is always sent while ``id`` and ``vector``
    are forwarded only when present in the backup entry.

    Args:
        class_name: Name of the target class.
        objects: Objects as stored in the backup.
        batch_size: Maximum number of objects per request; must be >= 1.
        timeout: Seconds to wait for each batch request. Without a timeout
            ``requests`` would block indefinitely on a stalled server.

    Returns:
        A ``(successes, failures)`` tuple. An object counts as a success only
        when its per-item result status is ``"SUCCESS"``; a batch answered
        with a non-200 status counts every object of that batch as a failure.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        requests.RequestException: On connection failure or timeout.
    """
    # A negative step would make range() empty and silently insert nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    success = 0
    failures = 0
    total = len(objects)

    for start in range(0, total, batch_size):
        batch = objects[start:start + batch_size]

        batch_data = {
            "objects": [
                {
                    "class": class_name,
                    "properties": obj.get("properties", {}),
                    **({"id": obj["id"]} if "id" in obj else {}),
                    **({"vector": obj["vector"]} if "vector" in obj else {}),
                }
                for obj in batch
            ]
        }

        response = requests.post(
            f"{WEAVIATE_URL}/v1/batch/objects",
            json=batch_data,
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )

        if response.status_code == 200:
            for item in response.json():
                # "or {}" guards against an item carrying "result": null.
                result = item.get("result") or {}
                if result.get("status") == "SUCCESS":
                    success += 1
                else:
                    failures += 1
                    error = result.get("errors", {})
                    if error:
                        print(f" Erreur: {error}")
        else:
            failures += len(batch)
            print(f" Erreur batch: {response.status_code}")

        # Progress line, overwritten in place thanks to the carriage return.
        progress = min(start + batch_size, total)
        print(f" {class_name}: {progress}/{total} objets traités...", end="\r")

    # Trailing spaces wipe the remainder of the last progress line.
    print(f" {class_name}: {success} succès, {failures} échecs" + " " * 20)
    return success, failures
|
||||
|
||||
|
||||
def restore_weaviate(
    backup_path: Path,
    collections: list[str] | None = None,
    clear_existing: bool = False,
    dry_run: bool = False
) -> dict:
    """Restore collections from a JSON backup file into Weaviate.

    Steps: load the backup, compare the requested collections with the classes
    already on the server, optionally delete conflicting classes, create the
    classes from the backed-up schema, then batch-insert the objects.

    A class whose deletion or creation fails is not populated: its objects are
    counted as failures instead of being inserted into a stale or missing
    class.

    Args:
        backup_path: Path of the backup file (JSON with ``metadata``,
            ``schema`` and ``collections`` keys, all optional).
        collections: Names of the collections to restore; None or an empty
            list restores every collection found in the backup.
        clear_existing: Delete classes that already exist before restoring
            them. When False, existing classes are skipped.
        dry_run: Print the planned actions without modifying the server.

    Returns:
        ``{"success": int, "failures": int, "by_class": {name: {...}}}``, or
        an empty dict when there is nothing to restore. In dry-run mode the
        objects that would be inserted are reported as successes.

    Raises:
        SystemExit: With status 1 when Weaviate is not reachable.
        OSError, json.JSONDecodeError: When the backup cannot be read or parsed.
        requests.RequestException: On network errors while talking to Weaviate.
    """
    print("=" * 60)
    print("RESTAURATION WEAVIATE")
    if dry_run:
        print("*** MODE DRY-RUN - Aucune modification ***")
    print("=" * 60)
    print(f"URL: {WEAVIATE_URL}")
    print(f"Backup: {backup_path}")
    print(f"Clear existing: {clear_existing}")
    print("-" * 60)

    # Check connectivity first.
    if not check_weaviate_ready():
        print("ERREUR: Weaviate n'est pas accessible")
        print(f"Vérifiez que le serveur tourne sur {WEAVIATE_URL}")
        sys.exit(1)

    print("Weaviate connecté ✓")

    # Load the backup.
    print("\n[1/4] Chargement du backup...")
    with open(backup_path, "r", encoding="utf-8") as f:
        backup_data = json.load(f)

    metadata = backup_data.get("metadata", {})
    print(f" Timestamp: {metadata.get('timestamp', 'N/A')}")
    print(f" Source: {metadata.get('weaviate_url', 'N/A')}")
    print(f" Vectors inclus: {metadata.get('include_vectors', False)}")

    schema = backup_data.get("schema", {})
    backup_collections = backup_data.get("collections", {})

    # Decide which collections to restore.
    missing_from_backup: list[str] = []
    if collections:
        classes_to_restore = [c for c in collections if c in backup_collections]
        missing_from_backup = [c for c in collections if c not in backup_collections]
    else:
        classes_to_restore = list(backup_collections.keys())

    print(f"\n Collections à restaurer: {', '.join(classes_to_restore)}")
    if missing_from_backup:
        # Make it visible that a requested name was dropped (typo, wrong backup...).
        print(f" Collections absentes du backup (ignorées): {', '.join(missing_from_backup)}")

    # Compare with the classes already on the server.
    print("\n[2/4] Vérification des collections existantes...")
    existing_classes = get_existing_classes()
    print(f" Collections existantes: {', '.join(existing_classes) or '(aucune)'}")

    conflicts = [c for c in classes_to_restore if c in existing_classes]
    if conflicts:
        print(f" Conflits détectés: {', '.join(conflicts)}")
        if clear_existing:
            print(" → Seront supprimées (--clear-existing)")
        else:
            print(" → Seront ignorées (utilisez --clear-existing pour les remplacer)")
            classes_to_restore = [c for c in classes_to_restore if c not in conflicts]

    if not classes_to_restore:
        print("\nAucune collection à restaurer.")
        return {}

    # Prepare the schema.
    print("\n[3/4] Préparation du schéma...")
    schema_classes = {c["class"]: c for c in schema.get("classes", [])}

    # Classes whose deletion or creation failed; their objects are not inserted.
    not_ready: set[str] = set()

    # Delete existing classes when requested.
    if clear_existing and conflicts:
        print("\n Suppression des collections existantes...")
        for class_name in conflicts:
            if dry_run:
                print(f" [DRY-RUN] Suppression de {class_name}")
            elif delete_class(class_name):
                print(f" Supprimé: {class_name}")
            else:
                print(f" ERREUR suppression: {class_name}")
                not_ready.add(class_name)

    # Create the classes.
    print("\n Création des classes...")
    # One schema lookup after the deletions is enough: creating one class does
    # not make another one appear.
    current_classes = set() if dry_run else set(get_existing_classes())
    for class_name in classes_to_restore:
        if class_name in not_ready:
            continue  # deletion failed: leave the existing class untouched

        class_schema = schema_classes.get(class_name)
        if class_schema is None:
            # NOTE(review): insertion is still attempted for such a class, as
            # before; whether the server accepts it presumably depends on
            # Weaviate auto-schema being enabled - confirm this is intended.
            print(f" Schéma manquant pour: {class_name}")
            continue

        if dry_run:
            print(f" [DRY-RUN] Création de {class_name}")
        elif class_name in current_classes:
            print(f" Existe déjà: {class_name}")
        elif create_class(class_schema):
            print(f" Créé: {class_name}")
        else:
            print(f" ERREUR création: {class_name}")
            not_ready.add(class_name)

    # Insert the objects.
    print("\n[4/4] Insertion des objets...")
    stats = {"success": 0, "failures": 0, "by_class": {}}

    for class_name in classes_to_restore:
        objects = backup_collections.get(class_name, [])
        if not objects:
            print(f" {class_name}: 0 objets")
            continue

        if class_name in not_ready:
            # Never populated in dry-run mode, so this is always a real skip.
            print(f" {class_name}: ignorée, {len(objects)} objets non restaurés")
            stats["by_class"][class_name] = {"success": 0, "failures": len(objects)}
            stats["failures"] += len(objects)
            continue

        if dry_run:
            print(f" [DRY-RUN] {class_name}: {len(objects)} objets à insérer")
            stats["by_class"][class_name] = {"success": len(objects), "failures": 0}
            stats["success"] += len(objects)
        else:
            success, failures = batch_insert_objects(class_name, objects)
            stats["by_class"][class_name] = {"success": success, "failures": failures}
            stats["success"] += success
            stats["failures"] += failures

    # Summary.
    print("\n" + "=" * 60)
    print("RESTAURATION TERMINÉE" + (" (DRY-RUN)" if dry_run else ""))
    print("=" * 60)
    print("\nStatistiques par collection:")
    for class_name, class_stats in stats.get("by_class", {}).items():
        print(f" - {class_name}: {class_stats['success']} succès, {class_stats['failures']} échecs")

    print(f"\nTotal: {stats['success']} succès, {stats['failures']} échecs")

    return stats
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: parse the arguments and run the restore.

    Exits with status 1 when the backup file does not exist or when at least
    one object failed to restore, with status 0 when the user declines the
    ``--clear-existing`` confirmation, and through ``parser.error`` (status 2)
    when ``--collections`` contains no usable name.
    """
    global WEAVIATE_URL  # --url overrides the module-level default

    parser = argparse.ArgumentParser(
        description="Restauration de Weaviate depuis un backup",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Exemples:
python weaviate_restore.py backup.json
python weaviate_restore.py backup.json --dry-run
python weaviate_restore.py backup.json --collections Thought,Conversation
python weaviate_restore.py backup.json --clear-existing

ATTENTION: --clear-existing supprime les donnees existantes!
"""
    )

    parser.add_argument(
        "backup",
        type=Path,
        help="Chemin du fichier de backup"
    )

    parser.add_argument(
        "--collections", "-c",
        type=str,
        default=None,
        help="Collections à restaurer (séparées par des virgules)"
    )

    parser.add_argument(
        "--clear-existing",
        action="store_true",
        help="Supprimer les collections existantes avant restauration"
    )

    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Prévisualiser les actions sans les exécuter"
    )

    parser.add_argument(
        "--url",
        type=str,
        default=None,
        help=f"URL Weaviate (défaut: {WEAVIATE_URL})"
    )

    args = parser.parse_args()

    # The backup must be a regular file; a directory would pass exists() and
    # only fail later inside open().
    if not args.backup.is_file():
        print(f"ERREUR: Fichier non trouvé: {args.backup}")
        sys.exit(1)

    # Weaviate URL override.
    if args.url:
        WEAVIATE_URL = args.url

    # Collections: split on commas and drop empty names ("a,,b", trailing comma).
    collections = None
    if args.collections:
        collections = [name.strip() for name in args.collections.split(",") if name.strip()]
        if not collections:
            # An empty list would mean "restore everything" downstream, which
            # is never what "--collections ," asks for.
            parser.error("--collections ne contient aucun nom de collection")

    # Ask for confirmation before a destructive, non-dry-run restore.
    if args.clear_existing and not args.dry_run:
        print("⚠️ ATTENTION: --clear-existing va SUPPRIMER des données!")
        print(" Utilisez --dry-run pour prévisualiser.")
        try:
            response = input(" Continuer? [y/N] ")
        except EOFError:
            # No interactive stdin (pipe, cron...): treat as "no".
            response = ""
        if response.strip().lower() != "y":
            print("Annulé.")
            sys.exit(0)

    # Run the restore.
    stats = restore_weaviate(
        backup_path=args.backup,
        collections=collections,
        clear_existing=args.clear_existing,
        dry_run=args.dry_run
    )

    # Report failures through the exit status so scripts can detect them.
    if stats.get("failures"):
        sys.exit(1)
|
||||
|
||||
|
||||
# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user