Add ikario_processual with David profile and embedding script
- david_profile_declared.json: David's declared profile values from questionnaire - scripts/embed_david.py: Python script to generate embeddings using BGE-M3 model - questionnaire_david.md: Questionnaire template for profile values Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
250
ikario_processual/scripts/weaviate_backup.py
Normal file
250
ikario_processual/scripts/weaviate_backup.py
Normal file
@@ -0,0 +1,250 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Backup complet de toutes les collections Weaviate.
|
||||
|
||||
Usage:
|
||||
python weaviate_backup.py
|
||||
python weaviate_backup.py --output exports/backup_20260131.json
|
||||
python weaviate_backup.py --collections Thought,Conversation
|
||||
|
||||
Ce script exporte:
|
||||
- Le schéma complet (classes et propriétés)
|
||||
- Tous les objets de chaque collection
|
||||
- Les vecteurs (embeddings) de chaque objet
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
|
||||
# Configuration par défaut
|
||||
WEAVIATE_URL = os.getenv("WEAVIATE_URL", "http://localhost:8080")
|
||||
DEFAULT_OUTPUT_DIR = Path(__file__).parent.parent.parent / "exports"
|
||||
|
||||
|
||||
def check_weaviate_ready() -> bool:
|
||||
"""Vérifie que Weaviate est accessible."""
|
||||
try:
|
||||
response = requests.get(f"{WEAVIATE_URL}/v1/.well-known/ready", timeout=5)
|
||||
return response.status_code == 200
|
||||
except requests.RequestException:
|
||||
return False
|
||||
|
||||
|
||||
def get_schema() -> dict:
|
||||
"""Récupère le schéma complet de Weaviate."""
|
||||
response = requests.get(f"{WEAVIATE_URL}/v1/schema")
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
|
||||
def get_all_objects(class_name: str, include_vector: bool = True) -> list[dict]:
|
||||
"""
|
||||
Récupère tous les objets d'une classe avec pagination.
|
||||
|
||||
Args:
|
||||
class_name: Nom de la collection
|
||||
include_vector: Inclure les vecteurs (embeddings)
|
||||
|
||||
Returns:
|
||||
Liste de tous les objets
|
||||
"""
|
||||
objects = []
|
||||
limit = 100
|
||||
offset = 0
|
||||
|
||||
include_param = "vector" if include_vector else ""
|
||||
|
||||
while True:
|
||||
url = f"{WEAVIATE_URL}/v1/objects?class={class_name}&limit={limit}&offset={offset}"
|
||||
if include_param:
|
||||
url += f"&include={include_param}"
|
||||
|
||||
response = requests.get(url)
|
||||
|
||||
if response.status_code != 200:
|
||||
print(f" Erreur lors de la récupération de {class_name}: {response.status_code}")
|
||||
break
|
||||
|
||||
data = response.json()
|
||||
batch = data.get("objects", [])
|
||||
|
||||
if not batch:
|
||||
break
|
||||
|
||||
objects.extend(batch)
|
||||
offset += limit
|
||||
|
||||
# Progress
|
||||
print(f" {class_name}: {len(objects)} objets récupérés...", end="\r")
|
||||
|
||||
print(f" {class_name}: {len(objects)} objets au total")
|
||||
return objects
|
||||
|
||||
|
||||
def backup_weaviate(
|
||||
output_path: Path,
|
||||
collections: list[str] | None = None,
|
||||
include_vectors: bool = True
|
||||
) -> dict:
|
||||
"""
|
||||
Effectue un backup complet de Weaviate.
|
||||
|
||||
Args:
|
||||
output_path: Chemin du fichier de sortie
|
||||
collections: Liste des collections à exporter (None = toutes)
|
||||
include_vectors: Inclure les vecteurs
|
||||
|
||||
Returns:
|
||||
Statistiques du backup
|
||||
"""
|
||||
print("=" * 60)
|
||||
print("BACKUP WEAVIATE")
|
||||
print("=" * 60)
|
||||
print(f"URL: {WEAVIATE_URL}")
|
||||
print(f"Output: {output_path}")
|
||||
print(f"Include vectors: {include_vectors}")
|
||||
print("-" * 60)
|
||||
|
||||
# Vérifier la connexion
|
||||
if not check_weaviate_ready():
|
||||
print("ERREUR: Weaviate n'est pas accessible")
|
||||
print(f"Vérifiez que le serveur tourne sur {WEAVIATE_URL}")
|
||||
sys.exit(1)
|
||||
|
||||
print("Weaviate connecte [OK]")
|
||||
|
||||
# Récupérer le schéma
|
||||
print("\n[1/3] Récupération du schéma...")
|
||||
schema = get_schema()
|
||||
all_classes = [c["class"] for c in schema.get("classes", [])]
|
||||
print(f" Classes trouvées: {', '.join(all_classes)}")
|
||||
|
||||
# Filtrer les collections si spécifié
|
||||
if collections:
|
||||
classes_to_backup = [c for c in all_classes if c in collections]
|
||||
print(f" Collections sélectionnées: {', '.join(classes_to_backup)}")
|
||||
else:
|
||||
classes_to_backup = all_classes
|
||||
|
||||
# Récupérer les objets de chaque classe
|
||||
print("\n[2/3] Récupération des objets...")
|
||||
backup_data = {
|
||||
"metadata": {
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"weaviate_url": WEAVIATE_URL,
|
||||
"include_vectors": include_vectors,
|
||||
"version": "1.0"
|
||||
},
|
||||
"schema": schema,
|
||||
"collections": {}
|
||||
}
|
||||
|
||||
stats = {}
|
||||
for class_name in classes_to_backup:
|
||||
objects = get_all_objects(class_name, include_vector=include_vectors)
|
||||
backup_data["collections"][class_name] = objects
|
||||
stats[class_name] = len(objects)
|
||||
|
||||
# Sauvegarder
|
||||
print(f"\n[3/3] Sauvegarde dans {output_path}...")
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
json.dump(backup_data, f, indent=2, ensure_ascii=False)
|
||||
|
||||
file_size = output_path.stat().st_size / (1024 * 1024) # MB
|
||||
|
||||
# Résumé
|
||||
print("\n" + "=" * 60)
|
||||
print("BACKUP TERMINÉ")
|
||||
print("=" * 60)
|
||||
print(f"Fichier: {output_path}")
|
||||
print(f"Taille: {file_size:.2f} MB")
|
||||
print("\nStatistiques par collection:")
|
||||
total = 0
|
||||
for class_name, count in stats.items():
|
||||
print(f" - {class_name}: {count} objets")
|
||||
total += count
|
||||
print(f"\nTotal: {total} objets")
|
||||
|
||||
return stats
|
||||
|
||||
|
||||
def main():
|
||||
global WEAVIATE_URL # Declare global at start of function
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Backup complet de Weaviate",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Exemples:
|
||||
python weaviate_backup.py
|
||||
python weaviate_backup.py --output backup.json
|
||||
python weaviate_backup.py --collections Thought,Conversation
|
||||
python weaviate_backup.py --no-vectors
|
||||
"""
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--output", "-o",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="Chemin du fichier de sortie (defaut: exports/backup_YYYYMMDD_HHMMSS.json)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--collections", "-c",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Collections a exporter (separees par des virgules)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--no-vectors",
|
||||
action="store_true",
|
||||
help="Ne pas inclure les vecteurs (plus rapide, fichier plus petit)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--url",
|
||||
type=str,
|
||||
default=None,
|
||||
help=f"URL Weaviate (defaut: {WEAVIATE_URL})"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# URL Weaviate
|
||||
if args.url:
|
||||
WEAVIATE_URL = args.url
|
||||
|
||||
# Chemin de sortie
|
||||
if args.output:
|
||||
output_path = args.output
|
||||
else:
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
output_path = DEFAULT_OUTPUT_DIR / f"backup_{timestamp}.json"
|
||||
|
||||
# Collections
|
||||
collections = None
|
||||
if args.collections:
|
||||
collections = [c.strip() for c in args.collections.split(",")]
|
||||
|
||||
# Exécuter le backup
|
||||
backup_weaviate(
|
||||
output_path=output_path,
|
||||
collections=collections,
|
||||
include_vectors=not args.no_vectors
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user