Add ikario_processual with David profile and embedding script

- david_profile_declared.json: David's declared profile values from questionnaire
- scripts/embed_david.py: Python script to generate embeddings using BGE-M3 model
- questionnaire_david.md: Questionnaire template for profile values

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-31 16:56:41 +01:00
parent 9e657cbf29
commit 21f5676c7b
18 changed files with 5463 additions and 0 deletions

View File

@@ -0,0 +1,250 @@
#!/usr/bin/env python3
"""
Backup complet de toutes les collections Weaviate.
Usage:
python weaviate_backup.py
python weaviate_backup.py --output exports/backup_20260131.json
python weaviate_backup.py --collections Thought,Conversation
Ce script exporte:
- Le schéma complet (classes et propriétés)
- Tous les objets de chaque collection
- Les vecteurs (embeddings) de chaque objet
"""
import argparse
import json
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Any
import requests
# Configuration par défaut
WEAVIATE_URL = os.getenv("WEAVIATE_URL", "http://localhost:8080")
DEFAULT_OUTPUT_DIR = Path(__file__).parent.parent.parent / "exports"
def check_weaviate_ready() -> bool:
"""Vérifie que Weaviate est accessible."""
try:
response = requests.get(f"{WEAVIATE_URL}/v1/.well-known/ready", timeout=5)
return response.status_code == 200
except requests.RequestException:
return False
def get_schema() -> dict:
"""Récupère le schéma complet de Weaviate."""
response = requests.get(f"{WEAVIATE_URL}/v1/schema")
response.raise_for_status()
return response.json()
def get_all_objects(class_name: str, include_vector: bool = True) -> list[dict]:
"""
Récupère tous les objets d'une classe avec pagination.
Args:
class_name: Nom de la collection
include_vector: Inclure les vecteurs (embeddings)
Returns:
Liste de tous les objets
"""
objects = []
limit = 100
offset = 0
include_param = "vector" if include_vector else ""
while True:
url = f"{WEAVIATE_URL}/v1/objects?class={class_name}&limit={limit}&offset={offset}"
if include_param:
url += f"&include={include_param}"
response = requests.get(url)
if response.status_code != 200:
print(f" Erreur lors de la récupération de {class_name}: {response.status_code}")
break
data = response.json()
batch = data.get("objects", [])
if not batch:
break
objects.extend(batch)
offset += limit
# Progress
print(f" {class_name}: {len(objects)} objets récupérés...", end="\r")
print(f" {class_name}: {len(objects)} objets au total")
return objects
def backup_weaviate(
output_path: Path,
collections: list[str] | None = None,
include_vectors: bool = True
) -> dict:
"""
Effectue un backup complet de Weaviate.
Args:
output_path: Chemin du fichier de sortie
collections: Liste des collections à exporter (None = toutes)
include_vectors: Inclure les vecteurs
Returns:
Statistiques du backup
"""
print("=" * 60)
print("BACKUP WEAVIATE")
print("=" * 60)
print(f"URL: {WEAVIATE_URL}")
print(f"Output: {output_path}")
print(f"Include vectors: {include_vectors}")
print("-" * 60)
# Vérifier la connexion
if not check_weaviate_ready():
print("ERREUR: Weaviate n'est pas accessible")
print(f"Vérifiez que le serveur tourne sur {WEAVIATE_URL}")
sys.exit(1)
print("Weaviate connecte [OK]")
# Récupérer le schéma
print("\n[1/3] Récupération du schéma...")
schema = get_schema()
all_classes = [c["class"] for c in schema.get("classes", [])]
print(f" Classes trouvées: {', '.join(all_classes)}")
# Filtrer les collections si spécifié
if collections:
classes_to_backup = [c for c in all_classes if c in collections]
print(f" Collections sélectionnées: {', '.join(classes_to_backup)}")
else:
classes_to_backup = all_classes
# Récupérer les objets de chaque classe
print("\n[2/3] Récupération des objets...")
backup_data = {
"metadata": {
"timestamp": datetime.now().isoformat(),
"weaviate_url": WEAVIATE_URL,
"include_vectors": include_vectors,
"version": "1.0"
},
"schema": schema,
"collections": {}
}
stats = {}
for class_name in classes_to_backup:
objects = get_all_objects(class_name, include_vector=include_vectors)
backup_data["collections"][class_name] = objects
stats[class_name] = len(objects)
# Sauvegarder
print(f"\n[3/3] Sauvegarde dans {output_path}...")
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
json.dump(backup_data, f, indent=2, ensure_ascii=False)
file_size = output_path.stat().st_size / (1024 * 1024) # MB
# Résumé
print("\n" + "=" * 60)
print("BACKUP TERMINÉ")
print("=" * 60)
print(f"Fichier: {output_path}")
print(f"Taille: {file_size:.2f} MB")
print("\nStatistiques par collection:")
total = 0
for class_name, count in stats.items():
print(f" - {class_name}: {count} objets")
total += count
print(f"\nTotal: {total} objets")
return stats
def main():
global WEAVIATE_URL # Declare global at start of function
parser = argparse.ArgumentParser(
description="Backup complet de Weaviate",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Exemples:
python weaviate_backup.py
python weaviate_backup.py --output backup.json
python weaviate_backup.py --collections Thought,Conversation
python weaviate_backup.py --no-vectors
"""
)
parser.add_argument(
"--output", "-o",
type=Path,
default=None,
help="Chemin du fichier de sortie (defaut: exports/backup_YYYYMMDD_HHMMSS.json)"
)
parser.add_argument(
"--collections", "-c",
type=str,
default=None,
help="Collections a exporter (separees par des virgules)"
)
parser.add_argument(
"--no-vectors",
action="store_true",
help="Ne pas inclure les vecteurs (plus rapide, fichier plus petit)"
)
parser.add_argument(
"--url",
type=str,
default=None,
help=f"URL Weaviate (defaut: {WEAVIATE_URL})"
)
args = parser.parse_args()
# URL Weaviate
if args.url:
WEAVIATE_URL = args.url
# Chemin de sortie
if args.output:
output_path = args.output
else:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
output_path = DEFAULT_OUTPUT_DIR / f"backup_{timestamp}.json"
# Collections
collections = None
if args.collections:
collections = [c.strip() for c in args.collections.split(",")]
# Exécuter le backup
backup_weaviate(
output_path=output_path,
collections=collections,
include_vectors=not args.no_vectors
)
if __name__ == "__main__":
main()