Add ikario_processual with David profile and embedding script

- david_profile_declared.json: David's declared profile values from questionnaire - scripts/embed_david.py: Python script to generate embeddings using BGE-M3 model - questionnaire_david.md: Questionnaire template for profile values Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-31 16:56:41 +01:00
parent 9e657cbf29
commit 21f5676c7b
18 changed files with 5463 additions and 0 deletions
--- a/ikario_processual/tests/test_phase0_backup.py
+++ b/ikario_processual/tests/test_phase0_backup.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+"""
+Tests pour la Phase 0: Backup et restauration Weaviate.
+
+Usage:
+    pytest tests/test_phase0_backup.py -v
+    pytest tests/test_phase0_backup.py -v -k test_backup
+"""
+
+import json
+import os
+import tempfile
+from pathlib import Path
+
+import pytest
+import requests
+
+# Configuration
+WEAVIATE_URL = os.getenv("WEAVIATE_URL", "http://localhost:8080")
+
+
+def weaviate_is_available() -> bool:
+    """Vérifie si Weaviate est accessible."""
+    try:
+        response = requests.get(f"{WEAVIATE_URL}/v1/.well-known/ready", timeout=5)
+        return response.status_code == 200
+    except requests.RequestException:
+        return False
+
+
+# Skip tous les tests si Weaviate n'est pas disponible
+pytestmark = pytest.mark.skipif(
+    not weaviate_is_available(),
+    reason=f"Weaviate non disponible sur {WEAVIATE_URL}"
+)
+
+
+class TestWeaviateConnection:
+    """Tests de connexion à Weaviate."""
+
+    def test_weaviate_ready(self):
+        """Weaviate doit être accessible."""
+        response = requests.get(f"{WEAVIATE_URL}/v1/.well-known/ready")
+        assert response.status_code == 200
+
+    def test_weaviate_schema_accessible(self):
+        """Le schéma doit être récupérable."""
+        response = requests.get(f"{WEAVIATE_URL}/v1/schema")
+        assert response.status_code == 200
+        data = response.json()
+        assert "classes" in data
+
+    def test_weaviate_has_collections(self):
+        """Au moins une collection doit exister (Thought, Conversation, etc.)."""
+        response = requests.get(f"{WEAVIATE_URL}/v1/schema")
+        data = response.json()
+        classes = [c["class"] for c in data.get("classes", [])]
+
+        # Au moins une des collections attendues
+        expected = ["Thought", "Conversation", "Message", "Chunk", "Work", "Summary"]
+        found = [c for c in classes if c in expected]
+
+        assert len(found) > 0, f"Aucune collection trouvée parmi {expected}. Classes existantes: {classes}"
+
+
+class TestBackupScript:
+    """Tests du script de backup."""
+
+    def test_backup_creates_file(self):
+        """Le backup doit créer un fichier JSON."""
+        # Import dynamique pour éviter les erreurs si requests manque
+        import sys
+        sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
+
+        from weaviate_backup import backup_weaviate
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_path = Path(tmpdir) / "test_backup.json"
+
+            stats = backup_weaviate(
+                output_path=output_path,
+                collections=None,  # Toutes
+                include_vectors=False  # Plus rapide pour le test
+            )
+
+            assert output_path.exists(), "Le fichier de backup n'a pas été créé"
+            assert output_path.stat().st_size > 0, "Le fichier de backup est vide"
+
+    def test_backup_structure(self):
+        """Le backup doit avoir la bonne structure."""
+        import sys
+        sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
+
+        from weaviate_backup import backup_weaviate
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_path = Path(tmpdir) / "test_backup.json"
+
+            backup_weaviate(
+                output_path=output_path,
+                collections=["Thought"],  # Une seule collection pour le test
+                include_vectors=False
+            )
+
+            with open(output_path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+
+            # Vérifier la structure
+            assert "metadata" in data
+            assert "schema" in data
+            assert "collections" in data
+
+            # Vérifier les métadonnées
+            assert "timestamp" in data["metadata"]
+            assert "weaviate_url" in data["metadata"]
+            assert "version" in data["metadata"]
+
+    def test_backup_with_vectors(self):
+        """Le backup avec vecteurs doit inclure les embeddings."""
+        import sys
+        sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
+
+        from weaviate_backup import backup_weaviate
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_path = Path(tmpdir) / "test_backup_vectors.json"
+
+            backup_weaviate(
+                output_path=output_path,
+                collections=["Thought"],
+                include_vectors=True
+            )
+
+            with open(output_path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+
+            # Vérifier qu'au moins un objet a un vecteur
+            thoughts = data.get("collections", {}).get("Thought", [])
+            if thoughts:
+                # Au moins un objet devrait avoir un vecteur
+                has_vector = any("vector" in obj for obj in thoughts)
+                assert has_vector, "Aucun objet n'a de vecteur alors que include_vectors=True"
+
+
+class TestRestoreScript:
+    """Tests du script de restauration."""
+
+    def test_restore_dry_run(self):
+        """Le dry-run ne doit pas modifier les données."""
+        import sys
+        sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
+
+        from weaviate_backup import backup_weaviate
+        from weaviate_restore import restore_weaviate, get_existing_classes
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # D'abord, faire un backup
+            backup_path = Path(tmpdir) / "test_backup.json"
+            backup_weaviate(
+                output_path=backup_path,
+                collections=["Thought"],
+                include_vectors=False
+            )
+
+            # Compter les objets avant
+            response = requests.get(f"{WEAVIATE_URL}/v1/objects?class=Thought&limit=1")
+            count_before = len(response.json().get("objects", []))
+
+            # Restaurer en dry-run
+            stats = restore_weaviate(
+                backup_path=backup_path,
+                collections=["Thought"],
+                clear_existing=False,
+                dry_run=True
+            )
+
+            # Compter après
+            response = requests.get(f"{WEAVIATE_URL}/v1/objects?class=Thought&limit=1")
+            count_after = len(response.json().get("objects", []))
+
+            # Pas de changement
+            assert count_before == count_after, "Le dry-run a modifié les données!"
+
+
+class TestBackupRestoreCycle:
+    """Tests du cycle complet backup → restore."""
+
+    def test_backup_restore_roundtrip(self):
+        """
+        Test complet: backup → restore → vérification.
+
+        Ce test utilise une collection temporaire pour ne pas
+        affecter les données existantes.
+        """
+        # Ce test nécessiterait de créer une collection temporaire
+        # Pour l'instant, on vérifie juste que les scripts fonctionnent
+        pass
+
+
+def test_exports_directory_exists():
+    """Le dossier exports doit exister ou être créable."""
+    exports_dir = Path(__file__).parent.parent.parent / "exports"
+    exports_dir.mkdir(parents=True, exist_ok=True)
+    assert exports_dir.exists()
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])