Add ikario_processual with David profile and embedding script
- david_profile_declared.json: David's declared profile values from questionnaire - scripts/embed_david.py: Python script to generate embeddings using BGE-M3 model - questionnaire_david.md: Questionnaire template for profile values Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
198
ikario_processual/scripts/phase1_state_vector.py
Normal file
198
ikario_processual/scripts/phase1_state_vector.py
Normal file
@@ -0,0 +1,198 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Phase 1 : Creation de la collection StateVector et de S(0).
|
||||
|
||||
Ce script:
|
||||
1. Cree la collection StateVector dans Weaviate
|
||||
2. Recupere et filtre les pensees (exclut les tests)
|
||||
3. Recupere et filtre les messages d'Ikario (assistant uniquement)
|
||||
4. Calcule l'embedding agrege avec BGE-M3
|
||||
5. Cree l'etat initial S(0)
|
||||
|
||||
Usage:
|
||||
python phase1_state_vector.py
|
||||
python phase1_state_vector.py --dry-run
|
||||
python phase1_state_vector.py --reset # Supprime et recree
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Ajouter le parent au path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from state_vector import (
|
||||
check_weaviate_ready,
|
||||
get_existing_classes,
|
||||
create_state_vector_collection,
|
||||
delete_state_vector_collection,
|
||||
get_all_thoughts,
|
||||
get_all_messages,
|
||||
filter_thoughts,
|
||||
filter_assistant_messages,
|
||||
compute_aggregate_embedding,
|
||||
create_initial_state,
|
||||
get_current_state_id,
|
||||
get_state_vector,
|
||||
)
|
||||
|
||||
|
||||
def print_section(title: str):
|
||||
print("\n" + "=" * 60)
|
||||
print(title)
|
||||
print("=" * 60)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Phase 1: Creation StateVector et S(0)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
help="Simuler sans creer"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--reset",
|
||||
action="store_true",
|
||||
help="Supprimer et recreer la collection"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
print_section("PHASE 1 : STATEVECTOR ET S(0)")
|
||||
|
||||
# 1. Verifier Weaviate
|
||||
print("\n[1/6] Verification Weaviate...")
|
||||
if not check_weaviate_ready():
|
||||
print("ERREUR: Weaviate non accessible")
|
||||
sys.exit(1)
|
||||
print(" Weaviate [OK]")
|
||||
|
||||
# 2. Gerer la collection StateVector
|
||||
print("\n[2/6] Collection StateVector...")
|
||||
existing = get_existing_classes()
|
||||
|
||||
if "StateVector" in existing:
|
||||
if args.reset:
|
||||
print(" Suppression de la collection existante...")
|
||||
if not args.dry_run:
|
||||
delete_state_vector_collection()
|
||||
print(" Collection supprimee")
|
||||
else:
|
||||
print(" [DRY-RUN] Suppression simulee")
|
||||
else:
|
||||
# Verifier si S(0) existe deja
|
||||
current_id = get_current_state_id()
|
||||
if current_id >= 0:
|
||||
print(f" Collection existe avec {current_id + 1} etat(s)")
|
||||
print(" Utilisez --reset pour reinitialiser")
|
||||
sys.exit(0)
|
||||
|
||||
# Creer la collection
|
||||
if args.dry_run:
|
||||
print(" [DRY-RUN] Creation collection simulee")
|
||||
else:
|
||||
if "StateVector" not in get_existing_classes():
|
||||
create_state_vector_collection()
|
||||
|
||||
# 3. Recuperer et filtrer les pensees
|
||||
print("\n[3/6] Recuperation des pensees...")
|
||||
all_thoughts = get_all_thoughts()
|
||||
print(f" Total pensees: {len(all_thoughts)}")
|
||||
|
||||
filtered_thoughts = filter_thoughts(all_thoughts)
|
||||
excluded = len(all_thoughts) - len(filtered_thoughts)
|
||||
print(f" Pensees filtrees: {len(filtered_thoughts)} (exclues: {excluded})")
|
||||
|
||||
# Afficher quelques exemples de pensees gardees
|
||||
if filtered_thoughts:
|
||||
print("\n Exemples de pensees gardees:")
|
||||
for t in filtered_thoughts[:3]:
|
||||
content = t.get("properties", {}).get("content", "")[:80]
|
||||
print(f" - {content}...")
|
||||
|
||||
# 4. Recuperer et filtrer les messages
|
||||
print("\n[4/6] Recuperation des messages...")
|
||||
all_messages = get_all_messages()
|
||||
print(f" Total messages: {len(all_messages)}")
|
||||
|
||||
filtered_messages = filter_assistant_messages(all_messages)
|
||||
excluded = len(all_messages) - len(filtered_messages)
|
||||
print(f" Messages Ikario: {len(filtered_messages)} (exclues: {excluded})")
|
||||
|
||||
# Afficher quelques exemples
|
||||
if filtered_messages:
|
||||
print("\n Exemples de messages Ikario:")
|
||||
for m in filtered_messages[:3]:
|
||||
content = m.get("properties", {}).get("content", "")[:80]
|
||||
print(f" - {content}...")
|
||||
|
||||
# 5. Calculer l'embedding agrege
|
||||
print("\n[5/6] Calcul de l'embedding agrege...")
|
||||
|
||||
if args.dry_run:
|
||||
print(" [DRY-RUN] Embedding simule (1024 dims)")
|
||||
embedding = None
|
||||
else:
|
||||
# Charger le modele BGE-M3
|
||||
print(" Chargement du modele BGE-M3...")
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
model = SentenceTransformer('BAAI/bge-m3')
|
||||
print(" Modele charge [OK]")
|
||||
except ImportError:
|
||||
print("ERREUR: sentence-transformers non installe")
|
||||
print(" pip install sentence-transformers")
|
||||
sys.exit(1)
|
||||
|
||||
# Calculer l'embedding
|
||||
print(" Calcul de l'embedding agrege...")
|
||||
embedding = compute_aggregate_embedding(
|
||||
filtered_thoughts,
|
||||
filtered_messages,
|
||||
model
|
||||
)
|
||||
print(f" Embedding calcule: {embedding.shape} (norme: {embedding.sum():.4f})")
|
||||
|
||||
# 6. Creer S(0)
|
||||
print("\n[6/6] Creation de S(0)...")
|
||||
|
||||
if args.dry_run:
|
||||
print(" [DRY-RUN] S(0) simule")
|
||||
print(f" - {len(filtered_thoughts)} pensees")
|
||||
print(f" - {len(filtered_messages)} messages")
|
||||
else:
|
||||
s0 = create_initial_state(
|
||||
filtered_thoughts,
|
||||
filtered_messages,
|
||||
embedding
|
||||
)
|
||||
print(f" S(0) cree avec succes!")
|
||||
print(f" - ID: {s0.get('id', 'N/A')}")
|
||||
print(f" - Pensees sources: {s0['source_thoughts_count']}")
|
||||
print(f" - Messages sources: {s0['source_messages_count']}")
|
||||
|
||||
# Resume
|
||||
print_section("PHASE 1 TERMINEE")
|
||||
|
||||
if args.dry_run:
|
||||
print("\n[DRY-RUN] Aucune modification effectuee")
|
||||
else:
|
||||
print("\nResultat:")
|
||||
print(f" - Collection StateVector creee")
|
||||
print(f" - S(0) cree a partir de:")
|
||||
print(f" {len(filtered_thoughts)} pensees")
|
||||
print(f" {len(filtered_messages)} messages")
|
||||
|
||||
print("\nTests de validation:")
|
||||
print(" curl -s http://localhost:8080/v1/schema | jq '.classes[] | select(.class == \"StateVector\")'")
|
||||
print(" curl -s 'http://localhost:8080/v1/objects?class=StateVector&limit=1' | jq '.objects[0].properties'")
|
||||
|
||||
print("\nProchaine etape:")
|
||||
print(" python scripts/phase2_projection_directions.py")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user