feat: Add data quality verification & cleanup scripts

## Data Quality & Cleanup (Priorities 1-6) Added comprehensive data quality verification and cleanup system: **Scripts créés**: - verify_data_quality.py: Analyse qualité complète œuvre par œuvre - clean_duplicate_documents.py: Nettoyage doublons Documents - populate_work_collection.py/clean.py: Peuplement Work collection - fix_chunks_count.py: Correction chunksCount incohérents - manage_orphan_chunks.py: Gestion chunks orphelins (3 options) - clean_orphan_works.py: Suppression Works sans chunks - add_missing_work.py: Création Work manquant - generate_schema_stats.py: Génération stats auto - migrate_add_work_collection.py: Migration sûre Work collection **Documentation**: - WEAVIATE_GUIDE_COMPLET.md: Guide consolidé complet (600+ lignes) - WEAVIATE_SCHEMA.md: Référence schéma rapide - NETTOYAGE_COMPLETE_RAPPORT.md: Rapport nettoyage session - ANALYSE_QUALITE_DONNEES.md: Analyse qualité initiale - rapport_qualite_donnees.txt: Output brut vérification **Résultats nettoyage**: - Documents: 16 → 9 (7 doublons supprimés) - Works: 0 → 9 (peuplé + nettoyé) - Chunks: 5,404 → 5,230 (174 orphelins supprimés) - chunksCount: Corrigés (231 → 5,230 déclaré = réel) - Cohérence parfaite: 9 Works = 9 Documents = 9 œuvres **Modifications code**: - schema.py: Ajout Work collection avec vectorisation - utils/weaviate_ingest.py: Support Work ingestion - utils/word_pipeline.py: Désactivation concepts (problème .lower()) - utils/word_toc_extractor.py: Métadonnées Word correctes - .gitignore: Exclusion fichiers temporaires (*.wav, output/*, NUL) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-01 11:57:26 +01:00
parent 845ffb4b06
commit 04ee3f9e39
26 changed files with 6945 additions and 16 deletions
--- a/generations/library_rag/migrate_add_work_collection.py
+++ b/generations/library_rag/migrate_add_work_collection.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+"""Migration script: Add Work collection with vectorization.
+
+This script safely adds the Work collection to the existing Weaviate schema
+WITHOUT deleting the existing Chunk, Document, and Summary collections.
+
+Migration Steps:
+    1. Connect to Weaviate
+    2. Check if Work collection already exists
+    3. If exists, delete ONLY Work collection (any objects stored in Work are dropped)
+    4. Create new Work collection with vectorization enabled
+    5. Verify all 4 collections exist
+    6. Print the Work configuration (populating Work from Chunk metadata is a separate step, not done here)
+
+Usage:
+    python migrate_add_work_collection.py
+
+Safety:
+    - Does NOT touch Chunk collection (5400+ chunks preserved)
+    - Does NOT touch Document collection
+    - Does NOT touch Summary collection
+    - Only creates/recreates Work collection
+"""
+
+import sys
+from typing import Set
+
+import weaviate
+import weaviate.classes.config as wvc
+
+
+def create_work_collection_vectorized(client: weaviate.WeaviateClient) -> None:
+    """Create the Work collection WITH vectorization enabled.
+
+    This is the new version that enables semantic search on work titles
+    and author names.
+
+    Args:
+        client: Connected Weaviate client.
+    """
+    client.collections.create(
+        name="Work",
+        description="A philosophical or scholarly work (e.g., Meno, Republic, Apology).",
+        # ✅ NEW: Enable vectorization for semantic search on titles/authors
+        vectorizer_config=wvc.Configure.Vectorizer.text2vec_transformers(
+            vectorize_collection_name=False,
+        ),
+        properties=[
+            wvc.Property(
+                name="title",
+                description="Title of the work.",
+                data_type=wvc.DataType.TEXT,
+                # ✅ VECTORIZED by default (semantic search enabled)
+            ),
+            wvc.Property(
+                name="author",
+                description="Author of the work.",
+                data_type=wvc.DataType.TEXT,
+                # ✅ VECTORIZED by default (semantic search enabled)
+            ),
+            wvc.Property(
+                name="originalTitle",
+                description="Original title in source language (optional).",
+                data_type=wvc.DataType.TEXT,
+                skip_vectorization=True,  # Metadata only
+            ),
+            wvc.Property(
+                name="year",
+                description="Year of composition or publication (negative for BCE).",
+                data_type=wvc.DataType.INT,
+                # INT is never vectorized
+            ),
+            wvc.Property(
+                name="language",
+                description="Original language (e.g., 'gr', 'la', 'fr').",
+                data_type=wvc.DataType.TEXT,
+                skip_vectorization=True,  # ISO code, no need to vectorize
+            ),
+            wvc.Property(
+                name="genre",
+                description="Genre or type (e.g., 'dialogue', 'treatise', 'commentary').",
+                data_type=wvc.DataType.TEXT,
+                skip_vectorization=True,  # Metadata only
+            ),
+        ],
+    )
+
+
+def migrate_work_collection(client: weaviate.WeaviateClient) -> None:
+    """Migrate Work collection by adding vectorization.
+
+    This function:
+    1. Checks if Work exists
+    2. Deletes ONLY Work if it exists
+    3. Creates new Work with vectorization
+    4. Leaves all other collections untouched
+
+    Args:
+        client: Connected Weaviate client.
+    """
+    print("\n" + "=" * 80)
+    print("MIGRATION: Ajouter vectorisation à Work")
+    print("=" * 80)
+
+    # Step 1: Check existing collections
+    print("\n[1/5] Vérification des collections existantes...")
+    collections = client.collections.list_all()
+    existing: Set[str] = set(collections.keys())
+    print(f"      Collections trouvées: {sorted(existing)}")
+
+    # Step 2: Delete ONLY Work if it exists
+    print("\n[2/5] Suppression de Work (si elle existe)...")
+    if "Work" in existing:
+        try:
+            client.collections.delete("Work")
+            print("      ✓ Work supprimée")
+        except Exception as e:
+            print(f"      ⚠ Erreur suppression Work: {e}")
+    else:
+        print("      ℹ Work n'existe pas encore")
+
+    # Step 3: Create new Work with vectorization
+    print("\n[3/5] Création de Work avec vectorisation...")
+    try:
+        create_work_collection_vectorized(client)
+        print("      ✓ Work créée (vectorisation activée)")
+    except Exception as e:
+        print(f"      ✗ Erreur création Work: {e}")
+        raise
+
+    # Step 4: Verify all 4 collections exist
+    print("\n[4/5] Vérification finale...")
+    collections = client.collections.list_all()
+    actual: Set[str] = set(collections.keys())
+    expected: Set[str] = {"Work", "Document", "Chunk", "Summary"}
+
+    if expected == actual:
+        print(f"      ✓ Toutes les collections présentes: {sorted(actual)}")
+    else:
+        missing: Set[str] = expected - actual
+        extra: Set[str] = actual - expected
+        if missing:
+            print(f"      ⚠ Collections manquantes: {missing}")
+        if extra:
+            print(f"      ℹ Collections supplémentaires: {extra}")
+
+    # Step 5: Display Work config
+    print("\n[5/5] Configuration de Work:")
+    print("─" * 80)
+    work_config = collections["Work"]
+    print(f"Description: {work_config.description}")
+
+    vectorizer_str: str = str(work_config.vectorizer)
+    if "text2vec" in vectorizer_str.lower():
+        print("Vectorizer:  text2vec-transformers ✅")
+    else:
+        print("Vectorizer:  none ❌")
+
+    print("\nPropriétés vectorisées:")
+    for prop in work_config.properties:
+        if prop.name in ["title", "author"]:
+            skip = "[skip_vec]" if (hasattr(prop, 'skip_vectorization') and prop.skip_vectorization) else "[VECTORIZED ✅]"
+            print(f"  • {prop.name:<20} {skip}")
+
+    print("\n" + "=" * 80)
+    print("MIGRATION TERMINÉE AVEC SUCCÈS!")
+    print("=" * 80)
+    print("\n✓ Work collection vectorisée")
+    print("✓ Chunk collection PRÉSERVÉE (aucune donnée perdue)")
+    print("✓ Document collection PRÉSERVÉE")
+    print("✓ Summary collection PRÉSERVÉE")
+    print("\n💡 Prochaine étape (optionnel):")
+    print("   Peupler Work en extrayant les œuvres uniques depuis Chunk.work")
+    print("=" * 80 + "\n")
+
+
+def main() -> None:
+    """Main entry point for migration script."""
+    # Fix encoding for Windows console
+    if sys.platform == "win32" and hasattr(sys.stdout, 'reconfigure'):
+        sys.stdout.reconfigure(encoding='utf-8')
+
+    # Connect to local Weaviate
+    client: weaviate.WeaviateClient = weaviate.connect_to_local(
+        host="localhost",
+        port=8080,
+        grpc_port=50051,
+    )
+
+    try:
+        migrate_work_collection(client)
+    finally:
+        client.close()
+        print("\n✓ Connexion fermée\n")
+
+
+# Run the migration only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()