feat: Add data quality verification & cleanup scripts
## Data Quality & Cleanup (Priorities 1-6)

Added comprehensive data quality verification and cleanup system:

**Scripts créés**:
- verify_data_quality.py: Analyse qualité complète œuvre par œuvre
- clean_duplicate_documents.py: Nettoyage doublons Documents
- populate_work_collection.py/clean.py: Peuplement Work collection
- fix_chunks_count.py: Correction chunksCount incohérents
- manage_orphan_chunks.py: Gestion chunks orphelins (3 options)
- clean_orphan_works.py: Suppression Works sans chunks
- add_missing_work.py: Création Work manquant
- generate_schema_stats.py: Génération stats auto
- migrate_add_work_collection.py: Migration sûre Work collection

**Documentation**:
- WEAVIATE_GUIDE_COMPLET.md: Guide consolidé complet (600+ lignes)
- WEAVIATE_SCHEMA.md: Référence schéma rapide
- NETTOYAGE_COMPLETE_RAPPORT.md: Rapport nettoyage session
- ANALYSE_QUALITE_DONNEES.md: Analyse qualité initiale
- rapport_qualite_donnees.txt: Output brut vérification

**Résultats nettoyage**:
- Documents: 16 → 9 (7 doublons supprimés)
- Works: 0 → 9 (peuplé + nettoyé)
- Chunks: 5,404 → 5,230 (174 orphelins supprimés)
- chunksCount: Corrigés (231 → 5,230 déclaré = réel)
- Cohérence parfaite: 9 Works = 9 Documents = 9 œuvres

**Modifications code**:
- schema.py: Ajout Work collection avec vectorisation
- utils/weaviate_ingest.py: Support Work ingestion
- utils/word_pipeline.py: Désactivation concepts (problème .lower())
- utils/word_toc_extractor.py: Métadonnées Word correctes
- .gitignore: Exclusion fichiers temporaires (*.wav, output/*, NUL)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
328
generations/library_rag/clean_orphan_works.py
Normal file
328
generations/library_rag/clean_orphan_works.py
Normal file
@@ -0,0 +1,328 @@
|
||||
#!/usr/bin/env python3
"""Delete orphan Works (Works with no associated chunks).

A Work is an orphan when no chunk references that work in its nested object.

Usage:
    # Dry-run (prints what would be deleted, without changing anything)
    python clean_orphan_works.py

    # Real execution (deletes the orphan Works)
    python clean_orphan_works.py --execute
"""

import sys
import argparse
from typing import Any, Dict, List, Set, Tuple

import weaviate
|
||||
|
||||
|
||||
def get_works_from_chunks(client: weaviate.WeaviateClient) -> Set[Tuple[str, str]]:
    """Extract the unique works referenced by the chunks.

    Args:
        client: Connected Weaviate client.

    Returns:
        Set of lower-cased (title, author) tuples for works that have chunks.
    """
    print("📊 Récupération de tous les chunks...")

    chunk_collection = client.collections.get("Chunk")
    # Iterate over the whole collection instead of fetch_objects(limit=10000):
    # a hard-coded limit silently truncates past 10k chunks, which would make
    # works that still own chunks look orphaned — and get them deleted.
    chunk_objects = list(chunk_collection.iterator())

    print(f" ✓ {len(chunk_objects)} chunks récupérés")
    print()

    # Collect the unique works, normalized for case-insensitive comparison.
    works_with_chunks: Set[Tuple[str, str]] = set()

    for chunk_obj in chunk_objects:
        props = chunk_obj.properties
        work = props.get("work")
        # Only nested "work" objects carrying both title and author count.
        if isinstance(work, dict):
            title = work.get("title")
            author = work.get("author")
            if title and author:
                # Lowercase so the comparison ignores case differences.
                works_with_chunks.add((title.lower(), author.lower()))

    print(f"📚 {len(works_with_chunks)} œuvres uniques dans les chunks")
    print()

    return works_with_chunks
|
||||
|
||||
|
||||
def identify_orphan_works(
    client: weaviate.WeaviateClient,
    works_with_chunks: Set[Tuple[str, str]],
) -> List[Any]:
    """Identify the orphan Works (Works with no chunks).

    Args:
        client: Connected Weaviate client.
        works_with_chunks: Set of lower-cased (title, author) that have chunks.

    Returns:
        List of orphan Work objects.
    """
    print("📊 Récupération de tous les Works...")

    work_collection = client.collections.get("Work")
    # Iterate over the whole collection instead of fetch_objects(limit=1000):
    # the hard-coded limit would silently hide orphans past the first 1000.
    work_objects = list(work_collection.iterator())

    print(f" ✓ {len(work_objects)} Works récupérés")
    print()

    orphan_works: List[Any] = []

    for work_obj in work_objects:
        props = work_obj.properties
        title = props.get("title")
        author = props.get("author")

        # NOTE(review): Works missing a title or author are never flagged as
        # orphans (they are silently kept) — confirm this is intentional.
        if title and author:
            # Lowercase to match the normalization used for the chunk side.
            if (title.lower(), author.lower()) not in works_with_chunks:
                orphan_works.append(work_obj)

    print(f"🔍 {len(orphan_works)} Works orphelins détectés")
    print()

    return orphan_works
|
||||
|
||||
|
||||
def display_orphans_report(orphan_works: List[Any]) -> None:
    """Print a human-readable report of the orphan Works.

    Args:
        orphan_works: List of orphan Work objects.
    """
    if not orphan_works:
        print("✅ Aucun Work orphelin détecté !")
        print()
        return

    separator = "=" * 80
    print(separator)
    print("WORKS ORPHELINS DÉTECTÉS")
    print(separator)
    print()

    total = len(orphan_works)
    print(f"📌 {total} Works sans chunks associés")
    print()

    for index, work_obj in enumerate(orphan_works, 1):
        props = work_obj.properties
        print(f"[{index}/{total}] {props.get('title', 'N/A')}")
        print("─" * 80)
        print(f" Auteur : {props.get('author', 'N/A')}")

        year = props.get("year")
        if year:
            # Negative years encode BCE ("av. J.-C.") dates.
            year_label = f"{abs(year)} av. J.-C." if year < 0 else f"{year}"
            print(f" Année : {year_label}")

        language = props.get("language")
        if language:
            print(f" Langue : {language}")

        genre = props.get("genre")
        if genre:
            print(f" Genre : {genre}")

        print(f" UUID : {work_obj.uuid}")
        print()

    print(separator)
    print()
|
||||
|
||||
|
||||
def delete_orphan_works(
    client: weaviate.WeaviateClient,
    orphan_works: List[Any],
    dry_run: bool = True,
) -> Dict[str, int]:
    """Delete (or simulate deleting) the given orphan Works.

    Args:
        client: Connected Weaviate client.
        orphan_works: List of orphan Work objects.
        dry_run: If True, only simulate (nothing is actually deleted).

    Returns:
        Dict with statistics: deleted, errors.
    """
    stats: Dict[str, int] = {"deleted": 0, "errors": 0}

    if not orphan_works:
        print("✅ Aucun Work à supprimer (pas d'orphelins)")
        return stats

    mode_banner = (
        "🔍 MODE DRY-RUN (simulation, aucune suppression réelle)"
        if dry_run
        else "⚠️ MODE EXÉCUTION (suppression réelle)"
    )
    print(mode_banner)
    print("=" * 80)
    print()

    work_collection = client.collections.get("Work")

    for orphan in orphan_works:
        properties = orphan.properties
        work_title = properties.get("title", "N/A")
        work_author = properties.get("author", "N/A")

        print(f"Traitement de '{work_title}' par {work_author}...")

        if dry_run:
            # Simulation: count what would be deleted, touch nothing.
            print(f" 🔍 [DRY-RUN] Supprimerait UUID {orphan.uuid}")
            stats["deleted"] += 1
        else:
            # Real deletion: keep going on per-object failures, tally errors.
            try:
                work_collection.data.delete_by_id(orphan.uuid)
                print(f" ❌ Supprimé UUID {orphan.uuid}")
                stats["deleted"] += 1
            except Exception as error:
                print(f" ⚠️ Erreur suppression UUID {orphan.uuid}: {error}")
                stats["errors"] += 1

        print()

    summary_line = "=" * 80
    print(summary_line)
    print("RÉSUMÉ")
    print(summary_line)
    print(f" Works supprimés : {stats['deleted']}")
    print(f" Erreurs : {stats['errors']}")
    print()

    return stats
|
||||
|
||||
|
||||
def verify_cleanup(client: weaviate.WeaviateClient) -> None:
    """Re-run the orphan detection and report post-cleanup consistency.

    Args:
        client: Connected Weaviate client.
    """
    rule = "=" * 80
    print(rule)
    print("VÉRIFICATION POST-NETTOYAGE")
    print(rule)
    print()

    # Re-run the same detection used for the cleanup itself.
    works_with_chunks = get_works_from_chunks(client)
    orphan_works = identify_orphan_works(client, works_with_chunks)

    if not orphan_works:
        print("✅ Aucun Work orphelin restant !")
        print()

    # Final tallies: total Works vs distinct works referenced by chunks.
    work_collection = client.collections.get("Work")
    aggregate_result = work_collection.aggregate.over_all(total_count=True)

    print(f"📊 Works totaux : {aggregate_result.total_count}")
    print(f"📊 Œuvres avec chunks : {len(works_with_chunks)}")
    print()

    if aggregate_result.total_count == len(works_with_chunks):
        print("✅ Cohérence parfaite : 1 Work = 1 œuvre avec chunks")
        print()
    else:
        print(f"⚠️ {len(orphan_works)} Works orphelins persistent")
        print()

    print(rule)
    print()
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: report orphan Works, then delete them on --execute."""
    arg_parser = argparse.ArgumentParser(
        description="Supprimer les Works orphelins (sans chunks associés)"
    )
    arg_parser.add_argument(
        "--execute",
        action="store_true",
        help="Exécuter la suppression (par défaut: dry-run)",
    )
    args = arg_parser.parse_args()

    # The Windows console may default to a legacy code page; switch stdout to
    # UTF-8 so the emoji/accented output below prints correctly.
    if sys.platform == "win32" and hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')

    banner = "=" * 80
    print(banner)
    print("NETTOYAGE DES WORKS ORPHELINS")
    print(banner)
    print()

    client = weaviate.connect_to_local(
        host="localhost",
        port=8080,
        grpc_port=50051,
    )

    try:
        if not client.is_ready():
            print("❌ Weaviate is not ready. Ensure docker-compose is running.")
            sys.exit(1)

        print("✓ Weaviate is ready")
        print()

        # Step 1: works that still own chunks.
        works_with_chunks = get_works_from_chunks(client)

        # Step 2: Works matching none of those works.
        orphan_works = identify_orphan_works(client, works_with_chunks)

        # Step 3: human-readable report.
        display_orphans_report(orphan_works)

        if not orphan_works:
            print("✅ Aucune action nécessaire (pas d'orphelins)")
            sys.exit(0)

        # Step 4: real deletions require an explicit confirmation first.
        if args.execute:
            print(f"⚠️ ATTENTION : {len(orphan_works)} Works vont être supprimés !")
            print()
            answer = input("Continuer ? (oui/non) : ").strip().lower()
            if answer not in ["oui", "yes", "o", "y"]:
                print("❌ Annulé par l'utilisateur.")
                sys.exit(0)
            print()

        stats = delete_orphan_works(client, orphan_works, dry_run=not args.execute)

        # Step 5: re-check consistency only after an actual deletion.
        if args.execute and stats["deleted"] > 0:
            verify_cleanup(client)
        else:
            print(banner)
            print("💡 NEXT STEP")
            print(banner)
            print()
            print("Pour exécuter le nettoyage, lancez :")
            print(" python clean_orphan_works.py --execute")
            print()

    finally:
        # Always release the client connection, even on sys.exit paths.
        client.close()
|
||||
|
||||
|
||||
# Script entry-point guard: run the CLI only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user