feat: Add data quality verification & cleanup scripts
## Data Quality & Cleanup (Priorities 1-6)

Added comprehensive data quality verification and cleanup system:

**Scripts créés**:
- verify_data_quality.py: Analyse qualité complète œuvre par œuvre
- clean_duplicate_documents.py: Nettoyage doublons Documents
- populate_work_collection.py/clean.py: Peuplement Work collection
- fix_chunks_count.py: Correction chunksCount incohérents
- manage_orphan_chunks.py: Gestion chunks orphelins (3 options)
- clean_orphan_works.py: Suppression Works sans chunks
- add_missing_work.py: Création Work manquant
- generate_schema_stats.py: Génération stats auto
- migrate_add_work_collection.py: Migration sûre Work collection

**Documentation**:
- WEAVIATE_GUIDE_COMPLET.md: Guide consolidé complet (600+ lignes)
- WEAVIATE_SCHEMA.md: Référence schéma rapide
- NETTOYAGE_COMPLETE_RAPPORT.md: Rapport nettoyage session
- ANALYSE_QUALITE_DONNEES.md: Analyse qualité initiale
- rapport_qualite_donnees.txt: Output brut vérification

**Résultats nettoyage**:
- Documents: 16 → 9 (7 doublons supprimés)
- Works: 0 → 9 (peuplé + nettoyé)
- Chunks: 5,404 → 5,230 (174 orphelins supprimés)
- chunksCount: Corrigés (231 → 5,230 déclaré = réel)
- Cohérence parfaite: 9 Works = 9 Documents = 9 œuvres

**Modifications code**:
- schema.py: Ajout Work collection avec vectorisation
- utils/weaviate_ingest.py: Support Work ingestion
- utils/word_pipeline.py: Désactivation concepts (problème .lower())
- utils/word_toc_extractor.py: Métadonnées Word correctes
- .gitignore: Exclusion fichiers temporaires (*.wav, output/*, NUL)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
328
generations/library_rag/clean_orphan_works.py
Normal file
328
generations/library_rag/clean_orphan_works.py
Normal file
@@ -0,0 +1,328 @@
|
||||
#!/usr/bin/env python3
"""Delete orphan Works (Works with no associated chunks).

A Work is an orphan when no chunk references that work in its nested object.

Usage:
    # Dry-run (prints what would be deleted, without changing anything)
    python clean_orphan_works.py

    # Real execution (deletes the orphan Works)
    python clean_orphan_works.py --execute
"""

import sys
import argparse
from typing import Any, Dict, List, Set, Tuple

import weaviate
|
||||
|
||||
|
||||
def get_works_from_chunks(client: weaviate.WeaviateClient) -> Set[Tuple[str, str]]:
    """Extract the unique works referenced by the chunks.

    Args:
        client: Connected Weaviate client.

    Returns:
        Set of lower-cased (title, author) tuples for works that have chunks.
    """
    print("📊 Récupération de tous les chunks...")

    chunk_collection = client.collections.get("Chunk")
    # Iterate over the whole collection instead of fetch_objects(limit=10000):
    # a hard-coded limit silently truncates past 10k chunks, which would make
    # works that still own chunks look orphaned — and get them deleted.
    chunk_objects = list(chunk_collection.iterator())

    print(f" ✓ {len(chunk_objects)} chunks récupérés")
    print()

    # Collect the unique works, normalized for case-insensitive comparison.
    works_with_chunks: Set[Tuple[str, str]] = set()

    for chunk_obj in chunk_objects:
        props = chunk_obj.properties
        work = props.get("work")
        # Only nested "work" objects carrying both title and author count.
        if isinstance(work, dict):
            title = work.get("title")
            author = work.get("author")
            if title and author:
                # Lowercase so the comparison ignores case differences.
                works_with_chunks.add((title.lower(), author.lower()))

    print(f"📚 {len(works_with_chunks)} œuvres uniques dans les chunks")
    print()

    return works_with_chunks
|
||||
|
||||
|
||||
def identify_orphan_works(
    client: weaviate.WeaviateClient,
    works_with_chunks: Set[Tuple[str, str]],
) -> List[Any]:
    """Identify the orphan Works (Works with no chunks).

    Args:
        client: Connected Weaviate client.
        works_with_chunks: Set of lower-cased (title, author) that have chunks.

    Returns:
        List of orphan Work objects.
    """
    print("📊 Récupération de tous les Works...")

    work_collection = client.collections.get("Work")
    # Iterate over the whole collection instead of fetch_objects(limit=1000):
    # the hard-coded limit would silently hide orphans past the first 1000.
    work_objects = list(work_collection.iterator())

    print(f" ✓ {len(work_objects)} Works récupérés")
    print()

    orphan_works: List[Any] = []

    for work_obj in work_objects:
        props = work_obj.properties
        title = props.get("title")
        author = props.get("author")

        # NOTE(review): Works missing a title or author are never flagged as
        # orphans (they are silently kept) — confirm this is intentional.
        if title and author:
            # Lowercase to match the normalization used for the chunk side.
            if (title.lower(), author.lower()) not in works_with_chunks:
                orphan_works.append(work_obj)

    print(f"🔍 {len(orphan_works)} Works orphelins détectés")
    print()

    return orphan_works
|
||||
|
||||
|
||||
def display_orphans_report(orphan_works: List[Any]) -> None:
    """Print a human-readable report of the orphan Works.

    Args:
        orphan_works: List of orphan Work objects.
    """
    if not orphan_works:
        print("✅ Aucun Work orphelin détecté !")
        print()
        return

    separator = "=" * 80
    print(separator)
    print("WORKS ORPHELINS DÉTECTÉS")
    print(separator)
    print()

    total = len(orphan_works)
    print(f"📌 {total} Works sans chunks associés")
    print()

    for index, work_obj in enumerate(orphan_works, 1):
        props = work_obj.properties
        print(f"[{index}/{total}] {props.get('title', 'N/A')}")
        print("─" * 80)
        print(f" Auteur : {props.get('author', 'N/A')}")

        year = props.get("year")
        if year:
            # Negative years encode BCE ("av. J.-C.") dates.
            year_label = f"{abs(year)} av. J.-C." if year < 0 else f"{year}"
            print(f" Année : {year_label}")

        language = props.get("language")
        if language:
            print(f" Langue : {language}")

        genre = props.get("genre")
        if genre:
            print(f" Genre : {genre}")

        print(f" UUID : {work_obj.uuid}")
        print()

    print(separator)
    print()
|
||||
|
||||
|
||||
def delete_orphan_works(
    client: weaviate.WeaviateClient,
    orphan_works: List[Any],
    dry_run: bool = True,
) -> Dict[str, int]:
    """Delete (or simulate deleting) the given orphan Works.

    Args:
        client: Connected Weaviate client.
        orphan_works: List of orphan Work objects.
        dry_run: If True, only simulate (nothing is actually deleted).

    Returns:
        Dict with statistics: deleted, errors.
    """
    stats: Dict[str, int] = {"deleted": 0, "errors": 0}

    if not orphan_works:
        print("✅ Aucun Work à supprimer (pas d'orphelins)")
        return stats

    mode_banner = (
        "🔍 MODE DRY-RUN (simulation, aucune suppression réelle)"
        if dry_run
        else "⚠️ MODE EXÉCUTION (suppression réelle)"
    )
    print(mode_banner)
    print("=" * 80)
    print()

    work_collection = client.collections.get("Work")

    for orphan in orphan_works:
        properties = orphan.properties
        work_title = properties.get("title", "N/A")
        work_author = properties.get("author", "N/A")

        print(f"Traitement de '{work_title}' par {work_author}...")

        if dry_run:
            # Simulation: count what would be deleted, touch nothing.
            print(f" 🔍 [DRY-RUN] Supprimerait UUID {orphan.uuid}")
            stats["deleted"] += 1
        else:
            # Real deletion: keep going on per-object failures, tally errors.
            try:
                work_collection.data.delete_by_id(orphan.uuid)
                print(f" ❌ Supprimé UUID {orphan.uuid}")
                stats["deleted"] += 1
            except Exception as error:
                print(f" ⚠️ Erreur suppression UUID {orphan.uuid}: {error}")
                stats["errors"] += 1

        print()

    summary_line = "=" * 80
    print(summary_line)
    print("RÉSUMÉ")
    print(summary_line)
    print(f" Works supprimés : {stats['deleted']}")
    print(f" Erreurs : {stats['errors']}")
    print()

    return stats
|
||||
|
||||
|
||||
def verify_cleanup(client: weaviate.WeaviateClient) -> None:
    """Re-run the orphan detection and report post-cleanup consistency.

    Args:
        client: Connected Weaviate client.
    """
    rule = "=" * 80
    print(rule)
    print("VÉRIFICATION POST-NETTOYAGE")
    print(rule)
    print()

    # Re-run the same detection used for the cleanup itself.
    works_with_chunks = get_works_from_chunks(client)
    orphan_works = identify_orphan_works(client, works_with_chunks)

    if not orphan_works:
        print("✅ Aucun Work orphelin restant !")
        print()

    # Final tallies: total Works vs distinct works referenced by chunks.
    work_collection = client.collections.get("Work")
    aggregate_result = work_collection.aggregate.over_all(total_count=True)

    print(f"📊 Works totaux : {aggregate_result.total_count}")
    print(f"📊 Œuvres avec chunks : {len(works_with_chunks)}")
    print()

    if aggregate_result.total_count == len(works_with_chunks):
        print("✅ Cohérence parfaite : 1 Work = 1 œuvre avec chunks")
        print()
    else:
        print(f"⚠️ {len(orphan_works)} Works orphelins persistent")
        print()

    print(rule)
    print()
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: report orphan Works, then delete them on --execute."""
    arg_parser = argparse.ArgumentParser(
        description="Supprimer les Works orphelins (sans chunks associés)"
    )
    arg_parser.add_argument(
        "--execute",
        action="store_true",
        help="Exécuter la suppression (par défaut: dry-run)",
    )
    args = arg_parser.parse_args()

    # The Windows console may default to a legacy code page; switch stdout to
    # UTF-8 so the emoji/accented output below prints correctly.
    if sys.platform == "win32" and hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')

    banner = "=" * 80
    print(banner)
    print("NETTOYAGE DES WORKS ORPHELINS")
    print(banner)
    print()

    client = weaviate.connect_to_local(
        host="localhost",
        port=8080,
        grpc_port=50051,
    )

    try:
        if not client.is_ready():
            print("❌ Weaviate is not ready. Ensure docker-compose is running.")
            sys.exit(1)

        print("✓ Weaviate is ready")
        print()

        # Step 1: works that still own chunks.
        works_with_chunks = get_works_from_chunks(client)

        # Step 2: Works matching none of those works.
        orphan_works = identify_orphan_works(client, works_with_chunks)

        # Step 3: human-readable report.
        display_orphans_report(orphan_works)

        if not orphan_works:
            print("✅ Aucune action nécessaire (pas d'orphelins)")
            sys.exit(0)

        # Step 4: real deletions require an explicit confirmation first.
        if args.execute:
            print(f"⚠️ ATTENTION : {len(orphan_works)} Works vont être supprimés !")
            print()
            answer = input("Continuer ? (oui/non) : ").strip().lower()
            if answer not in ["oui", "yes", "o", "y"]:
                print("❌ Annulé par l'utilisateur.")
                sys.exit(0)
            print()

        stats = delete_orphan_works(client, orphan_works, dry_run=not args.execute)

        # Step 5: re-check consistency only after an actual deletion.
        if args.execute and stats["deleted"] > 0:
            verify_cleanup(client)
        else:
            print(banner)
            print("💡 NEXT STEP")
            print(banner)
            print()
            print("Pour exécuter le nettoyage, lancez :")
            print(" python clean_orphan_works.py --execute")
            print()

    finally:
        # Always release the client connection, even on sys.exit paths.
        client.close()
|
||||
|
||||
|
||||
# Script entry-point guard: run the CLI only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user