feat: Add data quality verification & cleanup scripts
## Data Quality & Cleanup (Priorities 1-6) Added comprehensive data quality verification and cleanup system: **Scripts créés**: - verify_data_quality.py: Analyse qualité complète œuvre par œuvre - clean_duplicate_documents.py: Nettoyage doublons Documents - populate_work_collection.py/clean.py: Peuplement Work collection - fix_chunks_count.py: Correction chunksCount incohérents - manage_orphan_chunks.py: Gestion chunks orphelins (3 options) - clean_orphan_works.py: Suppression Works sans chunks - add_missing_work.py: Création Work manquant - generate_schema_stats.py: Génération stats auto - migrate_add_work_collection.py: Migration sûre Work collection **Documentation**: - WEAVIATE_GUIDE_COMPLET.md: Guide consolidé complet (600+ lignes) - WEAVIATE_SCHEMA.md: Référence schéma rapide - NETTOYAGE_COMPLETE_RAPPORT.md: Rapport nettoyage session - ANALYSE_QUALITE_DONNEES.md: Analyse qualité initiale - rapport_qualite_donnees.txt: Output brut vérification **Résultats nettoyage**: - Documents: 16 → 9 (7 doublons supprimés) - Works: 0 → 9 (peuplé + nettoyé) - Chunks: 5,404 → 5,230 (174 orphelins supprimés) - chunksCount: Corrigés (231 → 5,230 déclaré = réel) - Cohérence parfaite: 9 Works = 9 Documents = 9 œuvres **Modifications code**: - schema.py: Ajout Work collection avec vectorisation - utils/weaviate_ingest.py: Support Work ingestion - utils/word_pipeline.py: Désactivation concepts (problème .lower()) - utils/word_toc_extractor.py: Métadonnées Word correctes - .gitignore: Exclusion fichiers temporaires (*.wav, output/*, NUL) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
352
generations/library_rag/fix_chunks_count.py
Normal file
352
generations/library_rag/fix_chunks_count.py
Normal file
@@ -0,0 +1,352 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Recalculer et corriger le champ chunksCount des Documents.
|
||||
|
||||
Ce script :
|
||||
1. Récupère tous les chunks et documents
|
||||
2. Compte le nombre réel de chunks pour chaque document (via document.sourceId)
|
||||
3. Compare avec le chunksCount déclaré dans Document
|
||||
4. Met à jour les Documents avec les valeurs correctes
|
||||
|
||||
Usage:
|
||||
# Dry-run (affiche ce qui serait corrigé, sans rien faire)
|
||||
python fix_chunks_count.py
|
||||
|
||||
# Exécution réelle (met à jour les chunksCount)
|
||||
python fix_chunks_count.py --execute
|
||||
"""
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
from typing import Any, Dict, List
|
||||
from collections import defaultdict
|
||||
|
||||
import weaviate
|
||||
|
||||
|
||||
def count_chunks_per_document(
    all_chunks: List[Any],
) -> Dict[str, int]:
    """Tally how many chunks reference each document ``sourceId``.

    A chunk contributes to the tally only when its ``document`` property is
    a dict carrying a truthy ``sourceId``; every other chunk is ignored.

    Args:
        all_chunks: Chunk objects, each exposing a ``properties`` mapping.

    Returns:
        Dict mapping sourceId to chunk count.
    """
    tally: Dict[str, int] = defaultdict(int)

    for chunk in all_chunks:
        parent = chunk.properties.get("document")
        if not isinstance(parent, dict):
            # No usable nested document reference on this chunk.
            continue

        key = parent.get("sourceId")
        if not key:
            # Missing or empty sourceId: nothing to attribute the chunk to.
            continue

        tally[key] += 1

    return tally
|
||||
|
||||
|
||||
def analyze_chunks_count_discrepancies(
    client: weaviate.WeaviateClient,
) -> List[Dict[str, Any]]:
    """Compare each Document's declared chunksCount with its real chunk count.

    Reads every object of the "Chunk" collection, counts chunks per
    ``document.sourceId`` and then builds one report entry per object of the
    "Document" collection.

    Args:
        client: Connected Weaviate client.

    Returns:
        One dict per Document with the keys ``uuid``, ``sourceId``,
        ``title``, ``author``, ``declared_count``, ``real_count``,
        ``difference`` (real minus declared) and ``needs_update`` (True when
        declared and real counts differ).
    """
    print("📊 Récupération de tous les chunks...")

    chunk_collection = client.collections.get("Chunk")
    # Walk the whole collection with the iterator instead of a single
    # fetch_objects(limit=10000) call: a fixed limit silently truncates
    # larger collections, and the resulting undercount would then be
    # written back to the Documents as the "correct" chunksCount.
    all_chunks = list(chunk_collection.iterator())
    print(f" ✓ {len(all_chunks)} chunks récupérés")
    print()

    print("📊 Comptage par document...")
    real_counts = count_chunks_per_document(all_chunks)
    print(f" ✓ {len(real_counts)} documents avec chunks")
    print()

    print("📊 Récupération de tous les documents...")
    doc_collection = client.collections.get("Document")
    # Same reasoning as for chunks: do not cap the number of documents.
    all_docs = list(doc_collection.iterator())

    print(f" ✓ {len(all_docs)} documents récupérés")
    print()

    # Build one report entry per document, whether or not it needs a fix.
    discrepancies: List[Dict[str, Any]] = []

    for doc_obj in all_docs:
        props = doc_obj.properties
        source_id = props.get("sourceId", "unknown")

        # A chunksCount that is present but None would make the subtraction
        # below raise TypeError; treat it like a missing value (0).
        declared_count = props.get("chunksCount")
        if declared_count is None:
            declared_count = 0

        real_count = real_counts.get(source_id, 0)

        discrepancies.append(
            {
                "uuid": doc_obj.uuid,
                "sourceId": source_id,
                "title": props.get("title", "N/A"),
                "author": props.get("author", "N/A"),
                "declared_count": declared_count,
                "real_count": real_count,
                "difference": real_count - declared_count,
                "needs_update": declared_count != real_count,
            }
        )

    return discrepancies
|
||||
|
||||
|
||||
def display_discrepancies_report(discrepancies: List[Dict[str, Any]]) -> None:
    """Print a human-readable report of chunksCount discrepancies.

    Prints global totals first. When at least one document needs an update,
    every document is then listed with a status marker, and the ones needing
    an update also get a detail section.

    Args:
        discrepancies: List of document discrepancy dicts, each providing
            the keys ``uuid``, ``sourceId``, ``title``, ``author``,
            ``declared_count``, ``real_count``, ``difference`` and
            ``needs_update``.
    """
    heavy_rule = "=" * 80
    light_rule = "─" * 80

    print(heavy_rule)
    print("RAPPORT DES INCOHÉRENCES chunksCount")
    print(heavy_rule)
    print()

    # Aggregate the totals in a single pass over the entries.
    declared_sum = 0
    real_sum = 0
    stale_total = 0
    for entry in discrepancies:
        declared_sum += entry["declared_count"]
        real_sum += entry["real_count"]
        if entry["needs_update"]:
            stale_total += 1

    document_total = len(discrepancies)

    print(f"📌 {document_total} documents au total")
    print(f"📌 {stale_total} documents à corriger")
    print()
    print(f"📊 Total déclaré (somme chunksCount) : {declared_sum:,}")
    print(f"📊 Total réel (comptage chunks) : {real_sum:,}")
    print(f"📊 Différence globale : {real_sum - declared_sum:+,}")
    print()

    if stale_total == 0:
        print("✅ Tous les chunksCount sont corrects !")
        print()
        return

    print(light_rule)
    print()

    for position, entry in enumerate(discrepancies, 1):
        stale = entry["needs_update"]
        # The same warning marker is used whether the real count is above
        # or below the declared one.
        marker = "⚠️ " if stale else "✅"

        print(f"{marker} [{position}/{document_total}] {entry['sourceId']}")

        if not stale:
            continue

        print(light_rule)
        print(f" Titre : {entry['title']}")
        print(f" Auteur : {entry['author']}")
        print(f" chunksCount déclaré : {entry['declared_count']:,}")
        print(f" Chunks réels : {entry['real_count']:,}")
        print(f" Différence : {entry['difference']:+,}")
        print(f" UUID : {entry['uuid']}")
        print()

    print(heavy_rule)
    print()
|
||||
|
||||
|
||||
def fix_chunks_count(
    client: weaviate.WeaviateClient,
    discrepancies: List[Dict[str, Any]],
    dry_run: bool = True,
) -> Dict[str, int]:
    """Write the real chunk count into each Document that declares a wrong one.

    Args:
        client: Connected Weaviate client.
        discrepancies: List of document discrepancy dicts (keys used here:
            ``uuid``, ``sourceId``, ``declared_count``, ``real_count``,
            ``needs_update``).
        dry_run: If True, only simulate (nothing is written).

    Returns:
        Dict with the counters ``updated``, ``unchanged`` and ``errors``.
        In dry-run mode ``updated`` counts the documents that would be
        updated.
    """
    stats = dict.fromkeys(("updated", "unchanged", "errors"), 0)

    # Nothing to correct: report it and leave without touching the client.
    if not any(entry["needs_update"] for entry in discrepancies):
        print("✅ Aucune correction nécessaire !")
        stats["unchanged"] = len(discrepancies)
        return stats

    rule = "=" * 80
    banner = (
        "🔍 MODE DRY-RUN (simulation, aucune mise à jour réelle)"
        if dry_run
        else "⚠️ MODE EXÉCUTION (mise à jour réelle)"
    )
    print(banner)
    print(rule)
    print()

    documents = client.collections.get("Document")

    for entry in discrepancies:
        if not entry["needs_update"]:
            stats["unchanged"] += 1
            continue

        target_uuid = entry["uuid"]
        corrected = entry["real_count"]

        print(f"Traitement de {entry['sourceId']}...")
        print(f" {entry['declared_count']:,} → {corrected:,} chunks")

        if dry_run:
            print(f" 🔍 [DRY-RUN] Mettrait à jour UUID {target_uuid}")
            stats["updated"] += 1
        else:
            # One failing document must not abort the rest of the batch, so
            # the error is reported and counted instead of propagated.
            try:
                documents.data.update(
                    uuid=target_uuid,
                    properties={"chunksCount": corrected},
                )
                print(f" ✅ Mis à jour UUID {target_uuid}")
                stats["updated"] += 1
            except Exception as exc:
                print(f" ⚠️ Erreur mise à jour UUID {target_uuid}: {exc}")
                stats["errors"] += 1

        print()

    print(rule)
    print("RÉSUMÉ")
    print(rule)
    print(f" Documents mis à jour : {stats['updated']}")
    print(f" Documents inchangés : {stats['unchanged']}")
    print(f" Erreurs : {stats['errors']}")
    print()

    return stats
|
||||
|
||||
|
||||
def verify_fix(client: weaviate.WeaviateClient) -> None:
    """Re-run the analysis after a correction and print the outcome.

    Prints the declared/real totals when no discrepancy is left; otherwise
    prints how many remain, followed by the full discrepancy report.

    Args:
        client: Connected Weaviate client.
    """
    rule = "=" * 80

    print(rule)
    print("VÉRIFICATION POST-CORRECTION")
    print(rule)
    print()

    report = analyze_chunks_count_discrepancies(client)
    remaining = sum(1 for row in report if row["needs_update"])

    if remaining:
        print(f"⚠️ {remaining} incohérences persistent :")
        display_discrepancies_report(report)
    else:
        print("✅ Tous les chunksCount sont désormais corrects !")
        print()

        declared_sum = sum(row["declared_count"] for row in report)
        real_sum = sum(row["real_count"] for row in report)

        print(f"📊 Total déclaré : {declared_sum:,}")
        print(f"📊 Total réel : {real_sum:,}")
        print(f"📊 Différence : {real_sum - declared_sum:+,}")
        print()

    print(rule)
    print()
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point.

    Parses ``--execute``, connects to the local Weaviate instance
    (localhost, HTTP port 8080, gRPC port 50051), analyses and reports the
    chunksCount discrepancies, then either simulates the correction
    (default) or applies it after an interactive confirmation. The client
    is always closed on the way out.
    """
    cli = argparse.ArgumentParser(
        description="Recalculer et corriger les chunksCount des Documents"
    )
    cli.add_argument(
        "--execute",
        action="store_true",
        help="Exécuter la correction (par défaut: dry-run)",
    )
    execute = cli.parse_args().execute

    # The Windows console needs UTF-8 for the emoji and accents printed below.
    if sys.platform == "win32" and hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")

    rule = "=" * 80

    print(rule)
    print("CORRECTION DES chunksCount")
    print(rule)
    print()

    client = weaviate.connect_to_local(
        host="localhost",
        port=8080,
        grpc_port=50051,
    )

    try:
        if not client.is_ready():
            print("❌ Weaviate is not ready. Ensure docker-compose is running.")
            sys.exit(1)

        print("✓ Weaviate is ready")
        print()

        # Step 1: analyse the discrepancies.
        report = analyze_chunks_count_discrepancies(client)

        # Step 2: display the report.
        display_discrepancies_report(report)

        # Step 3: correct (or simulate); real writes require confirmation.
        if execute:
            pending = [row for row in report if row["needs_update"]]
            if pending:
                print(f"⚠️ ATTENTION : {len(pending)} documents vont être mis à jour !")
                print()
                answer = input("Continuer ? (oui/non) : ").strip().lower()
                if answer not in {"oui", "yes", "o", "y"}:
                    print("❌ Annulé par l'utilisateur.")
                    sys.exit(0)
                print()

        stats = fix_chunks_count(client, report, dry_run=not execute)

        # Step 4: verify after a real run, or hint at the next step after a
        # dry-run.
        if not execute:
            print(rule)
            print("💡 NEXT STEP")
            print(rule)
            print()
            print("Pour exécuter la correction, lancez :")
            print(" python fix_chunks_count.py --execute")
            print()
        elif stats["updated"] > 0:
            verify_fix(client)

    finally:
        client.close()
|
||||
|
||||
|
||||
# Entry-point guard: run the CLI only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user