# Source note (condensed from the originating commit message):
# "Data Quality & Cleanup (Priorities 1-6)" — comprehensive data-quality
# verification and cleanup tooling. Scripts created: verify_data_quality.py,
# clean_duplicate_documents.py, populate_work_collection.py / _clean.py,
# fix_chunks_count.py, manage_orphan_chunks.py, clean_orphan_works.py,
# add_missing_work.py, generate_schema_stats.py, migrate_add_work_collection.py.
# Cleanup results: Documents 16 -> 9 (7 duplicates removed); Works 0 -> 9;
# Chunks 5,404 -> 5,230 (174 orphans removed); chunksCount reconciled
# (declared == actual); final consistency: 9 Works = 9 Documents = 9 works.
# Related code changes: schema.py (Work collection with vectorization),
# utils/weaviate_ingest.py (Work ingestion), utils/word_pipeline.py
# (concepts disabled — .lower() issue), utils/word_toc_extractor.py
# (correct Word metadata), .gitignore (temp files excluded).
#!/usr/bin/env python3
"""Populate the Work collection with duplicate cleanup and corrections.

This script:

1. Extracts the unique works from the nested ``work`` objects on Chunks.
2. Applies a manual correction mapping to resolve inconsistencies:
   - Title variations (e.g. Darwin - 3 different titles)
   - Author spelling variations (e.g. Peirce - 3 spellings)
   - Generic placeholder titles that need fixing
3. Consolidates works by (canonical_title, canonical_author).
4. Inserts the canonical Works into the Work collection.

Usage:
    # Dry-run (shows what would be inserted, without writing anything)
    python populate_work_collection_clean.py

    # Real execution (inserts the Works)
    python populate_work_collection_clean.py --execute
"""
|
|
|
|
import argparse
import sys
from collections import defaultdict
from typing import Any, Dict, List, Optional, Set, Tuple

import weaviate
|
# =============================================================================
# Manual correction mappings
# =============================================================================

# Title corrections: original_title -> canonical_title.
# Lookups go through TITLE_CORRECTIONS.get(title, title), so a title that is
# absent from this mapping passes through unchanged; identity entries
# (key == value) are therefore no-ops and were removed.
TITLE_CORRECTIONS = {
    # Peirce: generic placeholder title -> actual title
    "Titre corrigé si nécessaire (ex: 'The Fixation of Belief')": "The Fixation of Belief",

    # Darwin: long variant of the Historical Sketch -> canonical short title
    "An Historical Sketch of the Progress of Opinion on the Origin of Species, Previously to the Publication of the First Edition of This Work":
        "An Historical Sketch of the Progress of Opinion on the Origin of Species",

    # Darwin: On the Origin of Species (full title -> short title)
    "On the Origin of Species BY MEANS OF NATURAL SELECTION, OR THE PRESERVATION OF FAVOURED RACES IN THE STRUGGLE FOR LIFE.":
        "On the Origin of Species",
}
|
|
|
|
# Author corrections: original_author -> canonical_author
AUTHOR_CORRECTIONS = {
    # Peirce: three spelling variants collapsed into a single canonical form
    "Charles Sanders PEIRCE": "Charles Sanders Peirce",
    "C. S. Peirce": "Charles Sanders Peirce",
    # Darwin: ALL-CAPS surname -> capitalised form
    "Charles DARWIN": "Charles Darwin",
}
|
|
|
|
# Optional enrichment metadata, keyed by (canonical_title, canonical_author).
# Entries found here are merged into the Work object at insertion time.
WORK_METADATA = {
    ("On the Origin of Species", "Charles Darwin"): {
        "originalTitle": "On the Origin of Species by Means of Natural Selection",
        "year": 1859,
        "language": "en",
        "genre": "scientific treatise",
    },
    ("The Fixation of Belief", "Charles Sanders Peirce"): {
        "year": 1877,
        "language": "en",
        "genre": "philosophical article",
    },
    ("Collected papers", "Charles Sanders Peirce"): {
        "originalTitle": "Collected Papers of Charles Sanders Peirce",
        "year": 1931,  # publication date of volumes 1-6
        "language": "en",
        "genre": "collected works",
    },
    ("La pensée-signe. Études sur C. S. Peirce", "Claudine Tiercelin"): {
        "year": 1993,
        "language": "fr",
        "genre": "philosophical study",
    },
    ("Platon - Ménon", "Platon"): {
        "originalTitle": "Μένων",
        "year": -380,  # circa 380 BC (negative year encodes BC)
        # NOTE(review): "gr" is not the ISO 639-1 code for Greek ("el") —
        # confirm whether this is a deliberate project convention.
        "language": "gr",
        "genre": "dialogue",
    },
    ("Mind Design III: Philosophy, Psychology, and Artificial Intelligence (si confirmation)",
     "John Haugeland, Carl F. Craver, and Colin Klein"): {
        "year": 2023,
        "language": "en",
        "genre": "anthology",
    },
    ("Artificial Intelligence: The Very Idea (1985)", "John Haugeland"): {
        "originalTitle": "Artificial Intelligence: The Very Idea",
        "year": 1985,
        "language": "en",
        "genre": "philosophical monograph",
    },
    ("Between Past and Future", "Hannah Arendt"): {
        "year": 1961,
        "language": "en",
        "genre": "political philosophy",
    },
    ("On a New List of Categories", "Charles Sanders Peirce"): {
        "year": 1867,
        "language": "en",
        "genre": "philosophical article",
    },
    ("La logique de la science", "Charles Sanders Peirce"): {
        "year": 1878,
        "language": "fr",
        "genre": "philosophical article",
    },
    ("An Historical Sketch of the Progress of Opinion on the Origin of Species", "Charles Darwin"): {
        "year": 1861,
        "language": "en",
        "genre": "historical sketch",
    },
}
|
|
|
|
|
|
def apply_corrections(title: str, author: str) -> Tuple[str, str]:
    """Resolve a raw (title, author) pair to its canonical spelling.

    Values absent from the correction mappings pass through unchanged.

    Args:
        title: Original title from the nested object.
        author: Original author from the nested object.

    Returns:
        Tuple of (canonical_title, canonical_author).
    """
    return (
        TITLE_CORRECTIONS.get(title, title),
        AUTHOR_CORRECTIONS.get(author, author),
    )
|
|
|
|
|
|
def extract_unique_works_from_chunks(
    client: weaviate.WeaviateClient,
) -> Dict[Tuple[str, str], Dict[str, Any]]:
    """Extract the unique works from the Chunk nested objects (with corrections).

    Iterates over ALL chunks via the collection's cursor-based iterator.
    The previous implementation used ``fetch_objects(limit=10000)``, which
    silently truncated the scan once the collection grew past 10,000 chunks.

    Args:
        client: Connected Weaviate client.

    Returns:
        Dict mapping (canonical_title, canonical_author) to work metadata
        (chunk_count, languages, and the raw titles/authors that were merged).
    """
    print("📊 Récupération de tous les chunks...")

    chunk_collection = client.collections.get("Chunk")

    works_data: Dict[Tuple[str, str], Dict[str, Any]] = {}
    corrections_applied: Dict[Tuple[str, str], Tuple[str, str]] = {}  # original -> canonical
    chunk_count = 0

    # Cursor-based iteration: no hard limit, constant memory.
    for chunk_obj in chunk_collection.iterator():
        chunk_count += 1
        props = chunk_obj.properties

        work = props.get("work")
        if not isinstance(work, dict):
            continue

        original_title = work.get("title")
        original_author = work.get("author")
        if not (original_title and original_author):
            continue

        # Apply the manual corrections to get the canonical identity.
        canonical_title, canonical_author = apply_corrections(original_title, original_author)
        canonical_key = (canonical_title, canonical_author)
        original_key = (original_title, original_author)

        # Track which raw pairs were remapped (for reporting).
        if original_key != canonical_key:
            corrections_applied[original_key] = canonical_key

        # Initialise the aggregate on first occurrence.
        if canonical_key not in works_data:
            works_data[canonical_key] = {
                "title": canonical_title,
                "author": canonical_author,
                "chunk_count": 0,
                "languages": set(),
                "original_titles": set(),
                "original_authors": set(),
            }

        entry = works_data[canonical_key]
        entry["chunk_count"] += 1

        # Collect chunk languages.
        if props.get("language"):
            entry["languages"].add(props["language"])

        # Keep the raw titles/authors for the consolidation report.
        entry["original_titles"].add(original_title)
        entry["original_authors"].add(original_author)

    print(f" ✓ {chunk_count} chunks récupérés")
    print()
    print(f"📚 {len(works_data)} œuvres uniques (après corrections)")
    print(f"🔧 {len(corrections_applied)} corrections appliquées")
    print()

    return works_data
|
|
|
|
|
|
def display_corrections_report(works_data: Dict[Tuple[str, str], Dict[str, Any]]) -> None:
    """Print a report of the consolidations that were applied.

    A work whose entry carries more than one original title or author was
    merged from several raw (title, author) pairs.

    Args:
        works_data: Dict mapping (canonical_title, canonical_author) to work metadata.
    """
    print("=" * 80)
    print("CORRECTIONS APPLIQUÉES")
    print("=" * 80)
    print()

    any_consolidation = False

    for (title, author), info in sorted(works_data.items()):
        raw_titles = info.get("original_titles", set())
        raw_authors = info.get("original_authors", set())

        # More than one raw title or author means a consolidation happened.
        if len(raw_titles) <= 1 and len(raw_authors) <= 1:
            continue

        any_consolidation = True
        print(f"✅ {title}")
        print("─" * 80)

        if len(raw_titles) > 1:
            print(f" Titres consolidés ({len(raw_titles)}) :")
            for raw_title in sorted(raw_titles):
                if raw_title != title:
                    print(f" • {raw_title}")

        if len(raw_authors) > 1:
            print(f" Auteurs consolidés ({len(raw_authors)}) :")
            for raw_author in sorted(raw_authors):
                if raw_author != author:
                    print(f" • {raw_author}")

        print(f" Chunks total : {info['chunk_count']:,}")
        print()

    if not any_consolidation:
        print("Aucune consolidation nécessaire.")
        print()

    print("=" * 80)
    print()
|
|
|
|
|
def display_works_report(works_data: Dict[Tuple[str, str], Dict[str, Any]]) -> None:
    """Print a report of the works that would be inserted.

    Args:
        works_data: Dict mapping (title, author) to work metadata.
    """
    print("=" * 80)
    print("ŒUVRES À INSÉRER DANS WORK COLLECTION")
    print("=" * 80)
    print()

    total_chunks = sum(info["chunk_count"] for info in works_data.values())
    work_count = len(works_data)

    print(f"📌 {work_count} œuvres uniques")
    print(f"📌 {total_chunks:,} chunks au total")
    print()

    for index, ((title, author), info) in enumerate(sorted(works_data.items()), 1):
        print(f"[{index}/{work_count}] {title}")
        print("─" * 80)
        print(f" Auteur : {author}")
        print(f" Chunks : {info['chunk_count']:,}")

        if info.get("languages"):
            print(f" Langues : {', '.join(sorted(info['languages']))}")

        # Manually curated metadata, when available for this work.
        enriched = WORK_METADATA.get((title, author))
        if enriched:
            year = enriched.get("year")
            if year:
                # Negative years encode BC dates.
                label = f"{abs(year)} av. J.-C." if year < 0 else f"{year}"
                print(f" Année : {label}")
            if enriched.get("genre"):
                print(f" Genre : {enriched['genre']}")

        print()

    print("=" * 80)
    print()
|
|
|
|
|
|
def insert_works(
    client: weaviate.WeaviateClient,
    works_data: Dict[Tuple[str, str], Dict[str, Any]],
    dry_run: bool = True,
) -> Dict[str, int]:
    """Insert the canonical works into the Work collection.

    Args:
        client: Connected Weaviate client.
        works_data: Dict mapping (title, author) to work metadata.
        dry_run: If True, only simulate (don't actually insert).

    Returns:
        Dict with statistics: inserted, errors.
    """
    stats = {"inserted": 0, "errors": 0}

    banner = (
        "🔍 MODE DRY-RUN (simulation, aucune insertion réelle)"
        if dry_run
        else "⚠️ MODE EXÉCUTION (insertion réelle)"
    )
    print(banner)
    print("=" * 80)
    print()

    work_collection = client.collections.get("Work")

    for (title, author), info in sorted(works_data.items()):
        print(f"Traitement de '{title}' par {author}...")

        # Start from a fully-explicit property dict; enrichment below may
        # overwrite any of the None placeholders.
        work_obj: Dict[str, Any] = {
            "title": title,
            "author": author,
            "originalTitle": None,
            "year": None,
            "language": None,
            "genre": None,
        }

        # A single detected chunk language is trusted as the work's language.
        languages = info.get("languages")
        if languages and len(languages) == 1:
            work_obj["language"] = next(iter(languages))

        # Manually curated metadata takes precedence when available.
        enriched = WORK_METADATA.get((title, author))
        if enriched:
            work_obj.update(enriched)

        if dry_run:
            print(f" 🔍 [DRY-RUN] Insérerait : {work_obj}")
            stats["inserted"] += 1
        else:
            try:
                uuid = work_collection.data.insert(work_obj)
            except Exception as e:
                print(f" ⚠️ Erreur insertion : {e}")
                stats["errors"] += 1
            else:
                print(f" ✅ Inséré UUID {uuid}")
                stats["inserted"] += 1

        print()

    print("=" * 80)
    print("RÉSUMÉ")
    print("=" * 80)
    print(f" Works insérés : {stats['inserted']}")
    print(f" Erreurs : {stats['errors']}")
    print()

    return stats
|
|
|
|
|
|
def verify_insertion(client: weaviate.WeaviateClient) -> None:
    """Verify the outcome of the insertion by re-reading the Work collection.

    Args:
        client: Connected Weaviate client.
    """
    print("=" * 80)
    print("VÉRIFICATION POST-INSERTION")
    print("=" * 80)
    print()

    work_coll = client.collections.get("Work")
    result = work_coll.aggregate.over_all(total_count=True)

    print(f"📊 Works dans la collection : {result.total_count}")

    if result.total_count > 0:
        # 100 is comfortably above the expected handful of works.
        response = work_coll.query.fetch_objects(limit=100)

        print()
        print("📚 Works créés :")
        for index, obj in enumerate(response.objects, 1):
            props = obj.properties
            print(f" {index:2d}. {props['title']}")
            print(f" Auteur : {props['author']}")

            year = props.get("year")
            if year:
                # Negative years encode BC dates.
                if year < 0:
                    print(f" Année : {abs(year)} av. J.-C.")
                else:
                    print(f" Année : {year}")

            if props.get("language"):
                print(f" Langue : {props['language']}")

            if props.get("genre"):
                print(f" Genre : {props['genre']}")

            print()

    print("=" * 80)
    print()
|
|
def main() -> None:
    """Main entry point: connect, extract, report, then insert (or dry-run)."""
    parser = argparse.ArgumentParser(
        description="Peupler la collection Work avec corrections des doublons"
    )
    parser.add_argument(
        "--execute",
        action="store_true",
        help="Exécuter l'insertion (par défaut: dry-run)",
    )
    args = parser.parse_args()

    # The Windows console defaults to a legacy codepage; switch it to UTF-8
    # so the accents and emoji in the reports print correctly.
    if sys.platform == "win32" and hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')

    print("=" * 80)
    print("PEUPLEMENT DE LA COLLECTION WORK (AVEC CORRECTIONS)")
    print("=" * 80)
    print()

    client = weaviate.connect_to_local(
        host="localhost",
        port=8080,
        grpc_port=50051,
    )

    try:
        if not client.is_ready():
            print("❌ Weaviate is not ready. Ensure docker-compose is running.")
            sys.exit(1)

        print("✓ Weaviate is ready")
        print()

        # The Work collection must already exist (created by the migration).
        if "Work" not in client.collections.list_all():
            print("❌ ERREUR : La collection Work n'existe pas !")
            print()
            print(" Créez-la d'abord avec :")
            print(" python migrate_add_work_collection.py")
            print()
            sys.exit(1)

        # Step 1: extract the corrected, consolidated works.
        works_data = extract_unique_works_from_chunks(client)
        if not works_data:
            print("❌ Aucune œuvre détectée dans les chunks !")
            sys.exit(1)

        # Steps 2-3: report the consolidations and the works to insert.
        display_corrections_report(works_data)
        display_works_report(works_data)

        # Step 4: a real run requires an explicit interactive confirmation.
        if args.execute:
            print("⚠️ ATTENTION : Les œuvres vont être INSÉRÉES dans la collection Work !")
            print()
            answer = input("Continuer ? (oui/non) : ").strip().lower()
            if answer not in ["oui", "yes", "o", "y"]:
                print("❌ Annulé par l'utilisateur.")
                sys.exit(0)
            print()

        insert_works(client, works_data, dry_run=not args.execute)

        # Step 5: verify only after a real run; otherwise point at --execute.
        if args.execute:
            verify_insertion(client)
        else:
            print("=" * 80)
            print("💡 NEXT STEP")
            print("=" * 80)
            print()
            print("Pour exécuter l'insertion, lancez :")
            print(" python populate_work_collection_clean.py --execute")
            print()

    finally:
        client.close()


if __name__ == "__main__":
    main()
|