feat: Add data quality verification & cleanup scripts
## Data Quality & Cleanup (Priorities 1-6) Added comprehensive data quality verification and cleanup system: **Scripts créés**: - verify_data_quality.py: Analyse qualité complète œuvre par œuvre - clean_duplicate_documents.py: Nettoyage doublons Documents - populate_work_collection.py/clean.py: Peuplement Work collection - fix_chunks_count.py: Correction chunksCount incohérents - manage_orphan_chunks.py: Gestion chunks orphelins (3 options) - clean_orphan_works.py: Suppression Works sans chunks - add_missing_work.py: Création Work manquant - generate_schema_stats.py: Génération stats auto - migrate_add_work_collection.py: Migration sûre Work collection **Documentation**: - WEAVIATE_GUIDE_COMPLET.md: Guide consolidé complet (600+ lignes) - WEAVIATE_SCHEMA.md: Référence schéma rapide - NETTOYAGE_COMPLETE_RAPPORT.md: Rapport nettoyage session - ANALYSE_QUALITE_DONNEES.md: Analyse qualité initiale - rapport_qualite_donnees.txt: Output brut vérification **Résultats nettoyage**: - Documents: 16 → 9 (7 doublons supprimés) - Works: 0 → 9 (peuplé + nettoyé) - Chunks: 5,404 → 5,230 (174 orphelins supprimés) - chunksCount: Corrigés (231 → 5,230 déclaré = réel) - Cohérence parfaite: 9 Works = 9 Documents = 9 œuvres **Modifications code**: - schema.py: Ajout Work collection avec vectorisation - utils/weaviate_ingest.py: Support Work ingestion - utils/word_pipeline.py: Désactivation concepts (problème .lower()) - utils/word_toc_extractor.py: Métadonnées Word correctes - .gitignore: Exclusion fichiers temporaires (*.wav, output/*, NUL) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
164
generations/library_rag/generate_schema_stats.py
Normal file
164
generations/library_rag/generate_schema_stats.py
Normal file
@@ -0,0 +1,164 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate statistics for WEAVIATE_SCHEMA.md documentation.
|
||||
|
||||
This script queries Weaviate and generates updated statistics to keep
|
||||
the schema documentation in sync with reality.
|
||||
|
||||
Usage:
|
||||
python generate_schema_stats.py
|
||||
|
||||
Output:
|
||||
Prints formatted markdown table with current statistics that can be
|
||||
copy-pasted into WEAVIATE_SCHEMA.md
|
||||
"""
|
||||
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from typing import Dict
|
||||
|
||||
import weaviate
|
||||
|
||||
|
||||
def get_collection_stats(client: weaviate.WeaviateClient) -> Dict[str, int]:
    """Count the objects stored in each known Weaviate collection.

    Args:
        client: Connected Weaviate client.

    Returns:
        Dict mapping collection name to object count (0 when the collection
        is absent or its aggregation query fails).
    """
    existing = client.collections.list_all()
    counts: Dict[str, int] = {}

    for name in ("Work", "Document", "Chunk", "Summary"):
        counts[name] = 0  # default when the collection is missing or unreadable
        if name not in existing:
            continue
        try:
            aggregate = client.collections.get(name).aggregate
            counts[name] = aggregate.over_all(total_count=True).total_count
        except Exception as exc:  # best-effort: report and keep the 0 default
            print(f"Warning: Could not get count for {name}: {exc}", file=sys.stderr)

    return counts
|
||||
|
||||
|
||||
def print_markdown_stats(stats: Dict[str, int]) -> None:
    """Emit the WEAVIATE_SCHEMA.md statistics section as markdown on stdout.

    Args:
        stats: Dict mapping collection name to object count.
    """
    chunk_count = stats["Chunk"]
    summary_count = stats["Summary"]
    doc_count = stats["Document"]
    total_vectors = chunk_count + summary_count
    if chunk_count > 0:
        ratio = summary_count / chunk_count
    else:
        ratio = 0

    now = datetime.now()
    print(f"## Contenu actuel (au {now.strftime('%d/%m/%Y')})")
    print()
    print(f"**Dernière vérification** : {now.strftime('%d %B %Y')} via `generate_schema_stats.py`")
    print()
    print("### Statistiques par collection")
    print()
    print("| Collection | Objets | Vectorisé | Utilisation |")
    print("|------------|--------|-----------|-------------|")
    print(f"| **Chunk** | **{chunk_count:,}** | ✅ Oui | Recherche sémantique principale |")
    print(f"| **Summary** | **{summary_count:,}** | ✅ Oui | Recherche hiérarchique (chapitres/sections) |")
    print(f"| **Document** | **{doc_count:,}** | ❌ Non | Métadonnées d'éditions |")
    print(f"| **Work** | **{stats['Work']:,}** | ✅ Oui* | Métadonnées d'œuvres (vide, prêt pour migration) |")
    print()
    print(f"**Total vecteurs** : {total_vectors:,} ({chunk_count:,} chunks + {summary_count:,} summaries)")
    # Ratio sentence is printed in two pieces: number first (no newline), then
    # the qualitative comment chosen by the comparison below.
    print(f"**Ratio Summary/Chunk** : {ratio:.2f} ", end="")
    if ratio > 1:
        print("(plus de summaries que de chunks, bon pour recherche hiérarchique)")
    else:
        print("(plus de chunks que de summaries)")
    print()
    print("\\* *Work est configuré avec vectorisation (depuis migration 2026-01) mais n'a pas encore d'objets*")
    print()

    # Derived insight lines.
    print("### Insights")
    print()
    if chunk_count > 0:
        print(f"- **Granularité** : {summary_count / chunk_count:.1f} summaries par chunk en moyenne")
    if doc_count > 0:
        print(f"- **Taille moyenne document** : {chunk_count / doc_count:.0f} chunks, {summary_count / doc_count:.0f} summaries")

    # Dynamic-index thresholds: warn once crossed, hint when approaching.
    if chunk_count >= 50000:
        print("- **⚠️ Index Switch** : Collection Chunk a dépassé 50k → HNSW activé (Dynamic index)")
    elif chunk_count >= 40000:
        print(f"- **📊 Proche seuil** : {50000 - chunk_count:,} chunks avant switch FLAT→HNSW (50k)")
    if summary_count >= 10000:
        print("- **⚠️ Index Switch** : Collection Summary a dépassé 10k → HNSW activé (Dynamic index)")
    elif summary_count >= 8000:
        print(f"- **📊 Proche seuil** : {10000 - summary_count:,} summaries avant switch FLAT→HNSW (10k)")

    # RAM estimate — BGE-M3 vectors: 1024 dims × 4 bytes (float32) = 4 KB,
    # plus ~1 KB metadata per object -> 5 KB per vector.
    estimated_ram_gb = (total_vectors * 5) / (1024 * 1024)
    print()
    print(f"- **RAM estimée** : ~{estimated_ram_gb:.1f} GB sans RQ, ~{estimated_ram_gb * 0.25:.1f} GB avec RQ (économie 75%)")
    print()
|
||||
|
||||
|
||||
def main() -> None:
    """Entry point: connect to local Weaviate and print the stats markdown.

    Progress/banner messages go to stderr; the markdown itself goes to stdout
    so it can be redirected into a file. Exits with status 1 when Weaviate is
    not reachable.
    """
    # Windows consoles may default to a legacy code page; force UTF-8 so the
    # emoji/accented output does not raise UnicodeEncodeError.
    if sys.platform == "win32" and hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')

    banner = "=" * 80
    for line in (banner, "GÉNÉRATION DES STATISTIQUES WEAVIATE", banner, ""):
        print(line, file=sys.stderr)

    client: weaviate.WeaviateClient = weaviate.connect_to_local(
        host="localhost",
        port=8080,
        grpc_port=50051,
    )

    try:
        if not client.is_ready():
            print("❌ Weaviate is not ready. Ensure docker-compose is running.", file=sys.stderr)
            sys.exit(1)

        print("✓ Weaviate is ready", file=sys.stderr)
        print("✓ Querying collections...", file=sys.stderr)

        stats = get_collection_stats(client)

        print("✓ Statistics retrieved", file=sys.stderr)
        for line in ("", banner, "MARKDOWN OUTPUT (copy to WEAVIATE_SCHEMA.md):", banner, ""):
            print(line, file=sys.stderr)

        # Markdown goes to stdout (can be redirected to a file).
        print_markdown_stats(stats)

    finally:
        # Always release the connection, even on early sys.exit(1).
        client.close()
|
||||
|
||||
|
||||
# Allow importing this module without side effects; run only as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user