feat: Add data quality verification & cleanup scripts
## Data Quality & Cleanup (Priorities 1-6) Added comprehensive data quality verification and cleanup system: **Scripts créés**: - verify_data_quality.py: Analyse qualité complète œuvre par œuvre - clean_duplicate_documents.py: Nettoyage doublons Documents - populate_work_collection.py/clean.py: Peuplement Work collection - fix_chunks_count.py: Correction chunksCount incohérents - manage_orphan_chunks.py: Gestion chunks orphelins (3 options) - clean_orphan_works.py: Suppression Works sans chunks - add_missing_work.py: Création Work manquant - generate_schema_stats.py: Génération stats auto - migrate_add_work_collection.py: Migration sûre Work collection **Documentation**: - WEAVIATE_GUIDE_COMPLET.md: Guide consolidé complet (600+ lignes) - WEAVIATE_SCHEMA.md: Référence schéma rapide - NETTOYAGE_COMPLETE_RAPPORT.md: Rapport nettoyage session - ANALYSE_QUALITE_DONNEES.md: Analyse qualité initiale - rapport_qualite_donnees.txt: Output brut vérification **Résultats nettoyage**: - Documents: 16 → 9 (7 doublons supprimés) - Works: 0 → 9 (peuplé + nettoyé) - Chunks: 5,404 → 5,230 (174 orphelins supprimés) - chunksCount: Corrigés (231 → 5,230 déclaré = réel) - Cohérence parfaite: 9 Works = 9 Documents = 9 œuvres **Modifications code**: - schema.py: Ajout Work collection avec vectorisation - utils/weaviate_ingest.py: Support Work ingestion - utils/word_pipeline.py: Désactivation concepts (problème .lower()) - utils/word_toc_extractor.py: Métadonnées Word correctes - .gitignore: Exclusion fichiers temporaires (*.wav, output/*, NUL) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
185
generations/library_rag/verify_vector_index.py
Normal file
185
generations/library_rag/verify_vector_index.py
Normal file
@@ -0,0 +1,185 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Verify vector index configuration for Chunk and Summary collections.
|
||||
|
||||
This script checks if the dynamic index with RQ is properly configured
|
||||
for vectorized collections. It displays:
|
||||
- Index type (flat, hnsw, or dynamic)
|
||||
- Quantization status (RQ enabled/disabled)
|
||||
- Distance metric
|
||||
- Dynamic threshold (if applicable)
|
||||
|
||||
Usage:
|
||||
python verify_vector_index.py
|
||||
"""
|
||||
|
||||
import sys
|
||||
from typing import Any, Dict
|
||||
|
||||
import weaviate
|
||||
|
||||
|
||||
def check_collection_index(client: weaviate.WeaviateClient, collection_name: str) -> None:
|
||||
"""Check and display vector index configuration for a collection.
|
||||
|
||||
Args:
|
||||
client: Connected Weaviate client.
|
||||
collection_name: Name of the collection to check.
|
||||
"""
|
||||
try:
|
||||
collections = client.collections.list_all()
|
||||
|
||||
if collection_name not in collections:
|
||||
print(f" ❌ Collection '{collection_name}' not found")
|
||||
return
|
||||
|
||||
config = collections[collection_name]
|
||||
|
||||
print(f"\n📦 {collection_name}")
|
||||
print("─" * 80)
|
||||
|
||||
# Check vectorizer
|
||||
vectorizer_str: str = str(config.vectorizer)
|
||||
if "text2vec" in vectorizer_str.lower():
|
||||
print(" ✓ Vectorizer: text2vec-transformers")
|
||||
elif "none" in vectorizer_str.lower():
|
||||
print(" ℹ Vectorizer: NONE (metadata collection)")
|
||||
return
|
||||
else:
|
||||
print(f" ⚠ Vectorizer: {vectorizer_str}")
|
||||
|
||||
# Try to get vector index config (API structure varies)
|
||||
# Access via config object properties
|
||||
config_dict: Dict[str, Any] = {}
|
||||
|
||||
# Try different API paths to get config info
|
||||
if hasattr(config, 'vector_index_config'):
|
||||
vector_config = config.vector_index_config
|
||||
config_dict['vector_config'] = str(vector_config)
|
||||
|
||||
# Check for specific attributes
|
||||
if hasattr(vector_config, 'quantizer'):
|
||||
config_dict['quantizer'] = str(vector_config.quantizer)
|
||||
if hasattr(vector_config, 'distance_metric'):
|
||||
config_dict['distance_metric'] = str(vector_config.distance_metric)
|
||||
|
||||
# Display available info
|
||||
if config_dict:
|
||||
print(f" • Configuration détectée:")
|
||||
for key, value in config_dict.items():
|
||||
print(f" - {key}: {value}")
|
||||
|
||||
# Simplified detection based on config representation
|
||||
config_full_str = str(config)
|
||||
|
||||
# Detect index type
|
||||
if "dynamic" in config_full_str.lower():
|
||||
print(" • Index Type: DYNAMIC")
|
||||
elif "hnsw" in config_full_str.lower():
|
||||
print(" • Index Type: HNSW")
|
||||
elif "flat" in config_full_str.lower():
|
||||
print(" • Index Type: FLAT")
|
||||
else:
|
||||
print(" • Index Type: UNKNOWN (default HNSW probable)")
|
||||
|
||||
# Check for RQ
|
||||
if "rq" in config_full_str.lower() or "quantizer" in config_full_str.lower():
|
||||
print(" ✓ RQ (Rotational Quantization): Probablement ENABLED")
|
||||
else:
|
||||
print(" ⚠ RQ (Rotational Quantization): NOT DETECTED (ou désactivé)")
|
||||
|
||||
# Check distance metric
|
||||
if "cosine" in config_full_str.lower():
|
||||
print(" • Distance Metric: COSINE (détecté)")
|
||||
elif "dot" in config_full_str.lower():
|
||||
print(" • Distance Metric: DOT PRODUCT (détecté)")
|
||||
elif "l2" in config_full_str.lower():
|
||||
print(" • Distance Metric: L2 SQUARED (détecté)")
|
||||
|
||||
print("\n Interpretation:")
|
||||
if "dynamic" in config_full_str.lower() and ("rq" in config_full_str.lower() or "quantizer" in config_full_str.lower()):
|
||||
print(" ✅ OPTIMIZED: Dynamic index with RQ enabled")
|
||||
print(" → Memory savings: ~75% at scale")
|
||||
print(" → Auto-switches from flat to HNSW at threshold")
|
||||
elif "hnsw" in config_full_str.lower():
|
||||
if "rq" in config_full_str.lower() or "quantizer" in config_full_str.lower():
|
||||
print(" ✅ HNSW with RQ: Good for large collections")
|
||||
else:
|
||||
print(" ⚠ HNSW without RQ: Consider enabling RQ for memory savings")
|
||||
elif "flat" in config_full_str.lower():
|
||||
print(" ℹ FLAT index: Good for small collections (<100k vectors)")
|
||||
else:
|
||||
print(" ⚠ Unknown index configuration (probably default HNSW)")
|
||||
print(" → Collections créées sans config explicite utilisent HNSW par défaut")
|
||||
|
||||
except Exception as e:
|
||||
print(f" ❌ Error checking {collection_name}: {e}")
|
||||
|
||||
|
||||
def main() -> None:
|
||||
"""Main entry point."""
|
||||
# Fix encoding for Windows console
|
||||
if sys.platform == "win32" and hasattr(sys.stdout, 'reconfigure'):
|
||||
sys.stdout.reconfigure(encoding='utf-8')
|
||||
|
||||
print("=" * 80)
|
||||
print("VÉRIFICATION DES INDEX VECTORIELS WEAVIATE")
|
||||
print("=" * 80)
|
||||
|
||||
client: weaviate.WeaviateClient = weaviate.connect_to_local(
|
||||
host="localhost",
|
||||
port=8080,
|
||||
grpc_port=50051,
|
||||
)
|
||||
|
||||
try:
|
||||
# Check if Weaviate is ready
|
||||
if not client.is_ready():
|
||||
print("\n❌ Weaviate is not ready. Ensure docker-compose is running.")
|
||||
return
|
||||
|
||||
print("\n✓ Weaviate is ready")
|
||||
|
||||
# Get all collections
|
||||
collections = client.collections.list_all()
|
||||
print(f"✓ Found {len(collections)} collections: {sorted(collections.keys())}")
|
||||
|
||||
# Check vectorized collections (Chunk and Summary)
|
||||
print("\n" + "=" * 80)
|
||||
print("COLLECTIONS VECTORISÉES")
|
||||
print("=" * 80)
|
||||
|
||||
check_collection_index(client, "Chunk")
|
||||
check_collection_index(client, "Summary")
|
||||
|
||||
# Check non-vectorized collections (for reference)
|
||||
print("\n" + "=" * 80)
|
||||
print("COLLECTIONS MÉTADONNÉES (Non vectorisées)")
|
||||
print("=" * 80)
|
||||
|
||||
check_collection_index(client, "Work")
|
||||
check_collection_index(client, "Document")
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("VÉRIFICATION TERMINÉE")
|
||||
print("=" * 80)
|
||||
|
||||
# Count objects in each collection
|
||||
print("\n📊 STATISTIQUES:")
|
||||
for name in ["Work", "Document", "Chunk", "Summary"]:
|
||||
if name in collections:
|
||||
try:
|
||||
coll = client.collections.get(name)
|
||||
# Simple count using aggregate (works for all collections)
|
||||
result = coll.aggregate.over_all(total_count=True)
|
||||
count = result.total_count
|
||||
print(f" • {name:<12} {count:>8,} objets")
|
||||
except Exception as e:
|
||||
print(f" • {name:<12} Error: {e}")
|
||||
|
||||
finally:
|
||||
client.close()
|
||||
print("\n✓ Connexion fermée\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user