## Data Quality & Cleanup (Priorities 1-6)

Added comprehensive data quality verification and cleanup system:

**Scripts créés**:
- verify_data_quality.py: Analyse qualité complète œuvre par œuvre
- clean_duplicate_documents.py: Nettoyage doublons Documents
- populate_work_collection.py/clean.py: Peuplement Work collection
- fix_chunks_count.py: Correction chunksCount incohérents
- manage_orphan_chunks.py: Gestion chunks orphelins (3 options)
- clean_orphan_works.py: Suppression Works sans chunks
- add_missing_work.py: Création Work manquant
- generate_schema_stats.py: Génération stats auto
- migrate_add_work_collection.py: Migration sûre Work collection

**Documentation**:
- WEAVIATE_GUIDE_COMPLET.md: Guide consolidé complet (600+ lignes)
- WEAVIATE_SCHEMA.md: Référence schéma rapide
- NETTOYAGE_COMPLETE_RAPPORT.md: Rapport nettoyage session
- ANALYSE_QUALITE_DONNEES.md: Analyse qualité initiale
- rapport_qualite_donnees.txt: Output brut vérification

**Résultats nettoyage**:
- Documents: 16 → 9 (7 doublons supprimés)
- Works: 0 → 9 (peuplé + nettoyé)
- Chunks: 5,404 → 5,230 (174 orphelins supprimés)
- chunksCount: Corrigés (231 → 5,230 déclaré = réel)
- Cohérence parfaite: 9 Works = 9 Documents = 9 œuvres

**Modifications code**:
- schema.py: Ajout Work collection avec vectorisation
- utils/weaviate_ingest.py: Support Work ingestion
- utils/word_pipeline.py: Désactivation concepts (problème .lower())
- utils/word_toc_extractor.py: Métadonnées Word correctes
- .gitignore: Exclusion fichiers temporaires (*.wav, output/*, NUL)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
186 lines
6.6 KiB
Python
186 lines
6.6 KiB
Python
#!/usr/bin/env python3
|
||
"""Verify vector index configuration for Chunk and Summary collections.

This script checks if the dynamic index with RQ is properly configured
for vectorized collections. It displays:
- Index type (flat, hnsw, or dynamic)
- Quantization status (RQ enabled/disabled)
- Distance metric
- Dynamic threshold (if applicable)

Usage:
    python verify_vector_index.py
"""
|
||
|
||
import sys
|
||
from typing import Any, Dict
|
||
|
||
import weaviate
|
||
|
||
|
||
def check_collection_index(client: weaviate.WeaviateClient, collection_name: str) -> None:
    """Check and display vector index configuration for a collection.

    Prints the vectorizer, the index type, the dynamic threshold (when the
    index is dynamic), the quantization status, the distance metric and a
    short interpretation. Values are read from the structured configuration
    attributes rather than by searching the textual representation of the
    whole config, which produced false positives (for example the field name
    ``quantizer`` is present in the text even when no quantizer is set).

    Any exception is reported on stdout instead of being raised, so a single
    failing collection does not abort the rest of the report.

    Args:
        client: Connected Weaviate client.
        collection_name: Name of the collection to check.
    """
    try:
        # NOTE(review): simple=False is assumed to return the full collection
        # configuration (with vector_index_type / vector_index_config) in the
        # weaviate v4 client -- confirm against the installed client version.
        collections = client.collections.list_all(simple=False)

        if collection_name not in collections:
            print(f" ❌ Collection '{collection_name}' not found")
            return

        config = collections[collection_name]

        print(f"\n📦 {collection_name}")
        print("─" * 80)

        # Vectorizer: report the module actually configured instead of a
        # hard-coded name, so e.g. another text2vec-* module is not mislabelled.
        vectorizer_name = _plain_text(config.vectorizer)
        vectorizer_key = vectorizer_name.lower()
        if "text2vec" in vectorizer_key:
            print(f" ✓ Vectorizer: {vectorizer_name}")
        elif not vectorizer_key or "none" in vectorizer_key:
            # No vectorizer means there is no vector index worth inspecting.
            # NOTE(review): collections using named vectors may also land
            # here; their per-vector configs are not inspected by this script.
            print(" ℹ Vectorizer: NONE (metadata collection)")
            return
        else:
            print(f" ⚠ Vectorizer: {vectorizer_name}")

        index_config = getattr(config, "vector_index_config", None)

        # Index type
        index_type = _plain_text(getattr(config, "vector_index_type", None)).lower()
        if index_type in ("dynamic", "hnsw", "flat"):
            print(f" • Index Type: {index_type.upper()}")
        else:
            index_type = ""
            print(" • Index Type: UNKNOWN (default HNSW probable)")

        # Dynamic threshold (only meaningful for a dynamic index)
        threshold = getattr(index_config, "threshold", None)
        if index_type == "dynamic" and threshold is not None:
            print(f" • Dynamic Threshold: {threshold}")

        # Quantization
        quantizers = _find_quantizers(index_config)
        rq_locations = sorted(where for where, kind in quantizers.items() if kind == "RQ")
        rq_enabled = bool(rq_locations)
        if rq_enabled:
            print(f" ✓ RQ (Rotational Quantization): ENABLED ({', '.join(rq_locations)})")
        else:
            print(" ⚠ RQ (Rotational Quantization): NOT DETECTED (ou désactivé)")
        for where, kind in sorted(quantizers.items()):
            if kind != "RQ":
                print(f" ℹ Other quantizer on {where}: {kind}")

        # Distance metric
        metric = _plain_text(getattr(index_config, "distance_metric", None)).lower()
        if metric:
            print(f" • Distance Metric: {_distance_label(metric)} (détecté)")

        print("\n Interpretation:")
        if index_type == "dynamic" and rq_enabled:
            print(" ✅ OPTIMIZED: Dynamic index with RQ enabled")
            print(" → Memory savings: ~75% at scale")
            print(" → Auto-switches from flat to HNSW at threshold")
        elif index_type == "dynamic":
            print(" ⚠ DYNAMIC without RQ: Consider enabling RQ for memory savings")
        elif index_type == "hnsw":
            if rq_enabled:
                print(" ✅ HNSW with RQ: Good for large collections")
            else:
                print(" ⚠ HNSW without RQ: Consider enabling RQ for memory savings")
        elif index_type == "flat":
            print(" ℹ FLAT index: Good for small collections (<100k vectors)")
        else:
            print(" ⚠ Unknown index configuration (probably default HNSW)")
            print(" → Collections créées sans config explicite utilisent HNSW par défaut")

    except Exception as e:
        # Deliberate best-effort: this is a diagnostic report, so the error is
        # shown and the caller can continue with the next collection.
        print(f" ❌ Error checking {collection_name}: {e}")


def _plain_text(value: Any) -> str:
    """Return the text of an enum member or plain value ('' for None)."""
    if value is None:
        return ""
    # Enum members expose the wire value through ``.value``; plain strings do not.
    return str(getattr(value, "value", value))


def _quantizer_kind(quantizer: Any) -> str:
    """Return a short upper-case label (e.g. 'RQ', 'PQ') for a quantizer config."""
    # NOTE(review): assumes the client's quantizer config objects either expose
    # quantizer_name() or are named like ``_RQConfig`` -- confirm against the
    # installed weaviate client.
    namer = getattr(quantizer, "quantizer_name", None)
    if callable(namer):
        return str(namer()).upper()
    name = type(quantizer).__name__.lstrip("_")
    if name.endswith("Config"):
        name = name[: -len("Config")]
    return name.upper()


def _find_quantizers(index_config: Any) -> Dict[str, str]:
    """Map each place a quantizer is configured ('index', 'hnsw', 'flat') to its kind."""
    found: Dict[str, str] = {}
    if index_config is None:
        return found

    # A dynamic index is assumed to nest its quantizers under ``hnsw`` / ``flat``
    # sub-configs, while hnsw/flat indexes carry ``quantizer`` directly; missing
    # attributes are simply skipped.
    candidates: Dict[str, Any] = {"index": index_config}
    for part in ("hnsw", "flat"):
        sub_config = getattr(index_config, part, None)
        if sub_config is not None:
            candidates[part] = sub_config

    for where, candidate in candidates.items():
        quantizer = getattr(candidate, "quantizer", None)
        if quantizer is not None:
            found[where] = _quantizer_kind(quantizer)
    return found


def _distance_label(metric: str) -> str:
    """Return the display label for a lower-case distance metric value."""
    labels = {
        "cosine": "COSINE",
        "dot": "DOT PRODUCT",
        "l2-squared": "L2 SQUARED",
    }
    return labels.get(metric, metric.upper())
|
||
|
||
|
||
def main() -> None:
    """Run the full vector index report against the local Weaviate instance.

    Connects to Weaviate on localhost (HTTP 8080, gRPC 50051), prints the
    index configuration of the vectorized and metadata collections, then the
    object count of each known collection. The connection is always closed.
    """
    # Windows consoles may not default to UTF-8; the report prints emoji and
    # accented characters, so switch the stream encoding when possible.
    if sys.platform == "win32" and hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')

    separator = "=" * 80

    print(separator)
    print("VÉRIFICATION DES INDEX VECTORIELS WEAVIATE")
    print(separator)

    client: weaviate.WeaviateClient = weaviate.connect_to_local(
        host="localhost",
        port=8080,
        grpc_port=50051,
    )

    try:
        # Nothing can be inspected until the server answers.
        if not client.is_ready():
            print("\n❌ Weaviate is not ready. Ensure docker-compose is running.")
            return

        print("\n✓ Weaviate is ready")

        collections = client.collections.list_all()
        print(f"✓ Found {len(collections)} collections: {sorted(collections)}")

        # Each section: a banner title followed by the collections it covers.
        sections = (
            ("COLLECTIONS VECTORISÉES", ("Chunk", "Summary")),
            ("COLLECTIONS MÉTADONNÉES (Non vectorisées)", ("Work", "Document")),
        )
        for title, collection_names in sections:
            print("\n" + separator)
            print(title)
            print(separator)
            for collection_name in collection_names:
                check_collection_index(client, collection_name)

        print("\n" + separator)
        print("VÉRIFICATION TERMINÉE")
        print(separator)

        # Object count per collection, skipping collections that do not exist.
        print("\n📊 STATISTIQUES:")
        for name in ("Work", "Document", "Chunk", "Summary"):
            if name not in collections:
                continue
            try:
                aggregate = client.collections.get(name).aggregate
                count = aggregate.over_all(total_count=True).total_count
                print(f" • {name:<12} {count:>8,} objets")
            except Exception as e:
                # Keep going: a failing count must not hide the other ones.
                print(f" • {name:<12} Error: {e}")

    finally:
        client.close()
        print("\n✓ Connexion fermée\n")
|
||
|
||
|
||
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|