Files
linear-coding-agent/generations/library_rag/verify_vector_index.py
David Blanc Brioir 04ee3f9e39 feat: Add data quality verification & cleanup scripts
## Data Quality & Cleanup (Priorities 1-6)

Added comprehensive data quality verification and cleanup system:

**Scripts créés**:
- verify_data_quality.py: Analyse qualité complète œuvre par œuvre
- clean_duplicate_documents.py: Nettoyage doublons Documents
- populate_work_collection.py/clean.py: Peuplement Work collection
- fix_chunks_count.py: Correction chunksCount incohérents
- manage_orphan_chunks.py: Gestion chunks orphelins (3 options)
- clean_orphan_works.py: Suppression Works sans chunks
- add_missing_work.py: Création Work manquant
- generate_schema_stats.py: Génération stats auto
- migrate_add_work_collection.py: Migration sûre Work collection

**Documentation**:
- WEAVIATE_GUIDE_COMPLET.md: Guide consolidé complet (600+ lignes)
- WEAVIATE_SCHEMA.md: Référence schéma rapide
- NETTOYAGE_COMPLETE_RAPPORT.md: Rapport nettoyage session
- ANALYSE_QUALITE_DONNEES.md: Analyse qualité initiale
- rapport_qualite_donnees.txt: Output brut vérification

**Résultats nettoyage**:
- Documents: 16 → 9 (7 doublons supprimés)
- Works: 0 → 9 (peuplé + nettoyé)
- Chunks: 5,404 → 5,230 (174 orphelins supprimés)
- chunksCount: Corrigés (231 → 5,230 déclaré = réel)
- Cohérence parfaite: 9 Works = 9 Documents = 9 œuvres

**Modifications code**:
- schema.py: Ajout Work collection avec vectorisation
- utils/weaviate_ingest.py: Support Work ingestion
- utils/word_pipeline.py: Désactivation concepts (problème .lower())
- utils/word_toc_extractor.py: Métadonnées Word correctes
- .gitignore: Exclusion fichiers temporaires (*.wav, output/*, NUL)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-01 11:57:26 +01:00

186 lines
6.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Verify vector index configuration for Chunk and Summary collections.
This script checks if the dynamic index with RQ is properly configured
for vectorized collections. It displays:
- Index type (flat, hnsw, or dynamic)
- Quantization status (RQ enabled/disabled)
- Distance metric
- Dynamic threshold (if applicable)
Usage:
python verify_vector_index.py
"""
import sys
from typing import Any, Dict
import weaviate
def check_collection_index(client: weaviate.WeaviateClient, collection_name: str) -> None:
    """Check and display vector index configuration for a collection.

    Prints a human-readable report covering the vectorizer, the vector
    index type (flat / hnsw / dynamic), quantization (RQ) status and the
    distance metric, as far as they can be detected from the client's
    collection config object.

    Args:
        client: Connected Weaviate client.
        collection_name: Name of the collection to check.
    """
    try:
        collections = client.collections.list_all()
        if collection_name not in collections:
            print(f" ❌ Collection '{collection_name}' not found")
            return

        config = collections[collection_name]
        print(f"\n📦 {collection_name}")
        # BUG FIX: was print("" * 80), which prints an empty line instead
        # of the intended 80-character separator rule.
        print("-" * 80)

        # Identify the vectorizer; metadata-only collections have none
        # and need no index inspection.
        vectorizer_str: str = str(config.vectorizer)
        if "text2vec" in vectorizer_str.lower():
            print(" ✓ Vectorizer: text2vec-transformers")
        elif "none" in vectorizer_str.lower():
            print(" Vectorizer: NONE (metadata collection)")
            return
        else:
            print(f" ⚠ Vectorizer: {vectorizer_str}")

        # The exact config object shape varies between weaviate-client
        # versions, so probe attributes defensively with hasattr.
        config_dict: Dict[str, Any] = {}
        if hasattr(config, 'vector_index_config'):
            vector_config = config.vector_index_config
            config_dict['vector_config'] = str(vector_config)
            if hasattr(vector_config, 'quantizer'):
                config_dict['quantizer'] = str(vector_config.quantizer)
            if hasattr(vector_config, 'distance_metric'):
                config_dict['distance_metric'] = str(vector_config.distance_metric)

        if config_dict:
            print(" • Configuration détectée:")
            for key, value in config_dict.items():
                print(f" - {key}: {value}")

        # Fallback detection: substring-match on the stringified config.
        # Hoist the lowercased form and the RQ predicate, both of which
        # the original recomputed on every comparison below.
        config_full_str = str(config)
        cfg_lower = config_full_str.lower()
        has_rq = "rq" in cfg_lower or "quantizer" in cfg_lower

        # Index type detection.
        if "dynamic" in cfg_lower:
            print(" • Index Type: DYNAMIC")
        elif "hnsw" in cfg_lower:
            print(" • Index Type: HNSW")
        elif "flat" in cfg_lower:
            print(" • Index Type: FLAT")
        else:
            print(" • Index Type: UNKNOWN (default HNSW probable)")

        # Rotational Quantization detection.
        if has_rq:
            print(" ✓ RQ (Rotational Quantization): Probablement ENABLED")
        else:
            print(" ⚠ RQ (Rotational Quantization): NOT DETECTED (ou désactivé)")

        # Distance metric detection (prints nothing when no metric matches).
        if "cosine" in cfg_lower:
            print(" • Distance Metric: COSINE (détecté)")
        elif "dot" in cfg_lower:
            print(" • Distance Metric: DOT PRODUCT (détecté)")
        elif "l2" in cfg_lower:
            print(" • Distance Metric: L2 SQUARED (détecté)")

        # Human-readable interpretation of the detected combination.
        print("\n Interpretation:")
        if "dynamic" in cfg_lower and has_rq:
            print(" ✅ OPTIMIZED: Dynamic index with RQ enabled")
            print(" → Memory savings: ~75% at scale")
            print(" → Auto-switches from flat to HNSW at threshold")
        elif "hnsw" in cfg_lower:
            if has_rq:
                print(" ✅ HNSW with RQ: Good for large collections")
            else:
                print(" ⚠ HNSW without RQ: Consider enabling RQ for memory savings")
        elif "flat" in cfg_lower:
            print(" FLAT index: Good for small collections (<100k vectors)")
        else:
            print(" ⚠ Unknown index configuration (probably default HNSW)")
            print(" → Collections créées sans config explicite utilisent HNSW par défaut")
    except Exception as e:
        # Broad catch is deliberate: this is a best-effort diagnostic
        # script and one bad collection must not abort the whole report.
        print(f" ❌ Error checking {collection_name}: {e}")
def main() -> None:
    """Entry point: connect to local Weaviate and print the full index report."""
    # Windows consoles default to a legacy code page; switch to UTF-8 so
    # the emoji / accented output below does not raise UnicodeEncodeError.
    if sys.platform == "win32" and hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')

    banner = "=" * 80
    print(banner)
    print("VÉRIFICATION DES INDEX VECTORIELS WEAVIATE")
    print(banner)

    client: weaviate.WeaviateClient = weaviate.connect_to_local(
        host="localhost",
        port=8080,
        grpc_port=50051,
    )
    try:
        # Bail out early when the server is unreachable; the finally
        # block still closes the connection.
        if not client.is_ready():
            print("\n❌ Weaviate is not ready. Ensure docker-compose is running.")
            return
        print("\n✓ Weaviate is ready")

        all_collections = client.collections.list_all()
        print(f"✓ Found {len(all_collections)} collections: {sorted(all_collections.keys())}")

        # Vectorized collections first.
        print("\n" + banner)
        print("COLLECTIONS VECTORISÉES")
        print(banner)
        for vectorized_name in ("Chunk", "Summary"):
            check_collection_index(client, vectorized_name)

        # Metadata-only collections, shown for reference.
        print("\n" + banner)
        print("COLLECTIONS MÉTADONNÉES (Non vectorisées)")
        print(banner)
        for metadata_name in ("Work", "Document"):
            check_collection_index(client, metadata_name)

        print("\n" + banner)
        print("VÉRIFICATION TERMINÉE")
        print(banner)

        # Per-collection object counts via aggregate (works whether or
        # not the collection is vectorized).
        print("\n📊 STATISTIQUES:")
        for coll_name in ["Work", "Document", "Chunk", "Summary"]:
            if coll_name not in all_collections:
                continue
            try:
                handle = client.collections.get(coll_name)
                total = handle.aggregate.over_all(total_count=True).total_count
                print(f"{coll_name:<12} {total:>8,} objets")
            except Exception as exc:
                print(f"{coll_name:<12} Error: {exc}")
    finally:
        client.close()
        print("\n✓ Connexion fermée\n")


if __name__ == "__main__":
    main()