Files
linear-coding-agent/generations/library_rag/verify_vector_index.py
David Blanc Brioir 04ee3f9e39 feat: Add data quality verification & cleanup scripts
## Data Quality & Cleanup (Priorities 1-6)

Added comprehensive data quality verification and cleanup system:

**Scripts créés**:
- verify_data_quality.py: Analyse qualité complète œuvre par œuvre
- clean_duplicate_documents.py: Nettoyage doublons Documents
- populate_work_collection.py/clean.py: Peuplement Work collection
- fix_chunks_count.py: Correction chunksCount incohérents
- manage_orphan_chunks.py: Gestion chunks orphelins (3 options)
- clean_orphan_works.py: Suppression Works sans chunks
- add_missing_work.py: Création Work manquant
- generate_schema_stats.py: Génération stats auto
- migrate_add_work_collection.py: Migration sûre Work collection

**Documentation**:
- WEAVIATE_GUIDE_COMPLET.md: Guide consolidé complet (600+ lignes)
- WEAVIATE_SCHEMA.md: Référence schéma rapide
- NETTOYAGE_COMPLETE_RAPPORT.md: Rapport nettoyage session
- ANALYSE_QUALITE_DONNEES.md: Analyse qualité initiale
- rapport_qualite_donnees.txt: Output brut vérification

**Résultats nettoyage**:
- Documents: 16 → 9 (7 doublons supprimés)
- Works: 0 → 9 (peuplé + nettoyé)
- Chunks: 5,404 → 5,230 (174 orphelins supprimés)
- chunksCount: Corrigés (231 → 5,230 déclaré = réel)
- Cohérence parfaite: 9 Works = 9 Documents = 9 œuvres

**Modifications code**:
- schema.py: Ajout Work collection avec vectorisation
- utils/weaviate_ingest.py: Support Work ingestion
- utils/word_pipeline.py: Désactivation concepts (problème .lower())
- utils/word_toc_extractor.py: Métadonnées Word correctes
- .gitignore: Exclusion fichiers temporaires (*.wav, output/*, NUL)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-01 11:57:26 +01:00

186 lines
6.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Verify vector index configuration for Chunk and Summary collections.
This script checks if the dynamic index with RQ is properly configured
for vectorized collections. It displays:
- Index type (flat, hnsw, or dynamic)
- Quantization status (RQ enabled/disabled)
- Distance metric
- Dynamic threshold (if applicable)
Usage:
python verify_vector_index.py
"""
import sys
from typing import Any, Dict
import weaviate
def check_collection_index(client: weaviate.WeaviateClient, collection_name: str) -> None:
    """Check and display vector index configuration for a collection.

    Prints a human-readable report covering the vectorizer, the vector
    index type (flat / hnsw / dynamic), quantization (RQ) status and the
    distance metric, as far as they can be detected from the client's
    collection config object.

    Args:
        client: Connected Weaviate client.
        collection_name: Name of the collection to check.
    """
    try:
        collections = client.collections.list_all()
        if collection_name not in collections:
            print(f" ❌ Collection '{collection_name}' not found")
            return

        config = collections[collection_name]
        print(f"\n📦 {collection_name}")
        # BUG FIX: was print("" * 80), which prints an empty line instead
        # of the intended 80-character separator rule.
        print("-" * 80)

        # Identify the vectorizer; metadata-only collections have none
        # and need no index inspection.
        vectorizer_str: str = str(config.vectorizer)
        if "text2vec" in vectorizer_str.lower():
            print(" ✓ Vectorizer: text2vec-transformers")
        elif "none" in vectorizer_str.lower():
            print(" Vectorizer: NONE (metadata collection)")
            return
        else:
            print(f" ⚠ Vectorizer: {vectorizer_str}")

        # The exact config object shape varies between weaviate-client
        # versions, so probe attributes defensively with hasattr.
        config_dict: Dict[str, Any] = {}
        if hasattr(config, 'vector_index_config'):
            vector_config = config.vector_index_config
            config_dict['vector_config'] = str(vector_config)
            if hasattr(vector_config, 'quantizer'):
                config_dict['quantizer'] = str(vector_config.quantizer)
            if hasattr(vector_config, 'distance_metric'):
                config_dict['distance_metric'] = str(vector_config.distance_metric)

        if config_dict:
            print(" • Configuration détectée:")
            for key, value in config_dict.items():
                print(f" - {key}: {value}")

        # Fallback detection: substring-match on the stringified config.
        # Hoist the lowercased form and the RQ predicate, both of which
        # the original recomputed on every comparison below.
        config_full_str = str(config)
        cfg_lower = config_full_str.lower()
        has_rq = "rq" in cfg_lower or "quantizer" in cfg_lower

        # Index type detection.
        if "dynamic" in cfg_lower:
            print(" • Index Type: DYNAMIC")
        elif "hnsw" in cfg_lower:
            print(" • Index Type: HNSW")
        elif "flat" in cfg_lower:
            print(" • Index Type: FLAT")
        else:
            print(" • Index Type: UNKNOWN (default HNSW probable)")

        # Rotational Quantization detection.
        if has_rq:
            print(" ✓ RQ (Rotational Quantization): Probablement ENABLED")
        else:
            print(" ⚠ RQ (Rotational Quantization): NOT DETECTED (ou désactivé)")

        # Distance metric detection (prints nothing when no metric matches).
        if "cosine" in cfg_lower:
            print(" • Distance Metric: COSINE (détecté)")
        elif "dot" in cfg_lower:
            print(" • Distance Metric: DOT PRODUCT (détecté)")
        elif "l2" in cfg_lower:
            print(" • Distance Metric: L2 SQUARED (détecté)")

        # Human-readable interpretation of the detected combination.
        print("\n Interpretation:")
        if "dynamic" in cfg_lower and has_rq:
            print(" ✅ OPTIMIZED: Dynamic index with RQ enabled")
            print(" → Memory savings: ~75% at scale")
            print(" → Auto-switches from flat to HNSW at threshold")
        elif "hnsw" in cfg_lower:
            if has_rq:
                print(" ✅ HNSW with RQ: Good for large collections")
            else:
                print(" ⚠ HNSW without RQ: Consider enabling RQ for memory savings")
        elif "flat" in cfg_lower:
            print(" FLAT index: Good for small collections (<100k vectors)")
        else:
            print(" ⚠ Unknown index configuration (probably default HNSW)")
            print(" → Collections créées sans config explicite utilisent HNSW par défaut")
    except Exception as e:
        # Broad catch is deliberate: this is a best-effort diagnostic
        # script and one bad collection must not abort the whole report.
        print(f" ❌ Error checking {collection_name}: {e}")
def main() -> None:
    """Entry point: connect to local Weaviate and print the full index report."""
    # Windows consoles default to a legacy code page; switch to UTF-8 so
    # the emoji / accented output below does not raise UnicodeEncodeError.
    if sys.platform == "win32" and hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')

    banner = "=" * 80
    print(banner)
    print("VÉRIFICATION DES INDEX VECTORIELS WEAVIATE")
    print(banner)

    client: weaviate.WeaviateClient = weaviate.connect_to_local(
        host="localhost",
        port=8080,
        grpc_port=50051,
    )
    try:
        # Bail out early when the server is unreachable; the finally
        # block still closes the connection.
        if not client.is_ready():
            print("\n❌ Weaviate is not ready. Ensure docker-compose is running.")
            return
        print("\n✓ Weaviate is ready")

        all_collections = client.collections.list_all()
        print(f"✓ Found {len(all_collections)} collections: {sorted(all_collections.keys())}")

        # Vectorized collections first.
        print("\n" + banner)
        print("COLLECTIONS VECTORISÉES")
        print(banner)
        for vectorized_name in ("Chunk", "Summary"):
            check_collection_index(client, vectorized_name)

        # Metadata-only collections, shown for reference.
        print("\n" + banner)
        print("COLLECTIONS MÉTADONNÉES (Non vectorisées)")
        print(banner)
        for metadata_name in ("Work", "Document"):
            check_collection_index(client, metadata_name)

        print("\n" + banner)
        print("VÉRIFICATION TERMINÉE")
        print(banner)

        # Per-collection object counts via aggregate (works whether or
        # not the collection is vectorized).
        print("\n📊 STATISTIQUES:")
        for coll_name in ["Work", "Document", "Chunk", "Summary"]:
            if coll_name not in all_collections:
                continue
            try:
                handle = client.collections.get(coll_name)
                total = handle.aggregate.over_all(total_count=True).total_count
                print(f"{coll_name:<12} {total:>8,} objets")
            except Exception as exc:
                print(f"{coll_name:<12} Error: {exc}")
    finally:
        client.close()
        print("\n✓ Connexion fermée\n")


if __name__ == "__main__":
    main()