feat: Add data quality verification & cleanup scripts

## Data Quality & Cleanup (Priorities 1-6)

Added a comprehensive data quality verification and cleanup system:

**Scripts créés**:
- verify_data_quality.py: Analyse qualité complète œuvre par œuvre
- clean_duplicate_documents.py: Nettoyage doublons Documents
- populate_work_collection.py/clean.py: Peuplement Work collection
- fix_chunks_count.py: Correction chunksCount incohérents
- manage_orphan_chunks.py: Gestion chunks orphelins (3 options)
- clean_orphan_works.py: Suppression Works sans chunks
- add_missing_work.py: Création Work manquant
- generate_schema_stats.py: Génération stats auto
- migrate_add_work_collection.py: Migration sûre Work collection

**Documentation**:
- WEAVIATE_GUIDE_COMPLET.md: Guide consolidé complet (600+ lignes)
- WEAVIATE_SCHEMA.md: Référence schéma rapide
- NETTOYAGE_COMPLETE_RAPPORT.md: Rapport nettoyage session
- ANALYSE_QUALITE_DONNEES.md: Analyse qualité initiale
- rapport_qualite_donnees.txt: Output brut vérification

**Résultats nettoyage**:
- Documents: 16 → 9 (7 doublons supprimés)
- Works: 0 → 9 (peuplé + nettoyé)
- Chunks: 5,404 → 5,230 (174 orphelins supprimés)
- chunksCount: corrigés (231 → 5,230 ; déclaré = réel)
- Cohérence parfaite: 9 Works = 9 Documents = 9 œuvres

**Modifications code**:
- schema.py: Ajout Work collection avec vectorisation
- utils/weaviate_ingest.py: Support Work ingestion
- utils/word_pipeline.py: Désactivation concepts (problème .lower())
- utils/word_toc_extractor.py: Métadonnées Word correctes
- .gitignore: Exclusion fichiers temporaires (*.wav, output/*, NUL)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-01 11:57:26 +01:00
parent 845ffb4b06
commit 04ee3f9e39
26 changed files with 6945 additions and 16 deletions

View File

@@ -0,0 +1,185 @@
#!/usr/bin/env python3
"""Verify vector index configuration for Chunk and Summary collections.
This script checks if the dynamic index with RQ is properly configured
for vectorized collections. It displays:
- Index type (flat, hnsw, or dynamic)
- Quantization status (RQ enabled/disabled)
- Distance metric
- Dynamic threshold (if applicable)
Usage:
python verify_vector_index.py
"""
import sys
from typing import Any, Dict
import weaviate
def _detect_index_type(config: Any, vector_config: Any, config_text: str) -> str:
    """Return ``"dynamic"``, ``"hnsw"``, ``"flat"`` or ``"unknown"``.

    Structured sources are consulted first; the full (lower-cased) config
    representation is only used as a last resort, because it contains
    unrelated field names such as ``dynamic_ef_min`` or
    ``flat_search_cutoff`` that would otherwise match the wrong index type.
    """
    probes = []
    declared = getattr(config, "vector_index_type", None)
    if declared is not None:
        # Enum members expose ``.value``; plain strings are used as-is.
        probes.append(str(getattr(declared, "value", declared)))
    if vector_config is not None:
        # NOTE(review): assumes the config class name carries the index kind
        # (e.g. "...HNSW", "...Flat", "...Dynamic") - confirm against the
        # installed client version.
        probes.append(type(vector_config).__name__)
    probes.append(config_text)

    for probe in probes:
        lowered = probe.lower()
        for marker in ("dynamic", "hnsw", "flat"):
            if marker in lowered:
                return marker
    return "unknown"


def _rq_enabled(vector_config: Any, config_text: str) -> bool:
    """Tell whether a rotational quantizer appears to be configured.

    The bare word "quantizer" is deliberately not treated as evidence: it is
    present in the representation even when the field is unset
    (``quantizer=None``).
    """
    quantizer = getattr(vector_config, "quantizer", None)
    if quantizer is not None:
        # Decide from the quantizer object itself (type name + repr).
        return "rq" in f"{type(quantizer).__name__} {quantizer}".lower()
    # No top-level quantizer: look for an RQ config nested deeper in the
    # index configuration, or in the whole config when nothing better exists.
    haystack = str(vector_config).lower() if vector_config is not None else config_text
    return "rq" in haystack


def _detect_distance(vector_config: Any, config_text: str) -> str:
    """Return the display label of the distance metric, or ``""`` if unknown."""
    metric = getattr(vector_config, "distance_metric", None)
    if metric is not None:
        probe = str(getattr(metric, "value", metric)).lower()
    else:
        probe = config_text
    for marker, label in (("cosine", "COSINE"), ("dot", "DOT PRODUCT"), ("l2", "L2 SQUARED")):
        if marker in probe:
            return label
    return ""


def check_collection_index(client: "weaviate.WeaviateClient", collection_name: str) -> None:
    """Check and display vector index configuration for a collection.

    Prints the vectorizer, the index type, whether RQ looks enabled, the
    distance metric and a short interpretation. Nothing is returned; any
    exception raised while inspecting the collection is reported on stdout
    instead of being propagated, so one failing collection does not abort
    the whole report.

    Args:
        client: Connected Weaviate client.
        collection_name: Name of the collection to check.
    """
    try:
        # Full (non-simplified) configs are requested so that the vector
        # index settings are part of the returned objects.
        collections = client.collections.list_all(simple=False)
        if collection_name not in collections:
            print(f" ❌ Collection '{collection_name}' not found")
            return

        config = collections[collection_name]
        print(f"\n📦 {collection_name}")
        print("-" * 80)

        # Check vectorizer
        vectorizer_str: str = str(config.vectorizer)
        vectorizer_lower = vectorizer_str.lower()
        if "text2vec" in vectorizer_lower:
            print(" ✓ Vectorizer: text2vec-transformers")
        elif "none" in vectorizer_lower:
            # Nothing to inspect for collections without vectors.
            print(" Vectorizer: NONE (metadata collection)")
            return
        else:
            print(f" ⚠ Vectorizer: {vectorizer_str}")

        # Collect the structured index settings when the client exposes them.
        vector_config = getattr(config, "vector_index_config", None)
        config_dict: Dict[str, Any] = {}
        if vector_config is not None:
            config_dict["vector_config"] = str(vector_config)
            if hasattr(vector_config, "quantizer"):
                config_dict["quantizer"] = str(vector_config.quantizer)
            if hasattr(vector_config, "distance_metric"):
                config_dict["distance_metric"] = str(vector_config.distance_metric)

        # Display available info
        if config_dict:
            print(" • Configuration détectée:")
            for key, value in config_dict.items():
                print(f" - {key}: {value}")

        # Lower-cased representation, used only as a fallback by the helpers.
        config_text = str(config).lower()
        index_type = _detect_index_type(config, vector_config, config_text)
        has_rq = _rq_enabled(vector_config, config_text)
        distance = _detect_distance(vector_config, config_text)

        # Index type
        if index_type == "unknown":
            print(" • Index Type: UNKNOWN (default HNSW probable)")
        else:
            print(f" • Index Type: {index_type.upper()}")

        # RQ status
        if has_rq:
            print(" ✓ RQ (Rotational Quantization): Probablement ENABLED")
        else:
            print(" ⚠ RQ (Rotational Quantization): NOT DETECTED (ou désactivé)")

        # Distance metric (silently skipped when it cannot be determined)
        if distance:
            print(f" • Distance Metric: {distance} (détecté)")

        # Interpretation is derived from the values detected above so that
        # the summary can never contradict the lines printed before it.
        print("\n Interpretation:")
        if index_type == "dynamic" and has_rq:
            print(" ✅ OPTIMIZED: Dynamic index with RQ enabled")
            print(" → Memory savings: ~75% at scale")
            print(" → Auto-switches from flat to HNSW at threshold")
        elif index_type == "dynamic":
            print(" ⚠ Dynamic index without RQ: Consider enabling RQ for memory savings")
        elif index_type == "hnsw":
            if has_rq:
                print(" ✅ HNSW with RQ: Good for large collections")
            else:
                print(" ⚠ HNSW without RQ: Consider enabling RQ for memory savings")
        elif index_type == "flat":
            print(" FLAT index: Good for small collections (<100k vectors)")
        else:
            print(" ⚠ Unknown index configuration (probably default HNSW)")
            print(" → Collections créées sans config explicite utilisent HNSW par défaut")
    except Exception as e:
        # Diagnostic script: report the problem and let the caller continue
        # with the next collection.
        print(f" ❌ Error checking {collection_name}: {e}")
def main() -> None:
    """Run the vector index report against the local Weaviate instance.

    Connects to ``localhost`` (HTTP 8080, gRPC 50051), prints the index
    configuration of the known collections followed by their object counts,
    and always closes the connection before returning.
    """
    rule = "=" * 80

    def banner(title: str, spaced: bool = True) -> None:
        # A title framed by two rules, optionally preceded by a blank line.
        print(("\n" if spaced else "") + rule)
        print(title)
        print(rule)

    # Force UTF-8 output on Windows consoles (the report prints symbols).
    if sys.platform == "win32" and hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')

    banner("VÉRIFICATION DES INDEX VECTORIELS WEAVIATE", spaced=False)

    client: weaviate.WeaviateClient = weaviate.connect_to_local(
        host="localhost",
        port=8080,
        grpc_port=50051,
    )

    try:
        # Bail out early when the server does not answer the readiness probe.
        if not client.is_ready():
            print("\n❌ Weaviate is not ready. Ensure docker-compose is running.")
            return
        print("\n✓ Weaviate is ready")

        # Inventory of every collection known to the server.
        collections = client.collections.list_all()
        print(f"✓ Found {len(collections)} collections: {sorted(collections.keys())}")

        # Each section: a banner, then one index check per listed collection.
        sections = (
            ("COLLECTIONS VECTORISÉES", ("Chunk", "Summary")),
            ("COLLECTIONS MÉTADONNÉES (Non vectorisées)", ("Work", "Document")),
        )
        for title, members in sections:
            banner(title)
            for member in members:
                check_collection_index(client, member)

        banner("VÉRIFICATION TERMINÉE")

        # Object count per collection, skipping those absent from the server.
        print("\n📊 STATISTIQUES:")
        for name in ("Work", "Document", "Chunk", "Summary"):
            if name not in collections:
                continue
            try:
                totals = client.collections.get(name).aggregate.over_all(total_count=True)
                count = totals.total_count
                print(f"{name:<12} {count:>8,} objets")
            except Exception as e:
                print(f"{name:<12} Error: {e}")
    finally:
        client.close()
        print("\n✓ Connexion fermée\n")
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()