feat: Add vectorized summary field and migration tools
- Add 'summary' field to Chunk collection (vectorized with text2vec)
- Migrate from Dynamic index to HNSW + RQ for both Chunk and Summary
- Add LLM summarizer module (utils/llm_summarizer.py)
- Add migration scripts (migrate_add_summary.py, restore_*.py)
- Add summary generation utilities and progress tracking
- Add testing and cleaning tools (outils_test_and_cleaning/)
- Add comprehensive documentation (ANALYSE_*.md, guides)
- Remove obsolete files (linear_config.py, old test files)
- Update .gitignore to exclude backups and temp files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
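For reference, a minimal sketch of the index change described above, assuming weaviate-client v4 against a Weaviate server >= 1.32 (where RQ is available). The property list is illustrative only, not the project's actual schema, and Configure.VectorIndex.Quantizer.rq() is an assumption about the client API:

    import weaviate
    from weaviate.classes.config import Configure, DataType, Property

    client = weaviate.connect_to_local()
    try:
        # Explicit HNSW index with RQ quantization, replacing the old Dynamic index
        client.collections.create(
            "Chunk",
            vectorizer_config=Configure.Vectorizer.text2vec_transformers(),
            vector_index_config=Configure.VectorIndex.hnsw(
                quantizer=Configure.VectorIndex.Quantizer.rq(),  # assumed API, Weaviate >= 1.32
            ),
            properties=[
                Property(name="text", data_type=DataType.TEXT),
                Property(name="summary", data_type=DataType.TEXT),  # the new vectorized field
            ],
        )
    finally:
        client.close()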
@@ -0,0 +1,164 @@
#!/usr/bin/env python3
"""Generate statistics for WEAVIATE_SCHEMA.md documentation.

This script queries Weaviate and generates updated statistics to keep
the schema documentation in sync with reality.

Usage:
    python generate_schema_stats.py

Output:
    Prints formatted markdown table with current statistics that can be
    copy-pasted into WEAVIATE_SCHEMA.md
"""

import sys
from datetime import datetime
from typing import Dict

import weaviate


def get_collection_stats(client: weaviate.WeaviateClient) -> Dict[str, int]:
    """Get object counts for all collections.

    Args:
        client: Connected Weaviate client.

    Returns:
        Dict mapping collection name to object count.
    """
    stats: Dict[str, int] = {}

    collections = client.collections.list_all()

    for name in ["Work", "Document", "Chunk", "Summary"]:
        if name in collections:
            try:
                coll = client.collections.get(name)
                result = coll.aggregate.over_all(total_count=True)
                stats[name] = result.total_count
            except Exception as e:
                print(f"Warning: Could not get count for {name}: {e}", file=sys.stderr)
                stats[name] = 0
        else:
            stats[name] = 0

    return stats


def print_markdown_stats(stats: Dict[str, int]) -> None:
    """Print statistics in markdown table format for WEAVIATE_SCHEMA.md.

    Args:
        stats: Dict mapping collection name to object count.
    """
    total_vectors = stats["Chunk"] + stats["Summary"]
    ratio = stats["Summary"] / stats["Chunk"] if stats["Chunk"] > 0 else 0

    today = datetime.now().strftime("%d/%m/%Y")

    print(f"## Contenu actuel (au {today})")
    print()
    print(f"**Dernière vérification** : {datetime.now().strftime('%d %B %Y')} via `generate_schema_stats.py`")
    print()
    print("### Statistiques par collection")
    print()
    print("| Collection | Objets | Vectorisé | Utilisation |")
    print("|------------|--------|-----------|-------------|")
    print(f"| **Chunk** | **{stats['Chunk']:,}** | ✅ Oui | Recherche sémantique principale |")
    print(f"| **Summary** | **{stats['Summary']:,}** | ✅ Oui | Recherche hiérarchique (chapitres/sections) |")
    print(f"| **Document** | **{stats['Document']:,}** | ❌ Non | Métadonnées d'éditions |")
    print(f"| **Work** | **{stats['Work']:,}** | ✅ Oui* | Métadonnées d'œuvres (vide, prêt pour migration) |")
    print()
    print(f"**Total vecteurs** : {total_vectors:,} ({stats['Chunk']:,} chunks + {stats['Summary']:,} summaries)")
    print(f"**Ratio Summary/Chunk** : {ratio:.2f} ", end="")

    if ratio > 1:
        print("(plus de summaries que de chunks, bon pour recherche hiérarchique)")
    else:
        print("(plus de chunks que de summaries)")

    print()
    print("\\* *Work est configuré avec vectorisation (depuis migration 2026-01) mais n'a pas encore d'objets*")
    print()

    # Additional insights
    print("### Insights")
    print()

    if stats["Chunk"] > 0:
        avg_summaries_per_chunk = stats["Summary"] / stats["Chunk"]
        print(f"- **Granularité** : {avg_summaries_per_chunk:.1f} summaries par chunk en moyenne")

    if stats["Document"] > 0:
        avg_chunks_per_doc = stats["Chunk"] / stats["Document"]
        avg_summaries_per_doc = stats["Summary"] / stats["Document"]
        print(f"- **Taille moyenne document** : {avg_chunks_per_doc:.0f} chunks, {avg_summaries_per_doc:.0f} summaries")

    if stats["Chunk"] >= 50000:
        print("- **⚠️ Index Switch** : Collection Chunk a dépassé 50k → HNSW activé (Dynamic index)")
    elif stats["Chunk"] >= 40000:
        print(f"- **📊 Proche seuil** : {50000 - stats['Chunk']:,} chunks avant switch FLAT→HNSW (50k)")

    if stats["Summary"] >= 10000:
        print("- **⚠️ Index Switch** : Collection Summary a dépassé 10k → HNSW activé (Dynamic index)")
    elif stats["Summary"] >= 8000:
        print(f"- **📊 Proche seuil** : {10000 - stats['Summary']:,} summaries avant switch FLAT→HNSW (10k)")

    # Memory estimation
    vectors_total = total_vectors
    # BGE-M3: 1024 dim × 4 bytes (float32) = 4KB per vector
    # + metadata ~1KB per object
    estimated_ram_gb = (vectors_total * 5) / (1024 * 1024)  # 5KB per vector with metadata
    estimated_ram_with_rq_gb = estimated_ram_gb * 0.25  # RQ saves 75%

    print()
    print(f"- **RAM estimée** : ~{estimated_ram_gb:.1f} GB sans RQ, ~{estimated_ram_with_rq_gb:.1f} GB avec RQ (économie 75%)")

    print()


def main() -> None:
    """Main entry point."""
    # Fix encoding for Windows console
    if sys.platform == "win32" and hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')

    print("=" * 80, file=sys.stderr)
    print("GÉNÉRATION DES STATISTIQUES WEAVIATE", file=sys.stderr)
    print("=" * 80, file=sys.stderr)
    print(file=sys.stderr)

    client: weaviate.WeaviateClient = weaviate.connect_to_local(
        host="localhost",
        port=8080,
        grpc_port=50051,
    )

    try:
        if not client.is_ready():
            print("❌ Weaviate is not ready. Ensure docker-compose is running.", file=sys.stderr)
            sys.exit(1)

        print("✓ Weaviate is ready", file=sys.stderr)
        print("✓ Querying collections...", file=sys.stderr)

        stats = get_collection_stats(client)

        print("✓ Statistics retrieved", file=sys.stderr)
        print(file=sys.stderr)
        print("=" * 80, file=sys.stderr)
        print("MARKDOWN OUTPUT (copy to WEAVIATE_SCHEMA.md):", file=sys.stderr)
        print("=" * 80, file=sys.stderr)
        print(file=sys.stderr)

        # Print to stdout (can be redirected to file)
        print_markdown_stats(stats)

    finally:
        client.close()


if __name__ == "__main__":
    main()
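Because the markdown goes to stdout and all progress messages go to stderr, the stats can be captured straight into a file while the progress stays on the console, e.g. (stats_snapshot.md is an arbitrary name):

    python generate_schema_stats.py > stats_snapshot.md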
@@ -0,0 +1,480 @@
#!/usr/bin/env python3
"""Manage orphan chunks (chunks without a parent document).

A chunk is an orphan when its document.sourceId does not match any object
in the Document collection.

This script offers three options:
1. DELETE the orphan chunks (permanent loss)
2. CREATE the missing documents (restoration)
3. LIST only (do nothing)

Usage:
    # List the orphans (default)
    python manage_orphan_chunks.py

    # Create the missing documents for the orphans
    python manage_orphan_chunks.py --create-documents

    # Delete the orphan chunks (WARNING: data loss)
    python manage_orphan_chunks.py --delete-orphans
"""

import argparse
import sys
from collections import defaultdict
from datetime import datetime
from typing import Any, Dict, List, Set

import weaviate


def identify_orphan_chunks(
    client: weaviate.WeaviateClient,
) -> Dict[str, List[Any]]:
    """Identify orphan chunks (chunks without a parent document).

    Args:
        client: Connected Weaviate client.

    Returns:
        Dict mapping orphan sourceId to list of orphan chunks.
    """
    print("📊 Récupération de tous les chunks...")

    chunk_collection = client.collections.get("Chunk")
    chunks_response = chunk_collection.query.fetch_objects(
        limit=10000,
    )

    all_chunks = chunks_response.objects
    print(f" ✓ {len(all_chunks)} chunks récupérés")
    print()

    print("📊 Récupération de tous les documents...")

    doc_collection = client.collections.get("Document")
    docs_response = doc_collection.query.fetch_objects(
        limit=1000,
    )

    print(f" ✓ {len(docs_response.objects)} documents récupérés")
    print()

    # Build the set of existing sourceIds
    existing_source_ids: Set[str] = set()
    for doc_obj in docs_response.objects:
        source_id = doc_obj.properties.get("sourceId")
        if source_id:
            existing_source_ids.add(source_id)

    print(f"📊 {len(existing_source_ids)} sourceIds existants dans Document")
    print()

    # Identify the orphans
    orphan_chunks_by_source: Dict[str, List[Any]] = defaultdict(list)
    orphan_source_ids: Set[str] = set()

    for chunk_obj in all_chunks:
        props = chunk_obj.properties
        if "document" in props and isinstance(props["document"], dict):
            source_id = props["document"].get("sourceId")

            if source_id and source_id not in existing_source_ids:
                orphan_chunks_by_source[source_id].append(chunk_obj)
                orphan_source_ids.add(source_id)

    print(f"🔍 {len(orphan_source_ids)} sourceIds orphelins détectés")
    print(f"🔍 {sum(len(chunks) for chunks in orphan_chunks_by_source.values())} chunks orphelins au total")
    print()

    return orphan_chunks_by_source


def display_orphans_report(orphan_chunks: Dict[str, List[Any]]) -> None:
    """Display the orphan chunks report.

    Args:
        orphan_chunks: Dict mapping sourceId to list of orphan chunks.
    """
    if not orphan_chunks:
        print("✅ Aucun chunk orphelin détecté !")
        print()
        return

    print("=" * 80)
    print("CHUNKS ORPHELINS DÉTECTÉS")
    print("=" * 80)
    print()

    total_orphans = sum(len(chunks) for chunks in orphan_chunks.values())

    print(f"📌 {len(orphan_chunks)} sourceIds orphelins")
    print(f"📌 {total_orphans:,} chunks orphelins au total")
    print()

    for i, (source_id, chunks) in enumerate(sorted(orphan_chunks.items()), 1):
        print(f"[{i}/{len(orphan_chunks)}] {source_id}")
        print("─" * 80)
        print(f" Chunks orphelins : {len(chunks):,}")

        # Extract metadata from the first chunk
        if chunks:
            first_chunk = chunks[0].properties
            work = first_chunk.get("work", {})

            if isinstance(work, dict):
                title = work.get("title", "N/A")
                author = work.get("author", "N/A")
                print(f" Œuvre : {title}")
                print(f" Auteur : {author}")

            # Detected languages
            languages = set()
            for chunk in chunks:
                lang = chunk.properties.get("language")
                if lang:
                    languages.add(lang)

            if languages:
                print(f" Langues : {', '.join(sorted(languages))}")

        print()

    print("=" * 80)
    print()


def create_missing_documents(
    client: weaviate.WeaviateClient,
    orphan_chunks: Dict[str, List[Any]],
    dry_run: bool = True,
) -> Dict[str, int]:
    """Create the missing documents for the orphan chunks.

    Args:
        client: Connected Weaviate client.
        orphan_chunks: Dict mapping sourceId to list of orphan chunks.
        dry_run: If True, only simulate (don't actually create).

    Returns:
        Dict with statistics: created, errors.
    """
    stats = {
        "created": 0,
        "errors": 0,
    }

    if not orphan_chunks:
        print("✅ Aucun document à créer (pas d'orphelins)")
        return stats

    if dry_run:
        print("🔍 MODE DRY-RUN (simulation, aucune création réelle)")
    else:
        print("⚠️ MODE EXÉCUTION (création réelle)")

    print("=" * 80)
    print()

    doc_collection = client.collections.get("Document")

    for source_id, chunks in sorted(orphan_chunks.items()):
        print(f"Traitement de {source_id}...")

        # Extract metadata from the chunks
        if not chunks:
            print(" ⚠️ Aucun chunk, skip")
            continue

        first_chunk = chunks[0].properties
        work = first_chunk.get("work", {})

        # Build the Document object with minimal metadata
        doc_obj: Dict[str, Any] = {
            "sourceId": source_id,
            "title": "N/A",
            "author": "N/A",
            "edition": None,
            "language": "en",
            "pages": 0,
            "chunksCount": len(chunks),
            "toc": None,
            "hierarchy": None,
            "createdAt": datetime.now(),
        }

        # Enrich with work metadata when available
        if isinstance(work, dict):
            if work.get("title"):
                doc_obj["title"] = work["title"]
            if work.get("author"):
                doc_obj["author"] = work["author"]

            # Nested work object
            doc_obj["work"] = {
                "title": work.get("title", "N/A"),
                "author": work.get("author", "N/A"),
            }

        # Detect the language
        languages = set()
        for chunk in chunks:
            lang = chunk.properties.get("language")
            if lang:
                languages.add(lang)

        if len(languages) == 1:
            doc_obj["language"] = list(languages)[0]

        print(f" Chunks : {len(chunks):,}")
        print(f" Titre : {doc_obj['title']}")
        print(f" Auteur : {doc_obj['author']}")
        print(f" Langue : {doc_obj['language']}")

        if dry_run:
            print(f" 🔍 [DRY-RUN] Créerait Document : {doc_obj}")
            stats["created"] += 1
        else:
            try:
                uuid = doc_collection.data.insert(doc_obj)
                print(f" ✅ Créé UUID {uuid}")
                stats["created"] += 1
            except Exception as e:
                print(f" ⚠️ Erreur création : {e}")
                stats["errors"] += 1

        print()

    print("=" * 80)
    print("RÉSUMÉ")
    print("=" * 80)
    print(f" Documents créés : {stats['created']}")
    print(f" Erreurs : {stats['errors']}")
    print()

    return stats


def delete_orphan_chunks(
    client: weaviate.WeaviateClient,
    orphan_chunks: Dict[str, List[Any]],
    dry_run: bool = True,
) -> Dict[str, int]:
    """Delete the orphan chunks.

    Args:
        client: Connected Weaviate client.
        orphan_chunks: Dict mapping sourceId to list of orphan chunks.
        dry_run: If True, only simulate (don't actually delete).

    Returns:
        Dict with statistics: deleted, errors.
    """
    stats = {
        "deleted": 0,
        "errors": 0,
    }

    if not orphan_chunks:
        print("✅ Aucun chunk à supprimer (pas d'orphelins)")
        return stats

    total_to_delete = sum(len(chunks) for chunks in orphan_chunks.values())

    if dry_run:
        print("🔍 MODE DRY-RUN (simulation, aucune suppression réelle)")
    else:
        print(f"⚠️ MODE EXÉCUTION (suppression réelle de {total_to_delete:,} chunks)")

    print("=" * 80)
    print()

    chunk_collection = client.collections.get("Chunk")

    for source_id, chunks in sorted(orphan_chunks.items()):
        print(f"Traitement de {source_id} ({len(chunks):,} chunks)...")

        for chunk_obj in chunks:
            if dry_run:
                # In dry-run mode, only count
                stats["deleted"] += 1
            else:
                try:
                    chunk_collection.data.delete_by_id(chunk_obj.uuid)
                    stats["deleted"] += 1
                except Exception as e:
                    print(f" ⚠️ Erreur suppression UUID {chunk_obj.uuid}: {e}")
                    stats["errors"] += 1

        if dry_run:
            print(f" 🔍 [DRY-RUN] Supprimerait {len(chunks):,} chunks")
        else:
            print(f" ✅ Supprimé {len(chunks):,} chunks")

        print()

    print("=" * 80)
    print("RÉSUMÉ")
    print("=" * 80)
    print(f" Chunks supprimés : {stats['deleted']:,}")
    print(f" Erreurs : {stats['errors']}")
    print()

    return stats


def verify_operation(client: weaviate.WeaviateClient) -> None:
    """Verify the result of the operation.

    Args:
        client: Connected Weaviate client.
    """
    print("=" * 80)
    print("VÉRIFICATION POST-OPÉRATION")
    print("=" * 80)
    print()

    orphan_chunks = identify_orphan_chunks(client)

    if not orphan_chunks:
        print("✅ Aucun chunk orphelin restant !")
        print()

        # Final statistics
        chunk_coll = client.collections.get("Chunk")
        chunk_result = chunk_coll.aggregate.over_all(total_count=True)

        doc_coll = client.collections.get("Document")
        doc_result = doc_coll.aggregate.over_all(total_count=True)

        print(f"📊 Chunks totaux : {chunk_result.total_count:,}")
        print(f"📊 Documents totaux : {doc_result.total_count:,}")
        print()

    else:
        total_orphans = sum(len(chunks) for chunks in orphan_chunks.values())
        print(f"⚠️ {total_orphans:,} chunks orphelins persistent")
        print()

    print("=" * 80)
    print()


def main() -> None:
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description="Gérer les chunks orphelins (sans document parent)"
    )
    parser.add_argument(
        "--create-documents",
        action="store_true",
        help="Créer les documents manquants pour les orphelins",
    )
    parser.add_argument(
        "--delete-orphans",
        action="store_true",
        help="Supprimer les chunks orphelins (ATTENTION: perte de données)",
    )
    parser.add_argument(
        "--execute",
        action="store_true",
        help="Exécuter l'opération (par défaut: dry-run)",
    )

    args = parser.parse_args()

    # Fix encoding for Windows console
    if sys.platform == "win32" and hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')

    print("=" * 80)
    print("GESTION DES CHUNKS ORPHELINS")
    print("=" * 80)
    print()

    client = weaviate.connect_to_local(
        host="localhost",
        port=8080,
        grpc_port=50051,
    )

    try:
        if not client.is_ready():
            print("❌ Weaviate is not ready. Ensure docker-compose is running.")
            sys.exit(1)

        print("✓ Weaviate is ready")
        print()

        # Identify the orphans
        orphan_chunks = identify_orphan_chunks(client)

        # Display the report
        display_orphans_report(orphan_chunks)

        if not orphan_chunks:
            print("✅ Aucune action nécessaire (pas d'orphelins)")
            sys.exit(0)

        # Decide which action to take
        if args.create_documents:
            print("📋 ACTION : Créer les documents manquants")
            print()

            if args.execute:
                print("⚠️ ATTENTION : Les documents vont être créés !")
                print()
                response = input("Continuer ? (oui/non) : ").strip().lower()
                if response not in ["oui", "yes", "o", "y"]:
                    print("❌ Annulé par l'utilisateur.")
                    sys.exit(0)
                print()

            stats = create_missing_documents(client, orphan_chunks, dry_run=not args.execute)

            if args.execute and stats["created"] > 0:
                verify_operation(client)

        elif args.delete_orphans:
            print("📋 ACTION : Supprimer les chunks orphelins")
            print()

            total_orphans = sum(len(chunks) for chunks in orphan_chunks.values())

            if args.execute:
                print(f"⚠️ ATTENTION : {total_orphans:,} chunks vont être SUPPRIMÉS DÉFINITIVEMENT !")
                print("⚠️ Cette opération est IRRÉVERSIBLE !")
                print()
                response = input("Continuer ? (oui/non) : ").strip().lower()
                if response not in ["oui", "yes", "o", "y"]:
                    print("❌ Annulé par l'utilisateur.")
                    sys.exit(0)
                print()

            stats = delete_orphan_chunks(client, orphan_chunks, dry_run=not args.execute)

            if args.execute and stats["deleted"] > 0:
                verify_operation(client)

        else:
            # List-only mode (default)
            print("=" * 80)
            print("💡 ACTIONS POSSIBLES")
            print("=" * 80)
            print()
            print("Option 1 : Créer les documents manquants (recommandé)")
            print(" python manage_orphan_chunks.py --create-documents --execute")
            print()
            print("Option 2 : Supprimer les chunks orphelins (ATTENTION: perte de données)")
            print(" python manage_orphan_chunks.py --delete-orphans --execute")
            print()
            print("Option 3 : Ne rien faire (laisser orphelins)")
            print(" Les chunks restent accessibles via recherche sémantique")
            print()

    finally:
        client.close()


if __name__ == "__main__":
    main()
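A note on the design: the script deletes orphans one UUID at a time because the orphan test depends on the nested document object, which Weaviate filters cannot reach. If sourceId were a flat property on Chunk, the v4 client's batch delete could replace the loop; a hypothetical sketch under that assumption:

    from weaviate.classes.query import Filter

    # Only valid if sourceId were a top-level (non-nested) property on Chunk
    chunk_collection.data.delete_many(
        where=Filter.by_property("sourceId").equal(orphan_source_id)
    )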
@@ -0,0 +1,91 @@
"""Script to display all documents from the Weaviate Document collection in table format.

Usage:
    python show_works.py
"""

from datetime import datetime
from typing import Any

import weaviate
from tabulate import tabulate


def format_date(date_val: Any) -> str:
    """Format date for display.

    Args:
        date_val: Date value (string or datetime).

    Returns:
        Formatted date string.
    """
    if date_val is None:
        return "-"
    if isinstance(date_val, str):
        try:
            dt = datetime.fromisoformat(date_val.replace('Z', '+00:00'))
            return dt.strftime("%Y-%m-%d %H:%M")
        except ValueError:
            # Not an ISO-8601 string; show it as-is
            return date_val
    return str(date_val)


def display_documents() -> None:
    """Connect to Weaviate and display all Document objects in table format."""
    try:
        # Connect to local Weaviate instance
        client = weaviate.connect_to_local()

        try:
            # Get Document collection
            document_collection = client.collections.get("Document")

            # Fetch all documents
            response = document_collection.query.fetch_objects(limit=1000)

            if not response.objects:
                print("No documents found in the collection.")
                return

            # Prepare data for table
            table_data = []
            for obj in response.objects:
                props = obj.properties

                # Extract nested work object
                work = props.get("work", {})
                work_title = work.get("title", "N/A") if isinstance(work, dict) else "N/A"
                work_author = work.get("author", "N/A") if isinstance(work, dict) else "N/A"

                table_data.append([
                    props.get("sourceId", "N/A"),
                    work_title,
                    work_author,
                    props.get("edition", "-"),
                    props.get("pages", "-"),
                    props.get("chunksCount", "-"),
                    props.get("language", "-"),
                    format_date(props.get("createdAt")),
                ])

            # Display header
            print(f"\n{'='*120}")
            print(f"Collection Document - {len(response.objects)} document(s) trouvé(s)")
            print(f"{'='*120}\n")

            # Display table
            headers = ["Source ID", "Work Title", "Author", "Edition", "Pages", "Chunks", "Lang", "Created At"]
            print(tabulate(table_data, headers=headers, tablefmt="grid"))
            print()

        finally:
            client.close()

    except Exception as e:
        print(f"Error connecting to Weaviate: {e}")
        print("\nMake sure Weaviate is running:")
        print(" docker compose up -d")


if __name__ == "__main__":
    display_documents()
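Note that tabulate is a third-party dependency, so the script needs something like pip install weaviate-client tabulate before it will run.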
@@ -0,0 +1,27 @@
#!/usr/bin/env python3
"""Test Weaviate connection from Flask context."""

import weaviate

try:
    print("Tentative de connexion à Weaviate...")
    client = weaviate.connect_to_local(
        host="localhost",
        port=8080,
        grpc_port=50051,
    )
    print("[OK] Connexion etablie!")
    print(f"[OK] Weaviate est pret: {client.is_ready()}")

    # Test query
    collections = client.collections.list_all()
    print(f"[OK] Collections disponibles: {list(collections.keys())}")

    client.close()
    print("[OK] Test reussi!")

except Exception as e:
    print(f"[ERREUR] {e}")
    print(f"Type d'erreur: {type(e).__name__}")
    import traceback
    traceback.print_exc()
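An equivalent, slightly safer variant: the v4 client can be used as a context manager, which closes the connection even when a query fails. A minimal sketch:

    import weaviate

    with weaviate.connect_to_local(host="localhost", port=8080, grpc_port=50051) as client:
        print(f"[OK] Weaviate est pret: {client.is_ready()}")
        print(f"[OK] Collections disponibles: {list(client.collections.list_all().keys())}")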
@@ -0,0 +1,441 @@
#!/usr/bin/env python3
"""Check Weaviate data quality, work by work.

This script analyses the consistency of the 4 collections (Work, Document, Chunk, Summary)
and detects inconsistencies:
- Documents without chunks/summaries
- Orphan chunks/summaries
- Missing Works
- Inconsistencies in the nested objects

Usage:
    python verify_data_quality.py
"""

import sys
from collections import defaultdict
from typing import Any, Dict, List, Set

import weaviate


# =============================================================================
# Data Quality Checks
# =============================================================================


class DataQualityReport:
    """Data quality report."""

    def __init__(self) -> None:
        self.total_documents = 0
        self.total_chunks = 0
        self.total_summaries = 0
        self.total_works = 0

        self.documents: List[Dict[str, Any]] = []
        self.issues: List[str] = []
        self.warnings: List[str] = []

        # Track the unique works extracted from the nested objects
        self.unique_works: Dict[str, Set[str]] = defaultdict(set)  # title -> set(authors)

    def add_issue(self, severity: str, message: str) -> None:
        """Record a detected problem."""
        if severity == "ERROR":
            self.issues.append(f"❌ {message}")
        elif severity == "WARNING":
            self.warnings.append(f"⚠️ {message}")

    def add_document(self, doc_data: Dict[str, Any]) -> None:
        """Add the data of an analysed document."""
        self.documents.append(doc_data)

    def print_report(self) -> None:
        """Print the full report."""
        print("\n" + "=" * 80)
        print("RAPPORT DE QUALITÉ DES DONNÉES WEAVIATE")
        print("=" * 80)

        # Global statistics
        print("\n📊 STATISTIQUES GLOBALES")
        print("─" * 80)
        print(f" • Works (collection) : {self.total_works:>6,} objets")
        print(f" • Documents : {self.total_documents:>6,} objets")
        print(f" • Chunks : {self.total_chunks:>6,} objets")
        print(f" • Summaries : {self.total_summaries:>6,} objets")
        print()
        print(f" • Œuvres uniques (nested): {len(self.unique_works):>6,} détectées")

        # Unique works detected in the nested objects
        if self.unique_works:
            print("\n📚 ŒUVRES DÉTECTÉES (via nested objects dans Chunks)")
            print("─" * 80)
            for i, (title, authors) in enumerate(sorted(self.unique_works.items()), 1):
                authors_str = ", ".join(sorted(authors))
                print(f" {i:2d}. {title}")
                print(f" Auteur(s): {authors_str}")

        # Per-document analysis
        print("\n" + "=" * 80)
        print("ANALYSE DÉTAILLÉE PAR DOCUMENT")
        print("=" * 80)

        for i, doc in enumerate(self.documents, 1):
            status = "✅" if doc["chunks_count"] > 0 and doc["summaries_count"] > 0 else "⚠️"
            print(f"\n{status} [{i}/{len(self.documents)}] {doc['sourceId']}")
            print("─" * 80)

            # Document metadata
            if doc.get("work_nested"):
                work = doc["work_nested"]
                print(f" Œuvre : {work.get('title', 'N/A')}")
                print(f" Auteur : {work.get('author', 'N/A')}")
            else:
                print(f" Œuvre : {doc.get('title', 'N/A')}")
                print(f" Auteur : {doc.get('author', 'N/A')}")

            print(f" Édition : {doc.get('edition', 'N/A')}")
            print(f" Langue : {doc.get('language', 'N/A')}")
            print(f" Pages : {doc.get('pages', 0):,}")

            # Collections
            print()
            print(" 📦 Collections :")
            print(f" • Chunks : {doc['chunks_count']:>6,} objets")
            print(f" • Summaries : {doc['summaries_count']:>6,} objets")

            # Work collection
            if doc.get("has_work_object"):
                print(" • Work : ✅ Existe dans collection Work")
            else:
                print(" • Work : ❌ MANQUANT dans collection Work")

            # Nested object consistency
            if doc.get("nested_works_consistency"):
                consistency = doc["nested_works_consistency"]
                if consistency["is_consistent"]:
                    print(" • Cohérence nested objects : ✅ OK")
                else:
                    print(" • Cohérence nested objects : ⚠️ INCOHÉRENCES DÉTECTÉES")
                    if consistency["unique_titles"] > 1:
                        print(f" → {consistency['unique_titles']} titres différents dans chunks:")
                        for title in consistency["titles"]:
                            print(f" - {title}")
                    if consistency["unique_authors"] > 1:
                        print(f" → {consistency['unique_authors']} auteurs différents dans chunks:")
                        for author in consistency["authors"]:
                            print(f" - {author}")

            # Ratios
            if doc["chunks_count"] > 0:
                ratio = doc["summaries_count"] / doc["chunks_count"]
                print(f" 📊 Ratio Summary/Chunk : {ratio:.2f}")

                if ratio < 0.5:
                    print(" ⚠️ Ratio faible (< 0.5) - Peut-être des summaries manquants")
                elif ratio > 3.0:
                    print(" ⚠️ Ratio élevé (> 3.0) - Beaucoup de summaries pour peu de chunks")

            # Problems specific to this document
            if doc.get("issues"):
                print("\n ⚠️ Problèmes détectés :")
                for issue in doc["issues"]:
                    print(f" • {issue}")

        # Global problems
        if self.issues or self.warnings:
            print("\n" + "=" * 80)
            print("PROBLÈMES DÉTECTÉS")
            print("=" * 80)

            if self.issues:
                print("\n❌ ERREURS CRITIQUES :")
                for issue in self.issues:
                    print(f" {issue}")

            if self.warnings:
                print("\n⚠️ AVERTISSEMENTS :")
                for warning in self.warnings:
                    print(f" {warning}")

        # Recommendations
        print("\n" + "=" * 80)
        print("RECOMMANDATIONS")
        print("=" * 80)

        if self.total_works == 0 and len(self.unique_works) > 0:
            print("\n📌 Collection Work vide")
            print(f" • {len(self.unique_works)} œuvres uniques détectées dans nested objects")
            print(" • Recommandation : Peupler la collection Work")
            print(" • Commande : python migrate_add_work_collection.py")
            print(" • Ensuite : Créer des objets Work depuis les nested objects uniques")

        # Check count consistency
        total_chunks_declared = sum(doc.get("chunksCount", 0) for doc in self.documents if "chunksCount" in doc)
        if total_chunks_declared != self.total_chunks:
            print("\n⚠️ Incohérence counts")
            print(f" • Document.chunksCount total : {total_chunks_declared:,}")
            print(f" • Chunks réels : {self.total_chunks:,}")
            print(f" • Différence : {abs(total_chunks_declared - self.total_chunks):,}")

        print("\n" + "=" * 80)
        print("FIN DU RAPPORT")
        print("=" * 80)
        print()


def analyze_document_quality(
    all_chunks: List[Any],
    all_summaries: List[Any],
    doc_sourceId: str,
    client: weaviate.WeaviateClient,
) -> Dict[str, Any]:
    """Analyse data quality for one specific document.

    Args:
        all_chunks: All chunks from database (to filter in Python).
        all_summaries: All summaries from database (to filter in Python).
        doc_sourceId: Document identifier to analyze.
        client: Connected Weaviate client.

    Returns:
        Dict containing analysis results.
    """
    result: Dict[str, Any] = {
        "sourceId": doc_sourceId,
        "chunks_count": 0,
        "summaries_count": 0,
        "has_work_object": False,
        "issues": [],
    }

    # Filter the associated chunks (in Python, since nested objects are not filterable)
    try:
        doc_chunks = [
            chunk for chunk in all_chunks
            if chunk.properties.get("document", {}).get("sourceId") == doc_sourceId
        ]

        result["chunks_count"] = len(doc_chunks)

        # Analyse nested object consistency
        if doc_chunks:
            titles: Set[str] = set()
            authors: Set[str] = set()

            for chunk_obj in doc_chunks:
                props = chunk_obj.properties
                if "work" in props and isinstance(props["work"], dict):
                    work = props["work"]
                    if work.get("title"):
                        titles.add(work["title"])
                    if work.get("author"):
                        authors.add(work["author"])

            result["nested_works_consistency"] = {
                "titles": sorted(titles),
                "authors": sorted(authors),
                "unique_titles": len(titles),
                "unique_authors": len(authors),
                "is_consistent": len(titles) <= 1 and len(authors) <= 1,
            }

            # Recover work/author for this document
            if titles and authors:
                result["work_from_chunks"] = {
                    "title": list(titles)[0] if len(titles) == 1 else titles,
                    "author": list(authors)[0] if len(authors) == 1 else authors,
                }

    except Exception as e:
        result["issues"].append(f"Erreur analyse chunks: {e}")

    # Filter the associated summaries (in Python)
    try:
        doc_summaries = [
            summary for summary in all_summaries
            if summary.properties.get("document", {}).get("sourceId") == doc_sourceId
        ]

        result["summaries_count"] = len(doc_summaries)

    except Exception as e:
        result["issues"].append(f"Erreur analyse summaries: {e}")

    # Check whether a matching Work exists
    if result.get("work_from_chunks"):
        work_info = result["work_from_chunks"]
        if isinstance(work_info["title"], str):
            try:
                work_collection = client.collections.get("Work")
                work_response = work_collection.query.fetch_objects(
                    filters=weaviate.classes.query.Filter.by_property("title").equal(work_info["title"]),
                    limit=1,
                )

                result["has_work_object"] = len(work_response.objects) > 0

            except Exception as e:
                result["issues"].append(f"Erreur vérification Work: {e}")

    # Problem detection
    if result["chunks_count"] == 0:
        result["issues"].append("Aucun chunk trouvé pour ce document")

    if result["summaries_count"] == 0:
        result["issues"].append("Aucun summary trouvé pour ce document")

    if result.get("nested_works_consistency") and not result["nested_works_consistency"]["is_consistent"]:
        result["issues"].append("Incohérences dans les nested objects work")

    return result


def main() -> None:
    """Main entry point."""
    # Fix encoding for Windows console
    if sys.platform == "win32" and hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')

    print("=" * 80)
    print("VÉRIFICATION DE LA QUALITÉ DES DONNÉES WEAVIATE")
    print("=" * 80)
    print()

    client = weaviate.connect_to_local(
        host="localhost",
        port=8080,
        grpc_port=50051,
    )

    try:
        if not client.is_ready():
            print("❌ Weaviate is not ready. Ensure docker-compose is running.")
            sys.exit(1)

        print("✓ Weaviate is ready")
        print("✓ Starting data quality analysis...")
        print()

        report = DataQualityReport()

        # Fetch the global counts
        try:
            work_coll = client.collections.get("Work")
            work_result = work_coll.aggregate.over_all(total_count=True)
            report.total_works = work_result.total_count
        except Exception as e:
            report.add_issue("ERROR", f"Cannot count Work objects: {e}")

        try:
            chunk_coll = client.collections.get("Chunk")
            chunk_result = chunk_coll.aggregate.over_all(total_count=True)
            report.total_chunks = chunk_result.total_count
        except Exception as e:
            report.add_issue("ERROR", f"Cannot count Chunk objects: {e}")

        try:
            summary_coll = client.collections.get("Summary")
            summary_result = summary_coll.aggregate.over_all(total_count=True)
            report.total_summaries = summary_result.total_count
        except Exception as e:
            report.add_issue("ERROR", f"Cannot count Summary objects: {e}")

        # Fetch ALL chunks and summaries at once
        # (nested objects cannot be filtered through the Weaviate API)
        print("Loading all chunks and summaries into memory...")
        all_chunks: List[Any] = []
        all_summaries: List[Any] = []

        try:
            chunk_coll = client.collections.get("Chunk")
            chunks_response = chunk_coll.query.fetch_objects(
                limit=10000,  # high limit for large corpora
                # Note: nested objects (work, document) are returned automatically
            )
            all_chunks = chunks_response.objects
            print(f" ✓ Loaded {len(all_chunks)} chunks")
        except Exception as e:
            report.add_issue("ERROR", f"Cannot fetch all chunks: {e}")

        try:
            summary_coll = client.collections.get("Summary")
            summaries_response = summary_coll.query.fetch_objects(
                limit=10000,
                # Note: nested objects (document) are returned automatically
            )
            all_summaries = summaries_response.objects
            print(f" ✓ Loaded {len(all_summaries)} summaries")
        except Exception as e:
            report.add_issue("ERROR", f"Cannot fetch all summaries: {e}")

        print()

        # Fetch all documents
        try:
            doc_collection = client.collections.get("Document")
            docs_response = doc_collection.query.fetch_objects(
                limit=1000,
                return_properties=["sourceId", "title", "author", "edition", "language", "pages", "chunksCount", "work"],
            )

            report.total_documents = len(docs_response.objects)

            print(f"Analyzing {report.total_documents} documents...")
            print()

            for doc_obj in docs_response.objects:
                props = doc_obj.properties
                doc_sourceId = props.get("sourceId", "unknown")

                print(f" • Analyzing {doc_sourceId}...", end=" ")

                # Analyse this document (with Python-side filtering)
                analysis = analyze_document_quality(all_chunks, all_summaries, doc_sourceId, client)

                # Merge the Document props into the analysis
                analysis.update({
                    "title": props.get("title"),
                    "author": props.get("author"),
                    "edition": props.get("edition"),
                    "language": props.get("language"),
                    "pages": props.get("pages", 0),
                    "chunksCount": props.get("chunksCount", 0),
                    "work_nested": props.get("work"),
                })

                # Collect the unique works
                if analysis.get("work_from_chunks"):
                    work_info = analysis["work_from_chunks"]
                    if isinstance(work_info["title"], str) and isinstance(work_info["author"], str):
                        report.unique_works[work_info["title"]].add(work_info["author"])

                report.add_document(analysis)

                # Feedback
                if analysis["chunks_count"] > 0:
                    print(f"✓ ({analysis['chunks_count']} chunks, {analysis['summaries_count']} summaries)")
                else:
                    print("⚠️ (no chunks)")

        except Exception as e:
            report.add_issue("ERROR", f"Cannot fetch documents: {e}")

        # Global checks
        if report.total_works == 0 and report.total_chunks > 0:
            report.add_issue("WARNING", f"Work collection is empty but {report.total_chunks:,} chunks exist")

        if report.total_documents == 0 and report.total_chunks > 0:
            report.add_issue("WARNING", f"No documents but {report.total_chunks:,} chunks exist (orphan chunks)")

        # Print the report
        report.print_report()

    finally:
        client.close()


if __name__ == "__main__":
    main()
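One caveat on the bulk loads above: fetch_objects(limit=10000) silently truncates once a collection outgrows the limit. For larger corpora the v4 client offers cursor-based iteration, which this script could adopt; a sketch:

    # Cursor-based alternative to fetch_objects(limit=10000)
    all_chunks = list(client.collections.get("Chunk").iterator())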
@@ -0,0 +1,185 @@
#!/usr/bin/env python3
"""Verify vector index configuration for Chunk and Summary collections.

This script checks if the dynamic index with RQ is properly configured
for vectorized collections. It displays:
- Index type (flat, hnsw, or dynamic)
- Quantization status (RQ enabled/disabled)
- Distance metric
- Dynamic threshold (if applicable)

Usage:
    python verify_vector_index.py
"""

import sys
from typing import Any, Dict

import weaviate


def check_collection_index(client: weaviate.WeaviateClient, collection_name: str) -> None:
    """Check and display vector index configuration for a collection.

    Args:
        client: Connected Weaviate client.
        collection_name: Name of the collection to check.
    """
    try:
        collections = client.collections.list_all()

        if collection_name not in collections:
            print(f" ❌ Collection '{collection_name}' not found")
            return

        config = collections[collection_name]

        print(f"\n📦 {collection_name}")
        print("─" * 80)

        # Check vectorizer
        vectorizer_str: str = str(config.vectorizer)
        if "text2vec" in vectorizer_str.lower():
            print(" ✓ Vectorizer: text2vec-transformers")
        elif "none" in vectorizer_str.lower():
            print(" ℹ Vectorizer: NONE (metadata collection)")
            return
        else:
            print(f" ⚠ Vectorizer: {vectorizer_str}")

        # Try to get vector index config (API structure varies)
        # Access via config object properties
        config_dict: Dict[str, Any] = {}

        # Try different API paths to get config info
        if hasattr(config, 'vector_index_config'):
            vector_config = config.vector_index_config
            config_dict['vector_config'] = str(vector_config)

            # Check for specific attributes
            if hasattr(vector_config, 'quantizer'):
                config_dict['quantizer'] = str(vector_config.quantizer)
            if hasattr(vector_config, 'distance_metric'):
                config_dict['distance_metric'] = str(vector_config.distance_metric)

        # Display available info
        if config_dict:
            print(" • Detected configuration:")
            for key, value in config_dict.items():
                print(f" - {key}: {value}")

        # Simplified detection based on config representation
        config_full_str = str(config)

        # Detect index type
        if "dynamic" in config_full_str.lower():
            print(" • Index Type: DYNAMIC")
        elif "hnsw" in config_full_str.lower():
            print(" • Index Type: HNSW")
        elif "flat" in config_full_str.lower():
            print(" • Index Type: FLAT")
        else:
            print(" • Index Type: UNKNOWN (probably the default HNSW)")

        # Check for RQ
        if "rq" in config_full_str.lower() or "quantizer" in config_full_str.lower():
            print(" ✓ RQ (Rotational Quantization): probably ENABLED")
        else:
            print(" ⚠ RQ (Rotational Quantization): NOT DETECTED (or disabled)")

        # Check distance metric
        if "cosine" in config_full_str.lower():
            print(" • Distance Metric: COSINE (detected)")
        elif "dot" in config_full_str.lower():
            print(" • Distance Metric: DOT PRODUCT (detected)")
        elif "l2" in config_full_str.lower():
            print(" • Distance Metric: L2 SQUARED (detected)")

        print("\n Interpretation:")
        if "dynamic" in config_full_str.lower() and ("rq" in config_full_str.lower() or "quantizer" in config_full_str.lower()):
            print(" ✅ OPTIMIZED: Dynamic index with RQ enabled")
            print(" → Memory savings: ~75% at scale")
            print(" → Auto-switches from flat to HNSW at threshold")
        elif "hnsw" in config_full_str.lower():
            if "rq" in config_full_str.lower() or "quantizer" in config_full_str.lower():
                print(" ✅ HNSW with RQ: Good for large collections")
            else:
                print(" ⚠ HNSW without RQ: Consider enabling RQ for memory savings")
        elif "flat" in config_full_str.lower():
            print(" ℹ FLAT index: Good for small collections (<100k vectors)")
        else:
            print(" ⚠ Unknown index configuration (probably default HNSW)")
            print(" → Collections created without an explicit config use HNSW by default")

    except Exception as e:
        print(f" ❌ Error checking {collection_name}: {e}")


def main() -> None:
    """Main entry point."""
    # Fix encoding for Windows console
    if sys.platform == "win32" and hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')

    print("=" * 80)
    print("VÉRIFICATION DES INDEX VECTORIELS WEAVIATE")
    print("=" * 80)

    client: weaviate.WeaviateClient = weaviate.connect_to_local(
        host="localhost",
        port=8080,
        grpc_port=50051,
    )

    try:
        # Check if Weaviate is ready
        if not client.is_ready():
            print("\n❌ Weaviate is not ready. Ensure docker-compose is running.")
            return

        print("\n✓ Weaviate is ready")

        # Get all collections
        collections = client.collections.list_all()
        print(f"✓ Found {len(collections)} collections: {sorted(collections.keys())}")

        # Check vectorized collections (Chunk and Summary)
        print("\n" + "=" * 80)
        print("COLLECTIONS VECTORISÉES")
        print("=" * 80)

        check_collection_index(client, "Chunk")
        check_collection_index(client, "Summary")

        # Check non-vectorized collections (for reference)
        print("\n" + "=" * 80)
        print("COLLECTIONS MÉTADONNÉES (Non vectorisées)")
        print("=" * 80)

        check_collection_index(client, "Work")
        check_collection_index(client, "Document")

        print("\n" + "=" * 80)
        print("VÉRIFICATION TERMINÉE")
        print("=" * 80)

        # Count objects in each collection
        print("\n📊 STATISTIQUES:")
        for name in ["Work", "Document", "Chunk", "Summary"]:
            if name in collections:
                try:
                    coll = client.collections.get(name)
                    # Simple count using aggregate (works for all collections)
                    result = coll.aggregate.over_all(total_count=True)
                    count = result.total_count
                    print(f" • {name:<12} {count:>8,} objets")
                except Exception as e:
                    print(f" • {name:<12} Error: {e}")

    finally:
        client.close()
        print("\n✓ Connexion fermée\n")


if __name__ == "__main__":
    main()
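The string matching above is deliberately defensive because the config object's shape varies across client versions. Where the client version is pinned, the v4 API exposes a typed view that avoids guessing; a sketch (attribute names assumed from weaviate-client v4):

    config = client.collections.get("Chunk").config.get()
    print(config.vector_index_type)    # e.g. VectorIndexType.HNSW
    print(config.vector_index_config)  # includes quantizer settings, if any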