Implemented chunking optimization to resolve oversized chunks and improve semantic search quality:

CHUNKING IMPROVEMENTS:
- Added strict 1000-word max limit (vs previous 1500-2000)
- Implemented 100-word overlap between consecutive chunks
- Created llm_chunker_improved.py with overlap functionality
- Added 3 fallback points in llm_chunker.py for robustness

RE-CHUNKING RESULTS:
- Identified and re-chunked 31 oversized chunks (>2000 tokens)
- Split into 92 optimally-sized chunks (max 1995 tokens)
- Preserved all metadata (workTitle, workAuthor, sectionPath, etc.)
- 0 chunks now exceed 2000 tokens (vs 31 before)

VECTORIZATION:
- Created manual vectorization script for chunks without vectors
- Successfully vectorized all 92 new chunks (100% coverage)
- All 5,304 chunks now have BGE-M3 embeddings

DOCKER CONFIGURATION:
- Exposed text2vec-transformers port 8090 for manual vectorization
- Added cluster configuration to fix "No private IP address found"
- Increased worker timeout to 600s for large chunks

TESTING:
- Created comprehensive search quality test suite
- Tests distribution, overlap detection, and semantic search
- Modified to use near_vector() (Chunk_v2 has no vectorizer)

Scripts (illustrative sketches of the chunking, vectorization, and query techniques follow below):
- 08_fix_summaries_properties.py - Add missing Work metadata to summaries
- 09_rechunk_oversized.py - Re-chunk giant chunks with overlap
- 10_test_search_quality.py - Validate search improvements
- 11_vectorize_missing_chunks.py - Manual vectorization via API

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
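A minimal sketch of the 100-word-overlap splitting described above; split_with_overlap is a hypothetical helper for illustration, not the actual code in llm_chunker_improved.py:

def split_with_overlap(text: str, max_words: int = 1000, overlap: int = 100) -> list[str]:
    """Split text into chunks of at most max_words words, repeating the
    last `overlap` words of each chunk at the start of the next one."""
    assert overlap < max_words  # otherwise the window would never advance
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = min(start + max_words, len(words))
        chunks.append(" ".join(words[start:end]))
        if end == len(words):
            break
        start = end - overlap  # step back so consecutive chunks share context
    return chunks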
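A sketch of the manual vectorization path, assuming the text2vec-transformers container's /vectors endpoint on the exposed port 8090 and a `content` text property on Chunk_v2 (both assumptions; the real logic lives in 11_vectorize_missing_chunks.py):

import requests
import weaviate

client = weaviate.connect_to_local()
try:
    chunks = client.collections.get("Chunk_v2")
    for chunk in chunks.iterator(include_vector=True):
        if chunk.vector:  # skip chunks that already have an embedding
            continue
        # "content" is an assumed property name for the chunk text
        text = chunk.properties.get("content", "")
        resp = requests.post("http://localhost:8090/vectors", json={"text": text})
        resp.raise_for_status()
        # write the BGE-M3 embedding back to the object
        chunks.data.update(uuid=chunk.uuid, vector=resp.json()["vector"])
finally:
    client.close()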
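And a sketch of a search-quality check using near_vector(), needed because Chunk_v2 has no vectorizer so the query vector must be supplied explicitly (the query text and the port-8090 embedding call are illustrative):

import requests
import weaviate

client = weaviate.connect_to_local()
try:
    chunks = client.collections.get("Chunk_v2")
    # embed the query text with the same model that embedded the chunks
    resp = requests.post("http://localhost:8090/vectors",
                         json={"text": "example semantic query"})
    query_vector = resp.json()["vector"]
    results = chunks.query.near_vector(near_vector=query_vector, limit=5)
    for obj in results.objects:
        print(obj.properties.get("workTitle"), obj.properties.get("sectionPath"))
finally:
    client.close()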
08_fix_summaries_properties.py (Python, 158 lines, 4.4 KiB):
"""Correctif: Ajouter workAuthor, year, language aux Summary_v2."""
|
|
|
|
import weaviate
|
|
import sys
|
|
|
|
if sys.stdout.encoding != 'utf-8':
|
|
sys.stdout.reconfigure(encoding='utf-8')
|
|
|
|
# Try to import tqdm
|
|
try:
|
|
from tqdm import tqdm
|
|
HAS_TQDM = True
|
|
except ImportError:
|
|
HAS_TQDM = False
|
|
|
|
client = weaviate.connect_to_local()
|
|
|
|
try:
    print("=" * 80)
    print("FIX: ADD workAuthor, year, language TO SUMMARY_V2")
    print("=" * 80)
    print()

    summary_v2 = client.collections.get("Summary_v2")
    work_collection = client.collections.get("Work")

    # Build a workTitle → Work metadata map so each summary can be
    # enriched without a per-summary lookup against the Work collection
    print("Step 1: Mapping workTitle → Work metadata")
    print("-" * 80)

    work_map = {}

    for work in work_collection.iterator(include_vector=False):
        props = work.properties
        title = props.get("title")
        if title:
            work_map[title] = {
                "author": props.get("author", "Unknown"),
                "year": props.get("year", 0),
                "language": props.get("language", "en"),
            }

    print(f"✓ {len(work_map)} workTitle → metadata mappings")
    print()
    # Count total summaries (one full pass, used only for progress reporting)
    print("Step 2: Counting summaries")
    print("-" * 80)

    print("Counting...")
    total_summaries = sum(1 for _ in summary_v2.iterator(include_vector=False))

    print(f"✓ {total_summaries} summaries to fix")
    print()

    # Update summaries
    print("Step 3: Updating properties")
    print("-" * 80)
    print()

    updated = 0
    skipped = 0
    errors = []

    # Wrap the iterator in tqdm when available, otherwise fall back to
    # plain prints every 10 objects (see the loop below)
    if HAS_TQDM:
        iterator = tqdm(
            summary_v2.iterator(include_vector=False),
            total=total_summaries,
            desc="Updating",
            unit="summaries"
        )
    else:
        iterator = summary_v2.iterator(include_vector=False)
        print("Updating...")
    for idx, summary in enumerate(iterator, 1):
        props = summary.properties

        try:
            work_title = props.get("workTitle")

            if not work_title:
                errors.append(f"Summary {summary.uuid}: no workTitle")
                skipped += 1
                continue

            # Look up the parent work's metadata
            work_metadata = work_map.get(work_title)
            if not work_metadata:
                errors.append(f"Summary {summary.uuid}: Work '{work_title}' not found")
                skipped += 1
                continue

            # Skip summaries that were already updated (workAuthor exists)
            if props.get("workAuthor") is not None:
                skipped += 1
                continue

            # Write the missing properties back to Weaviate
            summary_v2.data.update(
                uuid=summary.uuid,
                properties={
                    "workAuthor": work_metadata["author"],
                    "year": work_metadata["year"],
                    "language": work_metadata["language"],
                }
            )

            updated += 1

            # Plain-text progress when tqdm is not installed
            if not HAS_TQDM and idx % 10 == 0:
                print(f"  {idx}/{total_summaries} summaries processed...")

        except Exception as e:
            errors.append(f"Summary {summary.uuid}: {e}")
    print()
    print("-" * 80)
    print(f"✓ Total updated: {updated}/{total_summaries}")
    print(f"  Skipped (already up to date or missing data): {skipped}")

    if errors:
        print(f"⚠️ Errors encountered: {len(errors)}")
        print()
        print("First errors:")
        for err in errors[:10]:
            print(f"  - {err}")
        if len(errors) > 10:
            print(f"  ... and {len(errors) - 10} more")

    print()
    print("=" * 80)
    print("FIX COMPLETE")
    print("=" * 80)
    print()

    if updated == total_summaries:
        print("✅ All summaries were updated")
        print()
        print("Properties added:")
        print("  ✓ workAuthor (author of the work)")
        print("  ✓ year (publication year)")
        print("  ✓ language (language of the text)")
        print()
        print("VERIFICATION:")
        print("  python -c \"from verify_summaries import verify; verify()\"")
    elif updated > 0:
        print(f"⚠️ {updated}/{total_summaries} summaries updated")
        print("   Check the errors above")
    else:
        print("❌ No summaries updated")
        print("   Fix the errors and rerun")

finally:
    client.close()