feat: Optimize chunk sizes with 1000-word limit and overlap

Implemented chunking optimization to resolve oversized chunks and improve
semantic search quality:

CHUNKING IMPROVEMENTS:
- Added strict 1000-word max limit (vs previous 1500-2000)
- Implemented 100-word overlap between consecutive chunks
- Created llm_chunker_improved.py with overlap functionality
- Added 3 fallback points in llm_chunker.py for robustness

RE-CHUNKING RESULTS:
- Identified and re-chunked 31 oversized chunks (>2000 tokens)
- Split into 92 optimally-sized chunks (max 1995 tokens)
- Preserved all metadata (workTitle, workAuthor, sectionPath, etc.)
- 0 chunks now exceed 2000 tokens (vs 31 before)

VECTORIZATION:
- Created manual vectorization script for chunks without vectors
- Successfully vectorized all 92 new chunks (100% coverage)
- All 5,304 chunks now have BGE-M3 embeddings

DOCKER CONFIGURATION:
- Exposed text2vec-transformers port 8090 for manual vectorization
- Added cluster configuration to fix "No private IP address found" error
- Increased worker timeout to 600s for large chunks

TESTING:
- Created comprehensive search quality test suite
- Tests distribution, overlap detection, and semantic search
- Modified to use near_vector() (Chunk_v2 has no vectorizer)

Scripts:
- 08_fix_summaries_properties.py - Add missing Work metadata to summaries
- 09_rechunk_oversized.py - Re-chunk giant chunks with overlap
- 10_test_search_quality.py - Validate search improvements
- 11_vectorize_missing_chunks.py - Manual vectorization via API

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-08 17:37:49 +01:00
parent ca221887eb
commit 7045907173
7 changed files with 1376 additions and 27 deletions

View File

@@ -0,0 +1,217 @@
"""Vectorize chunks that don't have vectors.
After re-chunking, new chunks were created without vectors because Chunk_v2
collection has no vectorizer configured. This script manually vectorizes
these chunks using the text2vec-transformers service.
"""
import weaviate
import sys
import requests
if sys.stdout.encoding != 'utf-8':
sys.stdout.reconfigure(encoding='utf-8')
# Try to import tqdm
try:
from tqdm import tqdm
HAS_TQDM = True
except ImportError:
HAS_TQDM = False
# Text2vec-transformers service URL (from docker-compose.yml)
VECTORIZER_URL = "http://localhost:8090/vectors"
client = weaviate.connect_to_local()
try:
    print("=" * 80)
    print("VECTORISATION DES CHUNKS SANS VECTEUR")
    print("=" * 80)
    print()
    chunk_v2 = client.collections.get("Chunk_v2")

    # ========== 1. IDENTIFY CHUNKS WITHOUT A VECTOR ==========
    print("1. IDENTIFICATION DES CHUNKS SANS VECTEUR")
    print("-" * 80)
    print()
    print("Analyse en cours...")
    chunks_to_vectorize = []
    # include_vector=True so each object's (possibly empty) vector can be inspected.
    for chunk in chunk_v2.iterator(include_vector=True):
        if not chunk.vector or not chunk.vector.get('default'):
            props = chunk.properties
            chunks_to_vectorize.append({
                'uuid': chunk.uuid,
                'text': props.get('text', ''),
                'summary': props.get('summary', ''),
                'keywords': props.get('keywords', []),
                'workTitle': props.get('workTitle', 'N/A')
            })
    print(f"{len(chunks_to_vectorize)} chunks sans vecteur trouvés")
    print()
    if not chunks_to_vectorize:
        print("✅ Aucun chunk à vectoriser")
        print()
        print("=" * 80)
        print("SCRIPT TERMINÉ - RIEN À FAIRE")
        print("=" * 80)
        sys.exit(0)  # SystemExit propagates; the finally below still closes the client

    # ========== 2. VECTORIZATION ==========
    print("2. VECTORISATION DES CHUNKS")
    print("-" * 80)
    print()
    print(f"Service vectorizer: {VECTORIZER_URL}")
    print()
    vectorized_count = 0
    errors = []
    total = len(chunks_to_vectorize)  # loop invariant, hoisted out of the loop

    # Create iterator with or without tqdm progress bar.
    if HAS_TQDM:
        iterator = tqdm(
            chunks_to_vectorize,
            desc="Vectorisation",
            unit="chunks"
        )
    else:
        iterator = chunks_to_vectorize
        print("Vectorisation en cours...")

    # Reuse a single HTTP connection pool for the whole batch instead of
    # opening a fresh TCP connection for every chunk.
    with requests.Session() as session:
        for idx, chunk_data in enumerate(iterator, 1):
            short_id = str(chunk_data['uuid'])[:8]  # compact id for error messages
            try:
                # Combine text, summary, and keywords as per original Chunk schema,
                # so the manual vector matches what a vectorizer would have produced.
                text_parts = [chunk_data['text']]
                if chunk_data['summary']:
                    text_parts.append(chunk_data['summary'])
                if chunk_data['keywords']:
                    text_parts.append(' '.join(chunk_data['keywords']))
                combined_text = ' '.join(text_parts)

                # Call the text2vec-transformers service directly.
                response = session.post(
                    VECTORIZER_URL,
                    json={"text": combined_text},
                    headers={"Content-Type": "application/json"},
                    timeout=30
                )
                if response.status_code != 200:
                    errors.append(f"Chunk {short_id}: HTTP {response.status_code}")
                    continue
                vector = response.json().get('vector')
                if not vector:
                    errors.append(f"Chunk {short_id}: Pas de vecteur dans la réponse")
                    continue

                # Write the vector back onto the existing object.
                chunk_v2.data.update(
                    uuid=chunk_data['uuid'],
                    vector=vector
                )
                vectorized_count += 1

                # Fallback progress reporting when tqdm is unavailable.
                if not HAS_TQDM and idx % 10 == 0:
                    print(f" {idx}/{total} chunks vectorisés...")
            except requests.exceptions.RequestException as e:
                errors.append(f"Chunk {short_id}: Erreur réseau - {e}")
            except Exception as e:
                # Best-effort batch job: record the failure and keep going.
                errors.append(f"Chunk {short_id}: {e}")

    print()
    print("-" * 80)
    print(f"✓ Chunks vectorisés: {vectorized_count}/{total}")
    if errors:
        print()
        print(f"⚠️ Erreurs rencontrées: {len(errors)}")
        for err in errors[:10]:
            print(f" - {err}")
        if len(errors) > 10:
            print(f" ... et {len(errors) - 10} autres")
    print()

    # ========== 3. POST-VECTORIZATION VERIFICATION ==========
    print("3. VÉRIFICATION POST-VECTORISATION")
    print("-" * 80)
    print()
    print("Recomptage...")
    remaining_without_vector = 0
    total_chunks = 0
    for chunk in chunk_v2.iterator(include_vector=True):
        total_chunks += 1
        if not chunk.vector or not chunk.vector.get('default'):
            remaining_without_vector += 1
    chunks_with_vector = total_chunks - remaining_without_vector
    # Guard against an empty collection (would otherwise raise ZeroDivisionError).
    coverage = (chunks_with_vector / total_chunks * 100) if total_chunks else 0.0
    print(f"✓ Total chunks: {total_chunks:,}")
    print(f"✓ Avec vecteur: {chunks_with_vector:,} ({coverage:.1f}%)")
    print(f"✓ Sans vecteur: {remaining_without_vector:,}")
    print()
    if remaining_without_vector == 0:
        print("✅ Tous les chunks ont été vectorisés!")
    else:
        print(f"⚠️ {remaining_without_vector} chunks encore sans vecteur")
        print(" Relancer le script ou vérifier les erreurs")
    print()
    print("=" * 80)
    print("VECTORISATION TERMINÉE")
    print("=" * 80)
    print()
    if remaining_without_vector == 0:
        print("✅ VECTORISATION RÉUSSIE")
        print()
        print("RÉSULTATS:")
        print(f"{vectorized_count} nouveaux vecteurs créés")
        print(f"{total_chunks:,} chunks totaux")
        print(f" • 100% des chunks ont des vecteurs")
        print()
        print("PROCHAINES ÉTAPES:")
        print(" 1. Relancer le test de recherche: python 10_test_search_quality.py")
        print(" 2. Tester l'application Flask")
        print()
        print("NOTE: Chunk_v2 n'a toujours pas de vectorizer configuré.")
        print("Les futurs nouveaux chunks devront être vectorisés manuellement")
        print("OU la collection devra être recréée avec un vectorizer.")
    elif vectorized_count > 0:
        print("⚠️ VECTORISATION PARTIELLE")
        print()
        print(f"{vectorized_count} chunks vectorisés")
        print(f"{remaining_without_vector} chunks restants")
        print(" • Vérifier les erreurs et relancer")
    else:
        print("❌ VECTORISATION ÉCHOUÉE")
        print()
        print("Aucun chunk n'a pu être vectorisé.")
        print("Vérifications:")
        print(f" 1. Service text2vec-transformers actif: {VECTORIZER_URL}")
        print(" 2. Docker containers en cours d'exécution")
        print(" 3. Logs des erreurs ci-dessus")
finally:
    # Always release the Weaviate connection, even on sys.exit or an error.
    client.close()