feat: Optimize chunk sizes with 1000-word limit and overlap
Implemented chunking optimization to resolve oversized chunks and improve semantic search quality:

CHUNKING IMPROVEMENTS:
- Added strict 1000-word max limit (vs. previous 1500-2000)
- Implemented 100-word overlap between consecutive chunks (see the sketch below)
- Created llm_chunker_improved.py with overlap functionality
- Added 3 fallback points in llm_chunker.py for robustness

RE-CHUNKING RESULTS:
- Identified and re-chunked 31 oversized chunks (>2000 tokens)
- Split them into 92 optimally sized chunks (max 1995 tokens)
- Preserved all metadata (workTitle, workAuthor, sectionPath, etc.)
- 0 chunks now exceed 2000 tokens (vs. 31 before)

VECTORIZATION:
- Created a manual vectorization script for chunks without vectors
- Successfully vectorized all 92 new chunks (100% coverage)
- All 5,304 chunks now have BGE-M3 embeddings

DOCKER CONFIGURATION:
- Exposed text2vec-transformers port 8090 for manual vectorization
- Added cluster configuration to fix "No private IP address found"
- Increased worker timeout to 600s for large chunks

TESTING:
- Created a comprehensive search quality test suite
- Tests chunk size distribution, overlap detection, and semantic search
- Modified to use near_vector() since Chunk_v2 has no vectorizer (see the query sketch at the end of this page)

Scripts:
- 08_fix_summaries_properties.py - Add missing Work metadata to summaries
- 09_rechunk_oversized.py - Re-chunk giant chunks with overlap
- 10_test_search_quality.py - Validate search improvements
- 11_vectorize_missing_chunks.py - Manual vectorization via API

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
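Note: llm_chunker_improved.py itself is not part of this diff. Below is a minimal sketch of the overlap logic the commit describes, assuming a plain word-window split. The function names and signatures match the import in 09_rechunk_oversized.py, but the bodies are illustrative, and the token factor in estimate_tokens is a placeholder rather than the committed heuristic.

def simple_chunk_with_overlap(text: str, max_words: int = 1000,
                              overlap_words: int = 100) -> list[str]:
    """Split text into windows of at most max_words words, each repeating
    the last overlap_words words of the previous window."""
    words = text.split()
    if len(words) <= max_words:
        return [text]
    step = max_words - overlap_words  # advance by the non-overlapping part
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + max_words]))
        if start + max_words >= len(words):
            break  # the last window already covers the end of the text
    return chunks

def estimate_tokens(text: str) -> int:
    """Rough token estimate (placeholder factor; the real heuristic in
    llm_chunker_improved.py is not shown in this commit)."""
    return int(len(text.split()) * 1.3)

With max_words=1000 and overlap_words=100, each window starts 900 words after the previous one, so consecutive chunks share 100 words of context.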
09_rechunk_oversized.py (new file, 267 lines)
@@ -0,0 +1,267 @@
"""Script to re-chunk oversized chunks (> 2000 tokens) in Chunk_v2.

This script identifies chunks that are too large (> 2000 tokens) and splits them
into smaller chunks with overlap (max 1000 words, overlap 100 words).

Steps:
1. Identify all chunks > 2000 tokens in Chunk_v2
2. Re-chunk using simple_chunk_with_overlap (1000 words max, 100 overlap)
3. Delete the original oversized chunk
4. Insert new smaller chunks with preserved metadata
5. Update Summary_v2 chunksCount if needed
"""

import weaviate
import sys
from pathlib import Path

# Add utils to path
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))

from utils.llm_chunker_improved import simple_chunk_with_overlap, estimate_tokens

if sys.stdout.encoding != 'utf-8':
    sys.stdout.reconfigure(encoding='utf-8')

# Try to import tqdm
try:
    from tqdm import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False

# Constants
TOKEN_THRESHOLD = 2000  # Chunks > 2000 tokens will be re-chunked
MAX_WORDS = 1000
OVERLAP_WORDS = 100

client = weaviate.connect_to_local()

try:
    print("=" * 80)
    print("RE-CHUNKING OVERSIZED CHUNKS")
    print("=" * 80)
    print()

    chunk_v2 = client.collections.get("Chunk_v2")
    work_collection = client.collections.get("Work")

    # ========== 1. IDENTIFY PROBLEM CHUNKS ==========
    print("1. IDENTIFYING CHUNKS > 2000 TOKENS")
    print("-" * 80)
    print()

    oversized_chunks = []

    print("Analyzing...")
    for chunk in chunk_v2.iterator(include_vector=False):
        props = chunk.properties
        text = props.get('text', '')
        tokens = estimate_tokens(text)

        if tokens > TOKEN_THRESHOLD:
            oversized_chunks.append({
                'uuid': str(chunk.uuid),
                'tokens': tokens,
                'chars': len(text),
                'text': text,
                'workTitle': props.get('workTitle', ''),
                'workAuthor': props.get('workAuthor', ''),
                'year': props.get('year', 0),
                'language': props.get('language', 'en'),
                'sectionPath': props.get('sectionPath', ''),
                'chapterTitle': props.get('chapterTitle', ''),
                'canonicalReference': props.get('canonicalReference', ''),
                'unitType': props.get('unitType', 'main_content'),
                'keywords': props.get('keywords', []),
                'orderIndex': props.get('orderIndex', 0),
            })

    print(f"✓ {len(oversized_chunks)} chunks > {TOKEN_THRESHOLD} tokens found")
    print()

    if not oversized_chunks:
        print("✅ No oversized chunks to process")
        print()
        print("=" * 80)
        print("SCRIPT FINISHED - NOTHING TO DO")
        print("=" * 80)
        sys.exit(0)

    # Sort by size
    oversized_chunks.sort(key=lambda x: x['tokens'], reverse=True)

    print("Top 5 largest chunks:")
    for i, chunk in enumerate(oversized_chunks[:5], 1):
        print(f"{i}. {chunk['tokens']:,} tokens ({chunk['chars']:,} chars)")
        print(f"   Work: {chunk['workTitle']}")
        print(f"   Section: {chunk['sectionPath'][:60]}...")
        print()

    if len(oversized_chunks) > 5:
        print(f"... and {len(oversized_chunks) - 5} more")

    print()

    # ========== 2. RE-CHUNKING ==========
    print("2. RE-CHUNKING WITH OVERLAP")
    print("-" * 80)
    print()

    # Build work_title -> work_uuid map for references
    work_map = {}
    for work in work_collection.iterator(include_vector=False):
        props = work.properties
        title = props.get("title")
        if title:
            work_map[title] = str(work.uuid)

    print(f"✓ {len(work_map)} Works mapped")
    print()

    deleted_count = 0
    inserted_count = 0
    errors = []

    # Create iterator with or without tqdm
    if HAS_TQDM:
        iterator = tqdm(
            oversized_chunks,
            desc="Re-chunking",
            unit="chunks"
        )
    else:
        iterator = oversized_chunks
        print("Re-chunking in progress...")

    for idx, old_chunk in enumerate(iterator, 1):
        try:
            # Re-chunk text
            new_texts = simple_chunk_with_overlap(
                old_chunk['text'],
                max_words=MAX_WORDS,
                overlap_words=OVERLAP_WORDS
            )

            # Get work reference
            work_uuid = work_map.get(old_chunk['workTitle'])
            if not work_uuid:
                errors.append(f"Chunk {old_chunk['uuid'][:8]}: Work '{old_chunk['workTitle']}' not found")
                continue

            # Insert new chunks
            for i, new_text in enumerate(new_texts):
                # Sub-ordering: multiply base index by 100 and add part index
                # Example: orderIndex=5 becomes 500, 501, 502, etc.
                new_order_index = (old_chunk['orderIndex'] * 100) + i

                new_props = {
                    "text": new_text,
                    "summary": "",  # Empty summary for simple chunks
                    "keywords": old_chunk['keywords'],
                    "workTitle": old_chunk['workTitle'],
                    "workAuthor": old_chunk['workAuthor'],
                    "year": old_chunk['year'],
                    "language": old_chunk['language'],
                    "sectionPath": old_chunk['sectionPath'],
                    "chapterTitle": old_chunk['chapterTitle'],
                    "canonicalReference": old_chunk['canonicalReference'],
                    "unitType": old_chunk['unitType'],
                    "orderIndex": new_order_index,
                }

                chunk_v2.data.insert(
                    properties=new_props,
                    references={"work": work_uuid}
                )
                inserted_count += 1

            # Delete old chunk
            chunk_v2.data.delete_by_id(old_chunk['uuid'])
            deleted_count += 1

            # Progress without tqdm
            if not HAS_TQDM and idx % 5 == 0:
                print(f"   {idx}/{len(oversized_chunks)} chunks processed...")

        except Exception as e:
            errors.append(f"Chunk {old_chunk['uuid'][:8]}: {e}")

    print()
    print("-" * 80)
    print(f"✓ Chunks deleted: {deleted_count}")
    print(f"✓ New chunks created: {inserted_count}")
    if deleted_count > 0:
        print(f"  Average expansion: {inserted_count / deleted_count:.1f}x")
    else:
        print("  ⚠️ No chunks deleted - check the errors")

    if errors:
        print()
        print(f"⚠️ Errors encountered: {len(errors)}")
        for err in errors[:10]:
            print(f"   - {err}")
        if len(errors) > 10:
            print(f"   ... and {len(errors) - 10} more")

    print()

    # ========== 3. VERIFICATION ==========
    print("3. POST-RECHUNKING VERIFICATION")
    print("-" * 80)
    print()

    print("Counting the new chunks...")
    remaining_oversized = 0
    total_chunks = 0

    for chunk in chunk_v2.iterator(include_vector=False):
        total_chunks += 1
        text = chunk.properties.get('text', '')
        tokens = estimate_tokens(text)
        if tokens > TOKEN_THRESHOLD:
            remaining_oversized += 1

    print(f"✓ Total chunks: {total_chunks:,}")
    print(f"✓ Chunks > {TOKEN_THRESHOLD} tokens: {remaining_oversized}")

    if remaining_oversized == 0:
        print()
        print("✅ No oversized chunks remaining!")
    else:
        print()
        print(f"⚠️ {remaining_oversized} chunks still > {TOKEN_THRESHOLD} tokens")
        print("   Re-run the script if necessary")

    print()
    print("=" * 80)
    print("RE-CHUNKING COMPLETE")
    print("=" * 80)
    print()

    print("RESULTS:")
    print(f"  • Chunks deleted: {deleted_count}")
    print(f"  • New chunks created: {inserted_count}")
    if deleted_count > 0:
        print(f"  • Expansion: {inserted_count / deleted_count:.1f}x")
    print(f"  • Remaining chunks > {TOKEN_THRESHOLD} tokens: {remaining_oversized}")
    print()

    if remaining_oversized == 0 and deleted_count > 0:
        print("✅ RE-CHUNKING SUCCESSFUL")
        print()
        print("IMPROVEMENTS:")
        print(f"  • {deleted_count} giant chunks eliminated")
        print(f"  • {inserted_count} optimally sized chunks created")
        print(f"  • Max size: {MAX_WORDS} words (~{MAX_WORDS * 2.5:.0f} tokens)")
        print(f"  • Overlap: {OVERLAP_WORDS} words (context preserved)")
        print()
        print("NEXT STEPS:")
        print("  1. Test semantic search")
        print("  2. Check vector quality")
        print("  3. Optional: update Summary_v2.chunksCount if needed")
    elif deleted_count == 0:
        print("ℹ️ No chunks needed re-chunking")

finally:
    client.close()
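For the TESTING note in the commit message ("Modified to use near_vector()"): since Chunk_v2 has no vectorizer module, a query vector must be computed manually, e.g. through the text2vec-transformers container exposed on port 8090 by this commit's Docker changes. Below is a minimal sketch using the weaviate v4 Python client; the /vectors request/response shape and the sample query string are assumptions, as neither appears in this diff.

import requests
import weaviate

# Embed the query via the exposed inference container (port 8090 per the
# Docker changes; the /vectors payload shape is an assumption).
resp = requests.post("http://localhost:8090/vectors",
                     json={"text": "sample semantic search query"})
query_vector = resp.json()["vector"]

client = weaviate.connect_to_local()
try:
    chunks = client.collections.get("Chunk_v2")
    # near_text() would fail without a vectorizer; near_vector() takes the
    # precomputed BGE-M3 embedding instead.
    results = chunks.query.near_vector(
        near_vector=query_vector,
        limit=5,
        return_properties=["workTitle", "sectionPath"],
    )
    for obj in results.objects:
        print(obj.properties["workTitle"], "-", obj.properties["sectionPath"])
finally:
    client.close()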