feat: Optimize chunk sizes with 1000-word limit and overlap

Implemented chunking optimization to resolve oversized chunks and improve
semantic search quality:

CHUNKING IMPROVEMENTS:
- Added strict 1000-word max limit (vs previous 1500-2000)
- Implemented 100-word overlap between consecutive chunks
- Created llm_chunker_improved.py with overlap functionality
- Added 3 fallback points in llm_chunker.py for robustness

RE-CHUNKING RESULTS:
- Identified and re-chunked 31 oversized chunks (>2000 tokens)
- Split into 92 optimally-sized chunks (max 1995 tokens)
- Preserved all metadata (workTitle, workAuthor, sectionPath, etc.)
- 0 chunks now exceed 2000 tokens (vs 31 before)

VECTORIZATION:
- Created manual vectorization script for chunks without vectors
- Successfully vectorized all 92 new chunks (100% coverage)
- All 5,304 chunks now have BGE-M3 embeddings

DOCKER CONFIGURATION:
- Exposed text2vec-transformers port 8090 for manual vectorization
- Added cluster configuration to fix the "No private IP address found" error
- Increased worker timeout to 600s for large chunks

TESTING:
- Created comprehensive search quality test suite
- Tests distribution, overlap detection, and semantic search
- Modified to use near_vector() (Chunk_v2 has no vectorizer)

Scripts:
- 08_fix_summaries_properties.py - Add missing Work metadata to summaries
- 09_rechunk_oversized.py - Re-chunk giant chunks with overlap
- 10_test_search_quality.py - Validate search improvements
- 11_vectorize_missing_chunks.py - Manual vectorization via API

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-08 17:37:49 +01:00
parent ca221887eb
commit 7045907173
7 changed files with 1376 additions and 27 deletions

267
09_rechunk_oversized.py Normal file
View File

@@ -0,0 +1,267 @@
"""Script to re-chunk oversized chunks (> 2000 tokens) in Chunk_v2.
This script identifies chunks that are too large (> 2000 tokens) and splits them
into smaller chunks with overlap (max 1000 words, overlap 100 words).
Steps:
1. Identify all chunks > 2000 tokens in Chunk_v2
2. Re-chunk using simple_chunk_with_overlap (1000 words max, 100 overlap)
3. Delete the original oversized chunk
4. Insert new smaller chunks with preserved metadata
5. Update Summary_v2 chunksCount if needed
"""
import weaviate
import sys
from pathlib import Path
# Add utils to path
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))
from utils.llm_chunker_improved import simple_chunk_with_overlap, estimate_tokens
if sys.stdout.encoding != 'utf-8':
sys.stdout.reconfigure(encoding='utf-8')
# Try to import tqdm
try:
from tqdm import tqdm
HAS_TQDM = True
except ImportError:
HAS_TQDM = False
# Constants
TOKEN_THRESHOLD = 2000 # Chunks > 2000 tokens will be re-chunked
MAX_WORDS = 1000
OVERLAP_WORDS = 100
client = weaviate.connect_to_local()
try:
    # NOTE: this script mutates the live Chunk_v2 collection in place.
    print("=" * 80)
    print("RE-CHUNKING DES CHUNKS SURDIMENSIONNÉS")
    print("=" * 80)
    print()
    chunk_v2 = client.collections.get("Chunk_v2")
    work_collection = client.collections.get("Work")

    # ========== 1. IDENTIFY THE PROBLEMATIC CHUNKS ==========
    print("1. IDENTIFICATION DES CHUNKS > 2000 TOKENS")
    print("-" * 80)
    print()
    # Full scan of Chunk_v2: collect every chunk whose estimated token count
    # exceeds the threshold, along with all metadata needed to re-create it.
    oversized_chunks = []
    print("Analyse en cours...")
    for chunk in chunk_v2.iterator(include_vector=False):
        props = chunk.properties
        text = props.get('text', '')
        tokens = estimate_tokens(text)
        if tokens > TOKEN_THRESHOLD:
            oversized_chunks.append({
                'uuid': str(chunk.uuid),
                'tokens': tokens,
                'chars': len(text),
                'text': text,
                'workTitle': props.get('workTitle', ''),
                'workAuthor': props.get('workAuthor', ''),
                'year': props.get('year', 0),
                'language': props.get('language', 'en'),
                'sectionPath': props.get('sectionPath', ''),
                'chapterTitle': props.get('chapterTitle', ''),
                'canonicalReference': props.get('canonicalReference', ''),
                'unitType': props.get('unitType', 'main_content'),
                'keywords': props.get('keywords', []),
                'orderIndex': props.get('orderIndex', 0),
            })
    print(f"{len(oversized_chunks)} chunks > {TOKEN_THRESHOLD} tokens trouvés")
    print()
    # Nothing to do: exit early (the finally clause still closes the client).
    if not oversized_chunks:
        print("✅ Aucun chunk surdimensionné à traiter")
        print()
        print("=" * 80)
        print("SCRIPT TERMINÉ - RIEN À FAIRE")
        print("=" * 80)
        sys.exit(0)
    # Sort by size, largest first, for the report below.
    oversized_chunks.sort(key=lambda x: x['tokens'], reverse=True)
    print("Top 5 plus gros chunks:")
    for i, chunk in enumerate(oversized_chunks[:5], 1):
        print(f"{i}. {chunk['tokens']:,} tokens ({chunk['chars']:,} chars)")
        print(f" Œuvre: {chunk['workTitle']}")
        print(f" Section: {chunk['sectionPath'][:60]}...")
        print()
    if len(oversized_chunks) > 5:
        print(f"... et {len(oversized_chunks) - 5} autres")
    print()

    # ========== 2. RE-CHUNKING WITH OVERLAP ==========
    print("2. RE-CHUNKING AVEC OVERLAP")
    print("-" * 80)
    print()
    # Build work_title -> work_uuid map so each replacement chunk can be
    # inserted with the same Work cross-reference its parent chunk had.
    work_map = {}
    for work in work_collection.iterator(include_vector=False):
        props = work.properties
        title = props.get("title")
        if title:
            work_map[title] = str(work.uuid)
    print(f"{len(work_map)} Works mappés")
    print()
    deleted_count = 0
    inserted_count = 0
    errors = []
    # Create iterator with or without tqdm (optional progress bar).
    if HAS_TQDM:
        iterator = tqdm(
            oversized_chunks,
            desc="Re-chunking",
            unit="chunks"
        )
    else:
        iterator = oversized_chunks
        print("Re-chunking en cours...")
    for idx, old_chunk in enumerate(iterator, 1):
        try:
            # Re-chunk text: split into pieces of at most MAX_WORDS words
            # with OVERLAP_WORDS words shared between neighbours.
            new_texts = simple_chunk_with_overlap(
                old_chunk['text'],
                max_words=MAX_WORDS,
                overlap_words=OVERLAP_WORDS
            )
            # Resolve the Work reference by title; if the parent Work is
            # missing, record the error and keep the old chunk untouched.
            work_uuid = work_map.get(old_chunk['workTitle'])
            if not work_uuid:
                errors.append(f"Chunk {old_chunk['uuid'][:8]}: Work '{old_chunk['workTitle']}' introuvable")
                continue
            # Insert the replacement chunks, copying the parent's metadata.
            for i, new_text in enumerate(new_texts):
                # Sub-ordering: multiply base index by 100 and add part index
                # Example: orderIndex=5 becomes 500, 501, 502, etc.
                new_order_index = (old_chunk['orderIndex'] * 100) + i
                new_props = {
                    "text": new_text,
                    "summary": "",  # Empty summary for simple chunks
                    "keywords": old_chunk['keywords'],
                    "workTitle": old_chunk['workTitle'],
                    "workAuthor": old_chunk['workAuthor'],
                    "year": old_chunk['year'],
                    "language": old_chunk['language'],
                    "sectionPath": old_chunk['sectionPath'],
                    "chapterTitle": old_chunk['chapterTitle'],
                    "canonicalReference": old_chunk['canonicalReference'],
                    "unitType": old_chunk['unitType'],
                    "orderIndex": new_order_index,
                }
                chunk_v2.data.insert(
                    properties=new_props,
                    references={"work": work_uuid}
                )
                inserted_count += 1
            # Delete the old chunk only after all replacements were inserted.
            # NOTE(review): a failure mid-insert leaves both the old chunk and
            # the partial replacements behind — duplicates are possible then.
            chunk_v2.data.delete_by_id(old_chunk['uuid'])
            deleted_count += 1
            # Plain-text progress every 5 chunks when tqdm is unavailable.
            if not HAS_TQDM and idx % 5 == 0:
                print(f" {idx}/{len(oversized_chunks)} chunks traités...")
        except Exception as e:
            # Record the failure and continue with the remaining chunks.
            errors.append(f"Chunk {old_chunk['uuid'][:8]}: {e}")
    print()
    print("-" * 80)
    print(f"✓ Chunks supprimés: {deleted_count}")
    print(f"✓ Nouveaux chunks créés: {inserted_count}")
    if deleted_count > 0:
        print(f" Expansion moyenne: {inserted_count / deleted_count:.1f}x")
    else:
        print(f" ⚠️ Aucun chunk supprimé - vérifier les erreurs")
    # Show at most the first 10 recorded errors.
    if errors:
        print()
        print(f"⚠️ Erreurs rencontrées: {len(errors)}")
        for err in errors[:10]:
            print(f" - {err}")
        if len(errors) > 10:
            print(f" ... et {len(errors) - 10} autres")
    print()

    # ========== 3. POST-RECHUNK VERIFICATION ==========
    print("3. VÉRIFICATION POST-RECHUNKING")
    print("-" * 80)
    print()
    print("Comptage des nouveaux chunks...")
    # Second full scan: confirm no chunk above the threshold remains.
    remaining_oversized = 0
    total_chunks = 0
    for chunk in chunk_v2.iterator(include_vector=False):
        total_chunks += 1
        text = chunk.properties.get('text', '')
        tokens = estimate_tokens(text)
        if tokens > TOKEN_THRESHOLD:
            remaining_oversized += 1
    print(f"✓ Total chunks: {total_chunks:,}")
    print(f"✓ Chunks > {TOKEN_THRESHOLD} tokens: {remaining_oversized}")
    if remaining_oversized == 0:
        print()
        print("✅ Aucun chunk surdimensionné restant!")
    else:
        print()
        print(f"⚠️ {remaining_oversized} chunks encore > {TOKEN_THRESHOLD} tokens")
        print(" Relancer le script si nécessaire")
    print()

    # Final summary report.
    print("=" * 80)
    print("RE-CHUNKING TERMINÉ")
    print("=" * 80)
    print()
    print("RÉSULTATS:")
    print(f" • Chunks supprimés: {deleted_count}")
    print(f" • Nouveaux chunks créés: {inserted_count}")
    if deleted_count > 0:
        print(f" • Expansion: {inserted_count / deleted_count:.1f}x")
    print(f" • Chunks restants > {TOKEN_THRESHOLD} tokens: {remaining_oversized}")
    print()
    if remaining_oversized == 0 and deleted_count > 0:
        print("✅ RE-CHUNKING RÉUSSI")
        print()
        print("AMÉLIORATIONS:")
        print(f"{deleted_count} chunks géants éliminés")
        print(f"{inserted_count} chunks optimaux créés")
        print(f" • Taille max: {MAX_WORDS} mots (~{MAX_WORDS * 2.5:.0f} tokens)")
        print(f" • Overlap: {OVERLAP_WORDS} mots (contexte préservé)")
        print()
        print("PROCHAINES ÉTAPES:")
        print(" 1. Tester la recherche sémantique")
        print(" 2. Vérifier la qualité des vecteurs")
        print(" 3. Optionnel: Mettre à jour Summary_v2.chunksCount si nécessaire")
    elif deleted_count == 0:
        print(" Aucun chunk n'a nécessité de re-chunking")
finally:
    # Always release the Weaviate connection, including on sys.exit(0) above.
    client.close()