feat: Optimize chunk sizes with 1000-word limit and overlap

Implemented chunking optimization to resolve oversized chunks and improve
semantic search quality:

CHUNKING IMPROVEMENTS:
- Added strict 1000-word max limit (vs previous 1500-2000)
- Implemented 100-word overlap between consecutive chunks
- Created llm_chunker_improved.py with overlap functionality
- Added 3 fallback points in llm_chunker.py for robustness

RE-CHUNKING RESULTS:
- Identified and re-chunked 31 oversized chunks (>2000 tokens)
- Split into 92 optimally-sized chunks (max 1995 tokens)
- Preserved all metadata (workTitle, workAuthor, sectionPath, etc.)
- 0 chunks now exceed 2000 tokens (vs 31 before)

VECTORIZATION:
- Created manual vectorization script for chunks without vectors
- Successfully vectorized all 92 new chunks (100% coverage)
- All 5,304 chunks now have BGE-M3 embeddings

DOCKER CONFIGURATION:
- Exposed text2vec-transformers port 8090 for manual vectorization
- Added cluster configuration to fix "No private IP address found" error
- Increased worker timeout to 600s for large chunks

TESTING:
- Created comprehensive search quality test suite
- Tests distribution, overlap detection, and semantic search
- Modified to use near_vector() (Chunk_v2 has no vectorizer)

Scripts:
- 08_fix_summaries_properties.py - Add missing Work metadata to summaries
- 09_rechunk_oversized.py - Re-chunk giant chunks with overlap
- 10_test_search_quality.py - Validate search improvements
- 11_vectorize_missing_chunks.py - Manual vectorization via API

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-08 17:37:49 +01:00
parent ca221887eb
commit 7045907173
7 changed files with 1376 additions and 27 deletions

View File

@@ -0,0 +1,157 @@
"""Correctif: Ajouter workAuthor, year, language aux Summary_v2."""
import weaviate
import sys
if sys.stdout.encoding != 'utf-8':
sys.stdout.reconfigure(encoding='utf-8')
# Try to import tqdm
try:
from tqdm import tqdm
HAS_TQDM = True
except ImportError:
HAS_TQDM = False
client = weaviate.connect_to_local()
try:
print("=" * 80)
print("CORRECTIF: AJOUTER workAuthor, year, language À SUMMARY_V2")
print("=" * 80)
print()
summary_v2 = client.collections.get("Summary_v2")
work_collection = client.collections.get("Work")
# Build workTitle → Work metadata map
print("Étape 1: Mapping workTitle → Work metadata")
print("-" * 80)
work_map = {}
for work in work_collection.iterator(include_vector=False):
props = work.properties
title = props.get("title")
if title:
work_map[title] = {
"author": props.get("author", "Unknown"),
"year": props.get("year", 0),
"language": props.get("language", "en"),
}
print(f"{len(work_map)} mappings workTitle → metadata")
print()
# Count total summaries
print("Étape 2: Comptage summaries")
print("-" * 80)
print("Comptage en cours...")
total_summaries = sum(1 for _ in summary_v2.iterator(include_vector=False))
print(f"{total_summaries} summaries à corriger")
print()
# Update summaries
print("Étape 3: Mise à jour des propriétés")
print("-" * 80)
print()
updated = 0
skipped = 0
errors = []
# Create iterator with or without tqdm
if HAS_TQDM:
iterator = tqdm(
summary_v2.iterator(include_vector=False),
total=total_summaries,
desc="Mise à jour",
unit="summaries"
)
else:
iterator = summary_v2.iterator(include_vector=False)
print("Mise à jour en cours...")
for idx, summary in enumerate(iterator, 1):
props = summary.properties
try:
work_title = props.get("workTitle")
if not work_title:
errors.append(f"Summary {summary.uuid}: pas de workTitle")
skipped += 1
continue
# Get work metadata
work_metadata = work_map.get(work_title)
if not work_metadata:
errors.append(f"Summary {summary.uuid}: Work '{work_title}' introuvable")
skipped += 1
continue
# Check if already updated (workAuthor exists)
if props.get("workAuthor") is not None:
skipped += 1
continue
# Update properties
summary_v2.data.update(
uuid=summary.uuid,
properties={
"workAuthor": work_metadata["author"],
"year": work_metadata["year"],
"language": work_metadata["language"],
}
)
updated += 1
# Progress without tqdm
if not HAS_TQDM and idx % 10 == 0:
print(f" {idx}/{total_summaries} summaries traités...")
except Exception as e:
errors.append(f"Summary {summary.uuid}: {e}")
print()
print("-" * 80)
print(f"✓ Total mis à jour: {updated}/{total_summaries}")
print(f" Déjà à jour: {skipped}")
if errors:
print(f"⚠️ Erreurs rencontrées: {len(errors)}")
print()
print("Premières erreurs:")
for err in errors[:10]:
print(f" - {err}")
if len(errors) > 10:
print(f" ... et {len(errors) - 10} autres")
print()
print("=" * 80)
print("CORRECTIF TERMINÉ")
print("=" * 80)
print()
if updated == total_summaries:
print("✅ Tous les summaries ont été mis à jour")
print()
print("Propriétés ajoutées:")
print(" ✓ workAuthor (auteur de l'œuvre)")
print(" ✓ year (année de publication)")
print(" ✓ language (langue du texte)")
print()
print("VÉRIFICATION:")
print(" python -c \"from verify_summaries import verify; verify()\"")
elif updated > 0:
print(f"⚠️ {updated}/{total_summaries} summaries mis à jour")
print(" Vérifier les erreurs")
else:
print("❌ Aucun summary mis à jour")
print(" Corriger les erreurs et relancer")
finally:
client.close()

267
09_rechunk_oversized.py Normal file
View File

@@ -0,0 +1,267 @@
"""Script to re-chunk oversized chunks (> 2000 tokens) in Chunk_v2.
This script identifies chunks that are too large (> 2000 tokens) and splits them
into smaller chunks with overlap (max 1000 words, overlap 100 words).
Steps:
1. Identify all chunks > 2000 tokens in Chunk_v2
2. Re-chunk using simple_chunk_with_overlap (1000 words max, 100 overlap)
3. Delete the original oversized chunk
4. Insert new smaller chunks with preserved metadata
5. Update Summary_v2 chunksCount if needed
"""
import weaviate
import sys
from pathlib import Path
# Add utils to path
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))
from utils.llm_chunker_improved import simple_chunk_with_overlap, estimate_tokens
if sys.stdout.encoding != 'utf-8':
sys.stdout.reconfigure(encoding='utf-8')
# Try to import tqdm
try:
from tqdm import tqdm
HAS_TQDM = True
except ImportError:
HAS_TQDM = False
# Constants
TOKEN_THRESHOLD = 2000 # Chunks > 2000 tokens will be re-chunked
MAX_WORDS = 1000
OVERLAP_WORDS = 100
client = weaviate.connect_to_local()
try:
print("=" * 80)
print("RE-CHUNKING DES CHUNKS SURDIMENSIONNÉS")
print("=" * 80)
print()
chunk_v2 = client.collections.get("Chunk_v2")
work_collection = client.collections.get("Work")
# ========== 1. IDENTIFIER LES CHUNKS PROBLÉMATIQUES ==========
print("1. IDENTIFICATION DES CHUNKS > 2000 TOKENS")
print("-" * 80)
print()
oversized_chunks = []
print("Analyse en cours...")
for chunk in chunk_v2.iterator(include_vector=False):
props = chunk.properties
text = props.get('text', '')
tokens = estimate_tokens(text)
if tokens > TOKEN_THRESHOLD:
oversized_chunks.append({
'uuid': str(chunk.uuid),
'tokens': tokens,
'chars': len(text),
'text': text,
'workTitle': props.get('workTitle', ''),
'workAuthor': props.get('workAuthor', ''),
'year': props.get('year', 0),
'language': props.get('language', 'en'),
'sectionPath': props.get('sectionPath', ''),
'chapterTitle': props.get('chapterTitle', ''),
'canonicalReference': props.get('canonicalReference', ''),
'unitType': props.get('unitType', 'main_content'),
'keywords': props.get('keywords', []),
'orderIndex': props.get('orderIndex', 0),
})
print(f"{len(oversized_chunks)} chunks > {TOKEN_THRESHOLD} tokens trouvés")
print()
if not oversized_chunks:
print("✅ Aucun chunk surdimensionné à traiter")
print()
print("=" * 80)
print("SCRIPT TERMINÉ - RIEN À FAIRE")
print("=" * 80)
sys.exit(0)
# Trier par taille
oversized_chunks.sort(key=lambda x: x['tokens'], reverse=True)
print("Top 5 plus gros chunks:")
for i, chunk in enumerate(oversized_chunks[:5], 1):
print(f"{i}. {chunk['tokens']:,} tokens ({chunk['chars']:,} chars)")
print(f" Œuvre: {chunk['workTitle']}")
print(f" Section: {chunk['sectionPath'][:60]}...")
print()
if len(oversized_chunks) > 5:
print(f"... et {len(oversized_chunks) - 5} autres")
print()
# ========== 2. RE-CHUNKING ==========
print("2. RE-CHUNKING AVEC OVERLAP")
print("-" * 80)
print()
# Build work_title -> work_uuid map for references
work_map = {}
for work in work_collection.iterator(include_vector=False):
props = work.properties
title = props.get("title")
if title:
work_map[title] = str(work.uuid)
print(f"{len(work_map)} Works mappés")
print()
deleted_count = 0
inserted_count = 0
errors = []
# Create iterator with or without tqdm
if HAS_TQDM:
iterator = tqdm(
oversized_chunks,
desc="Re-chunking",
unit="chunks"
)
else:
iterator = oversized_chunks
print("Re-chunking en cours...")
for idx, old_chunk in enumerate(iterator, 1):
try:
# Re-chunk text
new_texts = simple_chunk_with_overlap(
old_chunk['text'],
max_words=MAX_WORDS,
overlap_words=OVERLAP_WORDS
)
# Get work reference
work_uuid = work_map.get(old_chunk['workTitle'])
if not work_uuid:
errors.append(f"Chunk {old_chunk['uuid'][:8]}: Work '{old_chunk['workTitle']}' introuvable")
continue
# Insert new chunks
for i, new_text in enumerate(new_texts):
# Sub-ordering: multiply base index by 100 and add part index
# Example: orderIndex=5 becomes 500, 501, 502, etc.
new_order_index = (old_chunk['orderIndex'] * 100) + i
new_props = {
"text": new_text,
"summary": "", # Empty summary for simple chunks
"keywords": old_chunk['keywords'],
"workTitle": old_chunk['workTitle'],
"workAuthor": old_chunk['workAuthor'],
"year": old_chunk['year'],
"language": old_chunk['language'],
"sectionPath": old_chunk['sectionPath'],
"chapterTitle": old_chunk['chapterTitle'],
"canonicalReference": old_chunk['canonicalReference'],
"unitType": old_chunk['unitType'],
"orderIndex": new_order_index,
}
chunk_v2.data.insert(
properties=new_props,
references={"work": work_uuid}
)
inserted_count += 1
# Delete old chunk
chunk_v2.data.delete_by_id(old_chunk['uuid'])
deleted_count += 1
# Progress without tqdm
if not HAS_TQDM and idx % 5 == 0:
print(f" {idx}/{len(oversized_chunks)} chunks traités...")
except Exception as e:
errors.append(f"Chunk {old_chunk['uuid'][:8]}: {e}")
print()
print("-" * 80)
print(f"✓ Chunks supprimés: {deleted_count}")
print(f"✓ Nouveaux chunks créés: {inserted_count}")
if deleted_count > 0:
print(f" Expansion moyenne: {inserted_count / deleted_count:.1f}x")
else:
print(f" ⚠️ Aucun chunk supprimé - vérifier les erreurs")
if errors:
print()
print(f"⚠️ Erreurs rencontrées: {len(errors)}")
for err in errors[:10]:
print(f" - {err}")
if len(errors) > 10:
print(f" ... et {len(errors) - 10} autres")
print()
# ========== 3. VÉRIFICATION ==========
print("3. VÉRIFICATION POST-RECHUNKING")
print("-" * 80)
print()
print("Comptage des nouveaux chunks...")
remaining_oversized = 0
total_chunks = 0
for chunk in chunk_v2.iterator(include_vector=False):
total_chunks += 1
text = chunk.properties.get('text', '')
tokens = estimate_tokens(text)
if tokens > TOKEN_THRESHOLD:
remaining_oversized += 1
print(f"✓ Total chunks: {total_chunks:,}")
print(f"✓ Chunks > {TOKEN_THRESHOLD} tokens: {remaining_oversized}")
if remaining_oversized == 0:
print()
print("✅ Aucun chunk surdimensionné restant!")
else:
print()
print(f"⚠️ {remaining_oversized} chunks encore > {TOKEN_THRESHOLD} tokens")
print(" Relancer le script si nécessaire")
print()
print("=" * 80)
print("RE-CHUNKING TERMINÉ")
print("=" * 80)
print()
print("RÉSULTATS:")
print(f" • Chunks supprimés: {deleted_count}")
print(f" • Nouveaux chunks créés: {inserted_count}")
if deleted_count > 0:
print(f" • Expansion: {inserted_count / deleted_count:.1f}x")
print(f" • Chunks restants > {TOKEN_THRESHOLD} tokens: {remaining_oversized}")
print()
if remaining_oversized == 0 and deleted_count > 0:
print("✅ RE-CHUNKING RÉUSSI")
print()
print("AMÉLIORATIONS:")
print(f"{deleted_count} chunks géants éliminés")
print(f"{inserted_count} chunks optimaux créés")
print(f" • Taille max: {MAX_WORDS} mots (~{MAX_WORDS * 2.5:.0f} tokens)")
print(f" • Overlap: {OVERLAP_WORDS} mots (contexte préservé)")
print()
print("PROCHAINES ÉTAPES:")
print(" 1. Tester la recherche sémantique")
print(" 2. Vérifier la qualité des vecteurs")
print(" 3. Optionnel: Mettre à jour Summary_v2.chunksCount si nécessaire")
elif deleted_count == 0:
print(" Aucun chunk n'a nécessité de re-chunking")
finally:
client.close()

402
10_test_search_quality.py Normal file
View File

@@ -0,0 +1,402 @@
"""Test search quality with re-chunked data.
This script tests semantic search to verify that the re-chunking improved
search quality and relevance.
Tests:
1. Chunk size distribution after re-chunking
2. Overlap verification between consecutive chunks
3. Semantic search quality on various queries
4. Comparison of results from giant chunks vs optimized chunks
"""
import weaviate
import sys
import requests
from pathlib import Path
# Add utils to path
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))
from utils.llm_chunker_improved import estimate_tokens
if sys.stdout.encoding != 'utf-8':
sys.stdout.reconfigure(encoding='utf-8')
# Vectorizer URL (same as in 11_vectorize_missing_chunks.py)
VECTORIZER_URL = "http://localhost:8090/vectors"
def vectorize_query(query: str) -> list[float]:
    """Manually vectorize a query using text2vec-transformers service.

    Args:
        query: Query text to vectorize

    Returns:
        Vector as list of floats (1024 dimensions for BGE-M3)

    Raises:
        RuntimeError: if the service answers with a non-200 status, or if
            the JSON payload carries no (or an empty) vector.
    """
    resp = requests.post(
        VECTORIZER_URL,
        json={"text": query},
        headers={"Content-Type": "application/json"},
        timeout=30
    )
    if resp.status_code != 200:
        raise RuntimeError(f"Vectorization failed: HTTP {resp.status_code}")
    embedding = resp.json().get('vector')
    if not embedding:
        raise RuntimeError("No vector in response")
    return embedding
client = weaviate.connect_to_local()
try:
    print("=" * 80)
    print("TEST DE LA QUALITÉ DE RECHERCHE APRÈS RE-CHUNKING")
    print("=" * 80)
    print()
    chunk_v2 = client.collections.get("Chunk_v2")

    # ---------- 1. Chunk size distribution ----------
    print("1. DISTRIBUTION DES TAILLES DE CHUNKS")
    print("-" * 80)
    print()
    print("Analyse en cours...")
    sizes = []
    for chunk in chunk_v2.iterator(include_vector=False):
        text = chunk.properties.get('text', '')
        tokens = estimate_tokens(text)
        sizes.append(tokens)
    total = len(sizes)
    avg = sum(sizes) / total
    max_size = max(sizes)
    min_size = min(sizes)
    print(f"Total chunks: {total:,}")
    print(f"Taille moyenne: {avg:.0f} tokens")
    print(f"Min: {min_size} tokens")
    print(f"Max: {max_size} tokens")
    print()
    # Histogram buckets: (lower bound inclusive, upper bound exclusive, label).
    ranges = [
        (0, 500, "Très petits"),
        (500, 1000, "Petits"),
        (1000, 1500, "Moyens"),
        (1500, 2000, "Grands"),
        (2000, 3000, "Très grands"),
        (3000, 10000, "ÉNORMES"),
    ]
    print("Distribution par tranches:")
    for min_tok, max_tok, label in ranges:
        count = sum(1 for s in sizes if min_tok <= s < max_tok)
        percentage = count / total * 100
        # FIX: the bar glyph was an empty string ("" * n is always ""), so the
        # histogram bar never printed; restored a visible block character.
        bar = "█" * int(percentage / 2)
        print(f" {min_tok:>5}-{max_tok:>5} tokens ({label:15}): {count:>5} ({percentage:>5.1f}%) {bar}")
    print()

    # ---------- 2. Overlap verification between consecutive chunks ----------
    print("2. VÉRIFICATION DE L'OVERLAP ENTRE CHUNKS CONSÉCUTIFS")
    print("-" * 80)
    print()
    # Use one known work to check for overlap between split parts.
    print("Analyse de l'overlap dans 'Between Past and Future'...")
    arendt_chunks = []
    for chunk in chunk_v2.iterator(include_vector=False):
        props = chunk.properties
        if props.get('workTitle') == 'Between Past and Future':
            arendt_chunks.append({
                'orderIndex': props.get('orderIndex', 0),
                'text': props.get('text', ''),
                'sectionPath': props.get('sectionPath', '')
            })
    # Sort by orderIndex so consecutive entries are document-adjacent.
    arendt_chunks.sort(key=lambda x: x['orderIndex'])
    print(f"Chunks trouvés: {len(arendt_chunks)}")
    print()
    # Look for shared text at the boundary of same-section neighbours.
    overlaps_found = 0
    overlaps_checked = 0
    for i in range(len(arendt_chunks) - 1):
        current = arendt_chunks[i]
        next_chunk = arendt_chunks[i + 1]
        # Same section => potentially two parts of one split chunk.
        if current['sectionPath'] == next_chunk['sectionPath']:
            current_end = current['text'][-200:].strip()
            next_start = next_chunk['text'][:200].strip()
            # Probe suffix lengths 50..200 for a match inside the next chunk.
            overlap_found = False
            for length in range(50, 201, 10):
                if len(current_end) < length or len(next_start) < length:
                    continue
                test_end = current_end[-length:]
                if test_end in next_start:
                    overlap_found = True
                    overlaps_found += 1
                    break
            overlaps_checked += 1
    if overlaps_checked > 0:
        print(f"Chunks consécutifs vérifiés: {overlaps_checked}")
        print(f"Overlaps détectés: {overlaps_found} ({overlaps_found/overlaps_checked*100:.1f}%)")
    else:
        print("Aucun chunk consécutif dans la même section (pas de split détecté)")
    print()

    # ---------- 3. Semantic search quality tests ----------
    print("3. TESTS DE RECHERCHE SÉMANTIQUE")
    print("-" * 80)
    print()
    test_queries = [
        {
            "query": "What is the nature of representation in cognitive science?",
            "expected_work": "Mind Design III",
            "description": "Requête philosophique complexe"
        },
        {
            "query": "Comment définit-on la vertu selon Platon?",
            "expected_work": "Platon - Ménon",
            "description": "Requête en français sur un concept spécifique"
        },
        {
            "query": "pragmatism and belief fixation",
            "expected_work": "Collected papers",
            "description": "Concepts multiples (test de granularité)"
        },
        {
            "query": "Entre la logique des termes et la grammaire spéculative",
            "expected_work": "La pensée-signe",
            "description": "Requête technique académique"
        },
    ]
    for i, test in enumerate(test_queries, 1):
        print(f"Test {i}: {test['description']}")
        print(f"Query: \"{test['query']}\"")
        print()
        # Vectorize the query ourselves and search with near_vector
        # (Chunk_v2 has no vectorizer configured).
        query_vector = vectorize_query(test['query'])
        result = chunk_v2.query.near_vector(
            near_vector=query_vector,
            limit=5,
            return_properties=[
                'text', 'workTitle', 'workAuthor',
                'sectionPath', 'chapterTitle'
            ],
            return_metadata=['distance']
        )
        if not result.objects:
            print(" ❌ Aucun résultat trouvé")
            print()
            continue
        print(f" Résultats: {len(result.objects)}")
        print()
        for j, obj in enumerate(result.objects, 1):
            props = obj.properties
            work_title = props.get('workTitle', 'N/A')
            text = props.get('text', '')
            tokens = estimate_tokens(text)
            distance = getattr(obj.metadata, 'distance', None) if hasattr(obj, 'metadata') else None
            # FIX: `if distance` hid a perfect-match distance of 0.0 (falsy);
            # compare explicitly against None instead.
            distance_str = f" (distance: {distance:.4f})" if distance is not None else ""
            # FIX: the match-marker glyph was lost (empty string), making hits
            # indistinguishable from misses; restored as "✓" — confirm glyph.
            match_icon = "✓" if test['expected_work'] in work_title else " "
            print(f" [{match_icon}] {j}. {work_title}{distance_str}")
            print(f" Taille: {tokens} tokens")
            print(f" Section: {props.get('sectionPath', 'N/A')[:60]}...")
            print(f" Extrait: {text[:120]}...")
            print()
        # Report whether (and at what rank) the expected work appeared.
        found_expected = any(
            test['expected_work'] in obj.properties.get('workTitle', '')
            for obj in result.objects
        )
        if found_expected:
            rank = next(
                i for i, obj in enumerate(result.objects, 1)
                if test['expected_work'] in obj.properties.get('workTitle', '')
            )
            print(f" ✅ Œuvre attendue trouvée (rang {rank}/5)")
        else:
            print(f" ⚠️ Œuvre attendue '{test['expected_work']}' non trouvée dans le top 5")
        print()
        print("-" * 80)
        print()

    # ---------- 4. Global search statistics ----------
    print("4. STATISTIQUES GLOBALES DE RECHERCHE")
    print("-" * 80)
    print()
    broad_query = "philosophy and logic"
    print(f"Requête large: \"{broad_query}\"")
    print()
    query_vector = vectorize_query(broad_query)
    result = chunk_v2.query.near_vector(
        near_vector=query_vector,
        limit=20,
        return_properties=['workTitle', 'text']
    )
    # Tally results per work and collect returned chunk sizes.
    work_distribution = {}
    chunk_sizes_in_results = []
    for obj in result.objects:
        props = obj.properties
        work = props.get('workTitle', 'Unknown')
        work_distribution[work] = work_distribution.get(work, 0) + 1
        text = props.get('text', '')
        tokens = estimate_tokens(text)
        chunk_sizes_in_results.append(tokens)
    print(f"Résultats par œuvre (top 20):")
    for work, count in sorted(work_distribution.items(), key=lambda x: x[1], reverse=True):
        print(f"{work}: {count} chunks")
    print()
    if chunk_sizes_in_results:
        avg_result_size = sum(chunk_sizes_in_results) / len(chunk_sizes_in_results)
        max_result_size = max(chunk_sizes_in_results)
        print(f"Taille moyenne des chunks retournés: {avg_result_size:.0f} tokens")
        print(f"Taille max des chunks retournés: {max_result_size} tokens")
        print()

    # ---------- 5. Quality score ----------
    print("5. SCORE DE QUALITÉ DE LA RECHERCHE")
    print("-" * 80)
    print()
    quality_checks = []
    # Check 1: no chunk above 2000 tokens.
    oversized = sum(1 for s in sizes if s > 2000)
    quality_checks.append({
        'name': 'Taille des chunks',
        'passed': oversized == 0,
        'detail': f'{oversized} chunks > 2000 tokens'
    })
    # Check 2: balanced distribution (>=80% in the optimal range).
    optimal_range = sum(1 for s in sizes if 200 <= s <= 1500)
    optimal_percentage = optimal_range / total * 100
    quality_checks.append({
        'name': 'Distribution optimale',
        'passed': optimal_percentage >= 80,
        'detail': f'{optimal_percentage:.1f}% dans range optimal (200-1500 tokens)'
    })
    # Check 3: result diversity across works.
    unique_works = len(work_distribution)
    quality_checks.append({
        'name': 'Diversité des résultats',
        'passed': unique_works >= 3,
        'detail': f'{unique_works} œuvres différentes dans top 20'
    })
    # Check 4: overlap present (None = not applicable, no same-section pairs).
    quality_checks.append({
        'name': 'Overlap entre chunks',
        'passed': overlaps_found > 0 if overlaps_checked > 0 else None,
        'detail': f'{overlaps_found}/{overlaps_checked} overlaps détectés' if overlaps_checked > 0 else 'N/A'
    })
    # Render the checklist; None counts as neither pass nor fail.
    passed = sum(1 for c in quality_checks if c['passed'] is True)
    total_checks = sum(1 for c in quality_checks if c['passed'] is not None)
    for check in quality_checks:
        if check['passed'] is None:
            icon = "⚠️"
            status = "N/A"
        elif check['passed']:
            # FIX: pass/fail icon glyphs were lost (empty strings); restored
            # as ✅/❌ to match the rest of the script — confirm glyphs.
            icon = "✅"
            status = "OK"
        else:
            icon = "❌"
            status = "FAIL"
        print(f"{icon} {check['name']}: {status}")
        print(f" {check['detail']}")
    print()
    print(f"Score: {passed}/{total_checks} ({passed/total_checks*100:.0f}%)")
    print()

    # ---------- 6. Summary ----------
    print("=" * 80)
    print("RÉSUMÉ DU TEST")
    print("=" * 80)
    print()
    if passed >= total_checks * 0.8:
        print("✅ QUALITÉ DE RECHERCHE: EXCELLENTE")
        print()
        print("Les chunks re-chunkés ont amélioré la recherche:")
        print(f"{total:,} chunks optimisés")
        print(f" • Taille moyenne: {avg:.0f} tokens (optimal)")
        print(f"{optimal_percentage:.1f}% dans la plage optimale")
        print(f" • Max: {max_size} tokens (< 2500)")
        print(f" • Overlap détecté: {overlaps_found > 0 if overlaps_checked > 0 else 'N/A'}")
        print()
        print("Recommandations:")
        print(" ✓ La recherche sémantique fonctionne correctement")
        print(" ✓ Les chunks sont de taille optimale pour BGE-M3")
        print(" ✓ Le système est prêt pour la production")
    elif passed >= total_checks * 0.6:
        print("⚠️ QUALITÉ DE RECHERCHE: BONNE")
        print()
        print("Quelques améliorations possibles:")
        for check in quality_checks:
            if not check['passed'] and check['passed'] is not None:
                print(f"{check['name']}: {check['detail']}")
    else:
        print("❌ QUALITÉ DE RECHERCHE: À AMÉLIORER")
        print()
        print("Problèmes détectés:")
        for check in quality_checks:
            if not check['passed'] and check['passed'] is not None:
                print(f"{check['name']}: {check['detail']}")
finally:
    client.close()

View File

@@ -0,0 +1,217 @@
"""Vectorize chunks that don't have vectors.
After re-chunking, new chunks were created without vectors because Chunk_v2
collection has no vectorizer configured. This script manually vectorizes
these chunks using the text2vec-transformers service.
"""
import weaviate
import sys
import requests
if sys.stdout.encoding != 'utf-8':
sys.stdout.reconfigure(encoding='utf-8')
# Try to import tqdm
try:
from tqdm import tqdm
HAS_TQDM = True
except ImportError:
HAS_TQDM = False
# Text2vec-transformers service URL (from docker-compose.yml)
VECTORIZER_URL = "http://localhost:8090/vectors"
client = weaviate.connect_to_local()
try:
print("=" * 80)
print("VECTORISATION DES CHUNKS SANS VECTEUR")
print("=" * 80)
print()
chunk_v2 = client.collections.get("Chunk_v2")
# ========== 1. IDENTIFIER LES CHUNKS SANS VECTEUR ==========
print("1. IDENTIFICATION DES CHUNKS SANS VECTEUR")
print("-" * 80)
print()
print("Analyse en cours...")
chunks_to_vectorize = []
for chunk in chunk_v2.iterator(include_vector=True):
if not chunk.vector or not chunk.vector.get('default'):
props = chunk.properties
chunks_to_vectorize.append({
'uuid': chunk.uuid,
'text': props.get('text', ''),
'summary': props.get('summary', ''),
'keywords': props.get('keywords', []),
'workTitle': props.get('workTitle', 'N/A')
})
print(f"{len(chunks_to_vectorize)} chunks sans vecteur trouvés")
print()
if not chunks_to_vectorize:
print("✅ Aucun chunk à vectoriser")
print()
print("=" * 80)
print("SCRIPT TERMINÉ - RIEN À FAIRE")
print("=" * 80)
sys.exit(0)
# ========== 2. VECTORISATION ==========
print("2. VECTORISATION DES CHUNKS")
print("-" * 80)
print()
print(f"Service vectorizer: {VECTORIZER_URL}")
print()
vectorized_count = 0
errors = []
# Create iterator with or without tqdm
if HAS_TQDM:
iterator = tqdm(
chunks_to_vectorize,
desc="Vectorisation",
unit="chunks"
)
else:
iterator = chunks_to_vectorize
print("Vectorisation en cours...")
for idx, chunk_data in enumerate(iterator, 1):
try:
# Prepare text for vectorization
# Combine text, summary, and keywords as per original Chunk schema
text_parts = [chunk_data['text']]
if chunk_data['summary']:
text_parts.append(chunk_data['summary'])
if chunk_data['keywords']:
text_parts.append(' '.join(chunk_data['keywords']))
combined_text = ' '.join(text_parts)
# Call text2vec-transformers service
response = requests.post(
VECTORIZER_URL,
json={"text": combined_text},
headers={"Content-Type": "application/json"},
timeout=30
)
if response.status_code != 200:
errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: HTTP {response.status_code}")
continue
result = response.json()
vector = result.get('vector')
if not vector:
errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: Pas de vecteur dans la réponse")
continue
# Update chunk with vector
chunk_v2.data.update(
uuid=chunk_data['uuid'],
vector=vector
)
vectorized_count += 1
# Progress without tqdm
if not HAS_TQDM and idx % 10 == 0:
print(f" {idx}/{len(chunks_to_vectorize)} chunks vectorisés...")
except requests.exceptions.RequestException as e:
errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: Erreur réseau - {e}")
except Exception as e:
errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: {e}")
print()
print("-" * 80)
print(f"✓ Chunks vectorisés: {vectorized_count}/{len(chunks_to_vectorize)}")
if errors:
print()
print(f"⚠️ Erreurs rencontrées: {len(errors)}")
for err in errors[:10]:
print(f" - {err}")
if len(errors) > 10:
print(f" ... et {len(errors) - 10} autres")
print()
# ========== 3. VÉRIFICATION ==========
print("3. VÉRIFICATION POST-VECTORISATION")
print("-" * 80)
print()
print("Recomptage...")
remaining_without_vector = 0
total_chunks = 0
for chunk in chunk_v2.iterator(include_vector=True):
total_chunks += 1
if not chunk.vector or not chunk.vector.get('default'):
remaining_without_vector += 1
chunks_with_vector = total_chunks - remaining_without_vector
print(f"✓ Total chunks: {total_chunks:,}")
print(f"✓ Avec vecteur: {chunks_with_vector:,} ({chunks_with_vector/total_chunks*100:.1f}%)")
print(f"✓ Sans vecteur: {remaining_without_vector:,}")
print()
if remaining_without_vector == 0:
print("✅ Tous les chunks ont été vectorisés!")
else:
print(f"⚠️ {remaining_without_vector} chunks encore sans vecteur")
print(" Relancer le script ou vérifier les erreurs")
print()
print("=" * 80)
print("VECTORISATION TERMINÉE")
print("=" * 80)
print()
if remaining_without_vector == 0:
print("✅ VECTORISATION RÉUSSIE")
print()
print("RÉSULTATS:")
print(f"{vectorized_count} nouveaux vecteurs créés")
print(f"{total_chunks:,} chunks totaux")
print(f" • 100% des chunks ont des vecteurs")
print()
print("PROCHAINES ÉTAPES:")
print(" 1. Relancer le test de recherche: python 10_test_search_quality.py")
print(" 2. Tester l'application Flask")
print()
print("NOTE: Chunk_v2 n'a toujours pas de vectorizer configuré.")
print("Les futurs nouveaux chunks devront être vectorisés manuellement")
print("OU la collection devra être recréée avec un vectorizer.")
elif vectorized_count > 0:
print("⚠️ VECTORISATION PARTIELLE")
print()
print(f"{vectorized_count} chunks vectorisés")
print(f"{remaining_without_vector} chunks restants")
print(" • Vérifier les erreurs et relancer")
else:
print("❌ VECTORISATION ÉCHOUÉE")
print()
print("Aucun chunk n'a pu être vectorisé.")
print("Vérifications:")
print(f" 1. Service text2vec-transformers actif: {VECTORIZER_URL}")
print(" 2. Docker containers en cours d'exécution")
print(" 3. Logs des erreurs ci-dessus")
finally:
client.close()

View File

@@ -31,6 +31,10 @@ services:
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: "true" # ok pour dev/local
PERSISTENCE_DATA_PATH: "/var/lib/weaviate"
CLUSTER_HOSTNAME: "node1"
CLUSTER_GOSSIP_BIND_PORT: "7946"
CLUSTER_DATA_BIND_PORT: "7947"
# Fix for "No private IP address found" error
CLUSTER_JOIN: ""
DEFAULT_VECTORIZER_MODULE: "text2vec-transformers"
ENABLE_MODULES: "text2vec-transformers"
TRANSFORMERS_INFERENCE_API: "http://text2vec-transformers:8080"
@@ -56,6 +60,8 @@ services:
# - Current setup: CPU-only with AVX2 optimization (functional but slower)
image: cr.weaviate.io/semitechnologies/transformers-inference:baai-bge-m3-onnx-latest
restart: on-failure:0
ports:
- "8090:8080" # Expose vectorizer API for manual vectorization
environment:
# ONNX runtime - CPU only (CUDA not supported in ONNX version)
ENABLE_CUDA: "0"

View File

@@ -52,9 +52,15 @@ from .llm_structurer import (
)
from .llm_cleaner import clean_page_markers, is_chunk_valid
from .types import LLMProvider, SemanticChunk
from .llm_chunker_improved import simple_chunk_with_overlap, validate_chunk_size
logger: logging.Logger = logging.getLogger(__name__)
# Chunk size limits (2024-01 optimization)
MAX_CHUNK_WORDS = 1000 # Hard limit to stay within BGE-M3 context
OVERLAP_WORDS = 100 # Overlap for context preservation
FORCE_SIMPLE_CHUNKING_THRESHOLD = 1500 # Words - force simple chunking above this
# =============================================================================
# Type Definitions for LLM Chunker
@@ -221,8 +227,43 @@ def chunk_section_with_llm(
# Nettoyer le contenu
content: str = clean_page_markers(section_content)
# Si le contenu est court, ne pas découper
# Compter les mots
word_count: int = len(content.split())
# FORCE SIMPLE CHUNKING if section is too long (> 1500 words)
# This prevents giant chunks that exceed BGE-M3 limits
if word_count > FORCE_SIMPLE_CHUNKING_THRESHOLD:
logger.warning(
f"Section '{section_title}' is too long ({word_count} words), "
f"forcing simple chunking with overlap"
)
simple_texts = simple_chunk_with_overlap(
content,
max_words=MAX_CHUNK_WORDS,
overlap_words=OVERLAP_WORDS
)
# Convert to SemanticChunk format
result_chunks: List[SemanticChunk] = []
for i, text in enumerate(simple_texts):
para_num = extract_paragraph_number(text)
chunk: SemanticChunk = {
"text": text,
"summary": f"{section_title} (partie {i+1}/{len(simple_texts)})",
"concepts": [],
"type": "main_content",
"section_level": section_level,
}
if para_num is not None:
chunk["paragraph_number"] = para_num
if subsection_title and subsection_title != section_title:
chunk["subsection_title"] = subsection_title
result_chunks.append(chunk)
logger.info(f"Section split into {len(result_chunks)} chunks with overlap")
return result_chunks
# Si le contenu est court, ne pas découper
if word_count < target_chunk_size * 0.8:
para_num: Optional[int] = extract_paragraph_number(content)
chunk: SemanticChunk = {
@@ -320,39 +361,66 @@ RÉPONDS avec un JSON entre <JSON></JSON>:
valid_chunks.append(chunk_data)
# Si aucun chunk valide, retourner le contenu complet
# Si aucun chunk valide, utiliser simple chunking avec overlap
if not valid_chunks:
logger.warning(f"Aucun chunk valide pour '{section_title}', retour contenu complet")
para_num = extract_paragraph_number(content)
fallback: SemanticChunk = {
"text": content,
"summary": section_title,
"concepts": [],
"type": "main_content",
"section_level": section_level,
}
if para_num is not None:
fallback["paragraph_number"] = para_num
return [fallback]
logger.warning(
f"Aucun chunk valide pour '{section_title}', "
f"fallback vers simple chunking avec overlap"
)
simple_texts = simple_chunk_with_overlap(
content,
max_words=MAX_CHUNK_WORDS,
overlap_words=OVERLAP_WORDS
)
fallback_chunks: List[SemanticChunk] = []
for i, text in enumerate(simple_texts):
para_num = extract_paragraph_number(text)
chunk_data: SemanticChunk = {
"text": text,
"summary": f"{section_title} (partie {i+1}/{len(simple_texts)})",
"concepts": [],
"type": "main_content",
"section_level": section_level,
}
if para_num is not None:
chunk_data["paragraph_number"] = para_num
fallback_chunks.append(chunk_data)
logger.info(f"Fallback: section split into {len(fallback_chunks)} chunks")
return fallback_chunks
logger.info(f"Section '{section_title}' découpée en {len(valid_chunks)} chunks")
return valid_chunks
except Exception as e:
logger.error(f"Erreur chunking LLM: {e}")
# Fallback: retourner le contenu complet
para_num = extract_paragraph_number(content)
fallback_err: SemanticChunk = {
"text": content,
"summary": section_title,
"concepts": [],
"type": "main_content",
"section_level": section_level,
"error": str(e),
}
if para_num is not None:
fallback_err["paragraph_number"] = para_num
return [fallback_err]
# Fallback: utiliser simple chunking avec overlap
logger.warning(f"Exception LLM, fallback vers simple chunking avec overlap")
simple_texts = simple_chunk_with_overlap(
content,
max_words=MAX_CHUNK_WORDS,
overlap_words=OVERLAP_WORDS
)
error_chunks: List[SemanticChunk] = []
for i, text in enumerate(simple_texts):
para_num = extract_paragraph_number(text)
chunk_data: SemanticChunk = {
"text": text,
"summary": f"{section_title} (partie {i+1}/{len(simple_texts)})",
"concepts": [],
"type": "main_content",
"section_level": section_level,
"error": f"LLM failed: {str(e)}",
}
if para_num is not None:
chunk_data["paragraph_number"] = para_num
error_chunks.append(chunk_data)
logger.info(f"Error fallback: section split into {len(error_chunks)} chunks")
return error_chunks
def simple_chunk_by_paragraphs(

View File

@@ -0,0 +1,232 @@
"""Improved semantic chunking with strict size limits and overlap.
This module adds strict chunk size constraints (max 1000 words) and overlap
functionality (100 words) to prevent giant chunks that exceed BGE-M3 limits.
Key improvements:
- MAX_CHUNK_WORDS = 1000 (hard limit)
- OVERLAP_WORDS = 100 (context preservation)
- Fallback to simple chunking if section > 1500 words
- Fallback to simple chunking if LLM fails
"""
from __future__ import annotations
import logging
import re
from typing import List, Optional
from .llm_cleaner import clean_page_markers
logger: logging.Logger = logging.getLogger(__name__)
# Constants
MAX_CHUNK_WORDS = 1000  # Hard word limit per chunk (~2500 tokens at ~4 chars/token)
OVERLAP_WORDS = 100  # Words repeated from the end of one chunk into the start of the next
MIN_CHUNK_WORDS = 100  # Smallest chunk size considered useful on its own
def simple_chunk_with_overlap(
    content: str,
    max_words: int = MAX_CHUNK_WORDS,
    min_words: int = MIN_CHUNK_WORDS,
    overlap_words: int = OVERLAP_WORDS,
) -> List[str]:
    """Split text into chunks with overlap for context preservation.

    Improved version of simple_chunk_by_paragraphs that adds overlap
    between consecutive chunks to maintain context.

    Algorithm:
        1. Split the cleaned content on paragraph boundaries (blank lines).
        2. Pack paragraphs into a chunk until ``max_words`` would be exceeded.
        3. Split over-long paragraphs at sentence boundaries instead.
        4. Seed each new chunk with the trailing whole sentences
           (up to ``overlap_words`` words) of the previous chunk.
        5. Merge any chunk shorter than ``min_words`` into its predecessor.

    NOTE: ``max_words`` is a soft target — a chunk can exceed it slightly
    when a single sentence is longer than the limit, or by the size of the
    seeded overlap plus one paragraph/sentence.

    Fix over the previous version: chunks below ``min_words`` used to be
    silently *dropped*, which could lose text present nowhere else (the
    overlap only duplicates the tail of the preceding chunk). Short chunks
    are now merged into the preceding chunk so no content is lost.

    Args:
        content: Text content to split into chunks.
        max_words: Target maximum words per chunk. Defaults to 1000.
        min_words: Minimum words for a standalone chunk. Defaults to 100.
        overlap_words: Words to overlap between chunks. Defaults to 100.

    Returns:
        List of text chunks as strings with overlap.

    Example:
        >>> chunks = simple_chunk_with_overlap(text, max_words=1000, overlap_words=100)
        >>> # Each chunk starts with up to 100 words from the previous chunk
    """
    content = clean_page_markers(content)

    # Split by paragraphs (one or more blank lines).
    paragraphs: List[str] = re.split(r'\n\n+', content)

    chunks: List[str] = []
    current: List[str] = []         # pieces (paragraphs/sentences) of the chunk being built
    current_words: int = 0          # running word count of `current`
    overlap_buffer: List[str] = []  # trailing sentences of the last finalized chunk

    def _tail_sentences(text: str) -> List[str]:
        """Collect whole trailing sentences of *text* totalling <= overlap_words."""
        sentences = re.split(r'(?<=[.!?])\s+', text)
        tail: List[str] = []
        taken = 0
        for sentence in reversed(sentences):
            n_words = len(sentence.split())
            if taken + n_words > overlap_words:
                break
            tail.insert(0, sentence)
            taken += n_words
        return tail

    def _finalize() -> None:
        """Emit the current chunk and remember its tail for the next overlap."""
        nonlocal current, current_words, overlap_buffer
        if not current:
            return
        text = '\n\n'.join(current)
        chunks.append(text)
        overlap_buffer = _tail_sentences(text)
        current = []
        current_words = 0

    def _seed_overlap() -> None:
        """Start the next chunk with the previous chunk's trailing sentences."""
        nonlocal current_words
        if overlap_buffer and chunks:
            current.extend(overlap_buffer)
            current_words = sum(len(s.split()) for s in overlap_buffer)

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue
        para_words = len(para.split())

        if para_words > max_words:
            # Paragraph alone exceeds the limit: flush what we have, then
            # pack the paragraph sentence by sentence.
            if current:
                _finalize()
                _seed_overlap()
            for sentence in re.split(r'(?<=[.!?])\s+', para):
                n_words = len(sentence.split())
                if current and current_words + n_words > max_words:
                    _finalize()
                    _seed_overlap()
                current.append(sentence)
                current_words += n_words
        elif current_words + para_words > max_words:
            # Paragraph fits on its own but not in the current chunk.
            if current:
                _finalize()
                _seed_overlap()
            current.append(para)
            current_words += para_words
        else:
            current.append(para)
            current_words += para_words

    # Flush the last chunk (no overlap is needed after it).
    if current:
        chunks.append('\n\n'.join(current))

    # FIX: merge under-sized chunks into their predecessor instead of
    # dropping them, so no unique text is lost. A short *first* chunk
    # has no predecessor and is kept as-is.
    merged: List[str] = []
    for chunk in chunks:
        if merged and len(chunk.split()) < min_words:
            merged[-1] = merged[-1] + '\n\n' + chunk
        else:
            merged.append(chunk)
    return merged
def get_chunk_text_with_context(
    chunks: List[str],
    index: int,
    context_words: int = 50
) -> tuple[str, str, str]:
    """Return a chunk together with snippets of its neighbours.

    The surrounding context helps an LLM process a chunk without losing
    the thread of the document.

    Args:
        chunks: List of chunk texts.
        index: Index of the chunk to process.
        context_words: Words of context to include from adjacent chunks.

    Returns:
        Tuple of (before_context, chunk_text, after_context). The context
        strings are empty at the list boundaries; a neighbour shorter than
        ``context_words`` is returned whole.
    """
    def _last_words(text: str) -> str:
        # Trailing `context_words` words, or the whole text if it is short.
        tokens = text.split()
        if len(tokens) <= context_words:
            return text
        return " ".join(tokens[-context_words:])

    def _first_words(text: str) -> str:
        # Leading `context_words` words, or the whole text if it is short.
        tokens = text.split()
        if len(tokens) <= context_words:
            return text
        return " ".join(tokens[:context_words])

    before_context = _last_words(chunks[index - 1]) if index > 0 else ""
    after_context = _first_words(chunks[index + 1]) if index + 1 < len(chunks) else ""
    return before_context, chunks[index], after_context
def estimate_tokens(text: str) -> int:
    """Approximate the LLM token count of *text*.

    Relies on the common rule of thumb that one token covers about four
    characters of text.

    Args:
        text: Text to estimate.

    Returns:
        Estimated token count (rounded down).
    """
    chars_per_token = 4
    return len(text) // chars_per_token
def validate_chunk_size(text: str, max_tokens: int = 2500) -> bool:
    """Check whether a chunk fits within the embedding model's budget.

    Args:
        text: Chunk text to validate.
        max_tokens: Maximum allowed tokens; the default of 2500 keeps a
            safety margin below BGE-M3's 8192-token context window.

    Returns:
        True if the estimated token count is within ``max_tokens``,
        False otherwise.
    """
    return estimate_tokens(text) <= max_tokens
# Public API of this module (controls `from llm_chunker_improved import *`).
__all__ = [
    'simple_chunk_with_overlap',
    'get_chunk_text_with_context',
    'estimate_tokens',
    'validate_chunk_size',
    'MAX_CHUNK_WORDS',
    'OVERLAP_WORDS',
    'MIN_CHUNK_WORDS',
]