feat: Optimize chunk sizes with 1000-word limit and overlap

Implemented chunking optimization to resolve oversized chunks and improve
semantic search quality:

CHUNKING IMPROVEMENTS:
- Added strict 1000-word max limit (vs previous 1500-2000)
- Implemented 100-word overlap between consecutive chunks
- Created llm_chunker_improved.py with overlap functionality
- Added 3 fallback points in llm_chunker.py for robustness

RE-CHUNKING RESULTS:
- Identified and re-chunked 31 oversized chunks (>2000 tokens)
- Split into 92 optimally-sized chunks (max 1995 tokens)
- Preserved all metadata (workTitle, workAuthor, sectionPath, etc.)
- 0 chunks now exceed 2000 tokens (vs 31 before)

VECTORIZATION:
- Created manual vectorization script for chunks without vectors
- Successfully vectorized all 92 new chunks (100% coverage)
- All 5,304 chunks now have BGE-M3 embeddings

DOCKER CONFIGURATION:
- Exposed text2vec-transformers port 8090 for manual vectorization
- Added cluster configuration to fix the "No private IP address found" error
- Increased worker timeout to 600s for large chunks

TESTING:
- Created comprehensive search quality test suite
- Tests distribution, overlap detection, and semantic search
- Modified to use near_vector() (Chunk_v2 has no vectorizer)

Scripts:
- 08_fix_summaries_properties.py - Add missing Work metadata to summaries
- 09_rechunk_oversized.py - Re-chunk giant chunks with overlap
- 10_test_search_quality.py - Validate search improvements
- 11_vectorize_missing_chunks.py - Manual vectorization via API

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-08 17:37:49 +01:00
parent ca221887eb
commit 7045907173
7 changed files with 1376 additions and 27 deletions

View File

@@ -31,6 +31,10 @@ services:
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: "true" # ok pour dev/local
PERSISTENCE_DATA_PATH: "/var/lib/weaviate"
CLUSTER_HOSTNAME: "node1"
CLUSTER_GOSSIP_BIND_PORT: "7946"
CLUSTER_DATA_BIND_PORT: "7947"
# Fix for "No private IP address found" error
CLUSTER_JOIN: ""
DEFAULT_VECTORIZER_MODULE: "text2vec-transformers"
ENABLE_MODULES: "text2vec-transformers"
TRANSFORMERS_INFERENCE_API: "http://text2vec-transformers:8080"
@@ -56,6 +60,8 @@ services:
# - Current setup: CPU-only with AVX2 optimization (functional but slower)
image: cr.weaviate.io/semitechnologies/transformers-inference:baai-bge-m3-onnx-latest
restart: on-failure:0
ports:
- "8090:8080" # Expose vectorizer API for manual vectorization
environment:
# ONNX runtime - CPU only (CUDA not supported in ONNX version)
ENABLE_CUDA: "0"

View File

@@ -52,9 +52,15 @@ from .llm_structurer import (
)
from .llm_cleaner import clean_page_markers, is_chunk_valid
from .types import LLMProvider, SemanticChunk
from .llm_chunker_improved import simple_chunk_with_overlap, validate_chunk_size
logger: logging.Logger = logging.getLogger(__name__)
# Chunk size limits (2024-01 optimization)
MAX_CHUNK_WORDS = 1000 # Hard limit to stay within BGE-M3 context
OVERLAP_WORDS = 100 # Overlap for context preservation
FORCE_SIMPLE_CHUNKING_THRESHOLD = 1500 # Words - force simple chunking above this
# =============================================================================
# Type Definitions for LLM Chunker
@@ -221,8 +227,43 @@ def chunk_section_with_llm(
# Nettoyer le contenu
content: str = clean_page_markers(section_content)
# Si le contenu est court, ne pas découper
# Compter les mots
word_count: int = len(content.split())
# FORCE SIMPLE CHUNKING if section is too long (> 1500 words)
# This prevents giant chunks that exceed BGE-M3 limits
if word_count > FORCE_SIMPLE_CHUNKING_THRESHOLD:
logger.warning(
f"Section '{section_title}' is too long ({word_count} words), "
f"forcing simple chunking with overlap"
)
simple_texts = simple_chunk_with_overlap(
content,
max_words=MAX_CHUNK_WORDS,
overlap_words=OVERLAP_WORDS
)
# Convert to SemanticChunk format
result_chunks: List[SemanticChunk] = []
for i, text in enumerate(simple_texts):
para_num = extract_paragraph_number(text)
chunk: SemanticChunk = {
"text": text,
"summary": f"{section_title} (partie {i+1}/{len(simple_texts)})",
"concepts": [],
"type": "main_content",
"section_level": section_level,
}
if para_num is not None:
chunk["paragraph_number"] = para_num
if subsection_title and subsection_title != section_title:
chunk["subsection_title"] = subsection_title
result_chunks.append(chunk)
logger.info(f"Section split into {len(result_chunks)} chunks with overlap")
return result_chunks
# Si le contenu est court, ne pas découper
if word_count < target_chunk_size * 0.8:
para_num: Optional[int] = extract_paragraph_number(content)
chunk: SemanticChunk = {
@@ -320,39 +361,66 @@ RÉPONDS avec un JSON entre <JSON></JSON>:
valid_chunks.append(chunk_data)
# Si aucun chunk valide, retourner le contenu complet
# Si aucun chunk valide, utiliser simple chunking avec overlap
if not valid_chunks:
logger.warning(f"Aucun chunk valide pour '{section_title}', retour contenu complet")
para_num = extract_paragraph_number(content)
fallback: SemanticChunk = {
"text": content,
"summary": section_title,
"concepts": [],
"type": "main_content",
"section_level": section_level,
}
if para_num is not None:
fallback["paragraph_number"] = para_num
return [fallback]
logger.warning(
f"Aucun chunk valide pour '{section_title}', "
f"fallback vers simple chunking avec overlap"
)
simple_texts = simple_chunk_with_overlap(
content,
max_words=MAX_CHUNK_WORDS,
overlap_words=OVERLAP_WORDS
)
fallback_chunks: List[SemanticChunk] = []
for i, text in enumerate(simple_texts):
para_num = extract_paragraph_number(text)
chunk_data: SemanticChunk = {
"text": text,
"summary": f"{section_title} (partie {i+1}/{len(simple_texts)})",
"concepts": [],
"type": "main_content",
"section_level": section_level,
}
if para_num is not None:
chunk_data["paragraph_number"] = para_num
fallback_chunks.append(chunk_data)
logger.info(f"Fallback: section split into {len(fallback_chunks)} chunks")
return fallback_chunks
logger.info(f"Section '{section_title}' découpée en {len(valid_chunks)} chunks")
return valid_chunks
except Exception as e:
logger.error(f"Erreur chunking LLM: {e}")
# Fallback: retourner le contenu complet
para_num = extract_paragraph_number(content)
fallback_err: SemanticChunk = {
"text": content,
"summary": section_title,
"concepts": [],
"type": "main_content",
"section_level": section_level,
"error": str(e),
}
if para_num is not None:
fallback_err["paragraph_number"] = para_num
return [fallback_err]
# Fallback: utiliser simple chunking avec overlap
logger.warning(f"Exception LLM, fallback vers simple chunking avec overlap")
simple_texts = simple_chunk_with_overlap(
content,
max_words=MAX_CHUNK_WORDS,
overlap_words=OVERLAP_WORDS
)
error_chunks: List[SemanticChunk] = []
for i, text in enumerate(simple_texts):
para_num = extract_paragraph_number(text)
chunk_data: SemanticChunk = {
"text": text,
"summary": f"{section_title} (partie {i+1}/{len(simple_texts)})",
"concepts": [],
"type": "main_content",
"section_level": section_level,
"error": f"LLM failed: {str(e)}",
}
if para_num is not None:
chunk_data["paragraph_number"] = para_num
error_chunks.append(chunk_data)
logger.info(f"Error fallback: section split into {len(error_chunks)} chunks")
return error_chunks
def simple_chunk_by_paragraphs(

View File

@@ -0,0 +1,232 @@
"""Improved semantic chunking with strict size limits and overlap.
This module adds strict chunk size constraints (max 1000 words) and overlap
functionality (100 words) to prevent giant chunks that exceed BGE-M3 limits.
Key improvements:
- MAX_CHUNK_WORDS = 1000 (hard limit)
- OVERLAP_WORDS = 100 (context preservation)
- Fallback to simple chunking if section > 1500 words
- Fallback to simple chunking if LLM fails
"""
from __future__ import annotations
import logging
import re
from typing import List, Optional
from .llm_cleaner import clean_page_markers
logger: logging.Logger = logging.getLogger(__name__)
# Constants
MAX_CHUNK_WORDS = 1000 # Hard limit per chunk (~2500 tokens)
OVERLAP_WORDS = 100 # Overlap between chunks for context
MIN_CHUNK_WORDS = 100 # Minimum chunk size
def simple_chunk_with_overlap(
    content: str,
    max_words: int = MAX_CHUNK_WORDS,
    min_words: int = MIN_CHUNK_WORDS,
    overlap_words: int = OVERLAP_WORDS,
) -> List[str]:
    """Split text into chunks with overlap for context preservation.

    This is an improved version of simple_chunk_by_paragraphs that adds
    overlap between consecutive chunks to maintain context.

    Algorithm:
        1. Split by paragraph boundaries (double newlines)
        2. Merge small paragraphs until max_words is reached
        3. Split long paragraphs at sentence boundaries
        4. Add up to overlap_words from the previous chunk to the next chunk
        5. Filter chunks below min_words threshold (never all of them)

    Args:
        content: Text content to split into chunks.
        max_words: Maximum words per chunk. Defaults to 1000.
        min_words: Minimum words per chunk. Defaults to 100.
        overlap_words: Words to overlap between chunks. Defaults to 100.

    Returns:
        List of text chunks as strings with overlap.

    Note:
        A single sentence longer than max_words is kept intact, so an
        individual chunk can still exceed max_words in that edge case.
    """
    content = clean_page_markers(content)

    # Split by paragraphs (one or more blank lines).
    paragraphs: List[str] = re.split(r'\n\n+', content)

    chunks: List[str] = []
    current_chunk: List[str] = []
    current_words: int = 0
    overlap_buffer: List[str] = []  # Last sentences of the previous chunk, reused as overlap

    def finalize_chunk() -> None:
        """Emit the current chunk and refill overlap_buffer from its tail."""
        nonlocal current_chunk, current_words, overlap_buffer
        if not current_chunk:
            return
        chunk_text = '\n\n'.join(current_chunk)
        chunks.append(chunk_text)
        # Collect trailing sentences until the overlap budget is reached.
        sentences = re.split(r'(?<=[.!?])\s+', chunk_text)
        overlap_buffer = []
        overlap_word_count = 0
        for sentence in reversed(sentences):
            sentence_words = len(sentence.split())
            if overlap_word_count + sentence_words <= overlap_words:
                overlap_buffer.insert(0, sentence)
                overlap_word_count += sentence_words
            else:
                break
        current_chunk = []
        current_words = 0

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue
        para_words: int = len(para.split())

        if para_words > max_words:
            # Paragraph alone exceeds the limit: flush what we have, then
            # split the paragraph at sentence boundaries.
            if current_chunk:
                finalize_chunk()
                if overlap_buffer and chunks:
                    current_chunk.extend(overlap_buffer)
                    current_words = sum(len(s.split()) for s in overlap_buffer)
            sentences: List[str] = re.split(r'(?<=[.!?])\s+', para)
            for sentence in sentences:
                sentence_words: int = len(sentence.split())
                if current_words + sentence_words > max_words and current_chunk:
                    finalize_chunk()
                    if overlap_buffer:
                        current_chunk.extend(overlap_buffer)
                        current_words = sum(len(s.split()) for s in overlap_buffer)
                current_chunk.append(sentence)
                current_words += sentence_words
        elif current_words + para_words > max_words:
            # Adding this paragraph would overflow: start a new chunk.
            if current_chunk:
                finalize_chunk()
                if overlap_buffer and chunks:
                    current_chunk.extend(overlap_buffer)
                    current_words = sum(len(s.split()) for s in overlap_buffer)
            current_chunk.append(para)
            current_words += para_words
        else:
            current_chunk.append(para)
            current_words += para_words

    # Emit the final chunk (no overlap needed after the last one).
    if current_chunk:
        chunks.append('\n\n'.join(current_chunk))

    # Filter chunks that are too short (unless it's the only chunk).
    if len(chunks) > 1:
        filtered = [c for c in chunks if len(c.split()) >= min_words]
        # Robustness fix: with pathological parameters (e.g. min_words close
        # to max_words) every chunk could fall below the threshold; never
        # return an empty list when the input produced content.
        if filtered:
            chunks = filtered
    return chunks
def get_chunk_text_with_context(
    chunks: List[str],
    index: int,
    context_words: int = 50
) -> tuple[str, str, str]:
    """Return a chunk together with context windows from its neighbours.

    Args:
        chunks: List of chunk texts.
        index: Index of the chunk to process.
        context_words: Words of context to include from adjacent chunks.

    Returns:
        Tuple of (before_context, chunk_text, after_context). Contexts are
        empty strings at the list boundaries; a neighbour shorter than
        context_words is returned whole.
    """
    def _tail(text: str) -> str:
        # Last context_words words of a neighbour (or the whole text).
        tokens = text.split()
        return " ".join(tokens[-context_words:]) if len(tokens) > context_words else text

    def _head(text: str) -> str:
        # First context_words words of a neighbour (or the whole text).
        tokens = text.split()
        return " ".join(tokens[:context_words]) if len(tokens) > context_words else text

    before_context = _tail(chunks[index - 1]) if index > 0 else ""
    after_context = _head(chunks[index + 1]) if index < len(chunks) - 1 else ""
    return before_context, chunks[index], after_context
def estimate_tokens(text: str) -> int:
    """Roughly estimate the number of tokens in *text*.

    Uses the common heuristic of 1 token ≈ 4 characters.

    Args:
        text: Text to estimate.

    Returns:
        Estimated token count.
    """
    char_count = len(text)
    return char_count // 4
def validate_chunk_size(text: str, max_tokens: int = 2500) -> bool:
    """Check whether a chunk fits within the allowed token budget.

    Args:
        text: Chunk text to validate.
        max_tokens: Maximum allowed tokens (default 2500, a safety margin
            below BGE-M3's 8192-token context).

    Returns:
        True if the chunk is within limits, False otherwise.
    """
    # Same 1 token ≈ 4 characters heuristic as estimate_tokens, inlined.
    estimated = len(text) // 4
    return estimated <= max_tokens
# Public API of this module: the chunking helpers plus the tuning constants,
# re-exported so callers can import the limits alongside the functions.
__all__ = [
    'simple_chunk_with_overlap',
    'get_chunk_text_with_context',
    'estimate_tokens',
    'validate_chunk_size',
    'MAX_CHUNK_WORDS',
    'OVERLAP_WORDS',
    'MIN_CHUNK_WORDS',
]