feat: Optimize chunk sizes with 1000-word limit and overlap
Implemented chunking optimization to resolve oversized chunks and improve semantic search quality:

CHUNKING IMPROVEMENTS:
- Added strict 1000-word max limit (vs previous 1500-2000)
- Implemented 100-word overlap between consecutive chunks
- Created llm_chunker_improved.py with overlap functionality
- Added 3 fallback points in llm_chunker.py for robustness

RE-CHUNKING RESULTS:
- Identified and re-chunked 31 oversized chunks (>2000 tokens)
- Split into 92 optimally-sized chunks (max 1995 tokens)
- Preserved all metadata (workTitle, workAuthor, sectionPath, etc.)
- 0 chunks now exceed 2000 tokens (vs 31 before)

VECTORIZATION:
- Created manual vectorization script for chunks without vectors
- Successfully vectorized all 92 new chunks (100% coverage)
- All 5,304 chunks now have BGE-M3 embeddings

DOCKER CONFIGURATION:
- Exposed text2vec-transformers port 8090 for manual vectorization
- Added cluster configuration to fix "No private IP address found"
- Increased worker timeout to 600s for large chunks

TESTING:
- Created comprehensive search quality test suite
- Tests distribution, overlap detection, and semantic search
- Modified to use near_vector() (Chunk_v2 has no vectorizer)

Scripts:
- 08_fix_summaries_properties.py - Add missing Work metadata to summaries
- 09_rechunk_oversized.py - Re-chunk giant chunks with overlap
- 10_test_search_quality.py - Validate search improvements
- 11_vectorize_missing_chunks.py - Manual vectorization via API

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -52,9 +52,15 @@ from .llm_structurer import (
|
||||
)
|
||||
from .llm_cleaner import clean_page_markers, is_chunk_valid
|
||||
from .types import LLMProvider, SemanticChunk
|
||||
from .llm_chunker_improved import simple_chunk_with_overlap, validate_chunk_size
|
||||
|
||||
logger: logging.Logger = logging.getLogger(__name__)
|
||||
|
||||
# Chunk size limits (2024-01 optimization)
|
||||
MAX_CHUNK_WORDS = 1000 # Hard limit to stay within BGE-M3 context
|
||||
OVERLAP_WORDS = 100 # Overlap for context preservation
|
||||
FORCE_SIMPLE_CHUNKING_THRESHOLD = 1500 # Words - force simple chunking above this
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Type Definitions for LLM Chunker
|
||||
@@ -221,8 +227,43 @@ def chunk_section_with_llm(
|
||||
# Nettoyer le contenu
|
||||
content: str = clean_page_markers(section_content)
|
||||
|
||||
# Si le contenu est court, ne pas découper
|
||||
# Compter les mots
|
||||
word_count: int = len(content.split())
|
||||
|
||||
# FORCE SIMPLE CHUNKING if section is too long (> 1500 words)
|
||||
# This prevents giant chunks that exceed BGE-M3 limits
|
||||
if word_count > FORCE_SIMPLE_CHUNKING_THRESHOLD:
|
||||
logger.warning(
|
||||
f"Section '{section_title}' is too long ({word_count} words), "
|
||||
f"forcing simple chunking with overlap"
|
||||
)
|
||||
simple_texts = simple_chunk_with_overlap(
|
||||
content,
|
||||
max_words=MAX_CHUNK_WORDS,
|
||||
overlap_words=OVERLAP_WORDS
|
||||
)
|
||||
|
||||
# Convert to SemanticChunk format
|
||||
result_chunks: List[SemanticChunk] = []
|
||||
for i, text in enumerate(simple_texts):
|
||||
para_num = extract_paragraph_number(text)
|
||||
chunk: SemanticChunk = {
|
||||
"text": text,
|
||||
"summary": f"{section_title} (partie {i+1}/{len(simple_texts)})",
|
||||
"concepts": [],
|
||||
"type": "main_content",
|
||||
"section_level": section_level,
|
||||
}
|
||||
if para_num is not None:
|
||||
chunk["paragraph_number"] = para_num
|
||||
if subsection_title and subsection_title != section_title:
|
||||
chunk["subsection_title"] = subsection_title
|
||||
result_chunks.append(chunk)
|
||||
|
||||
logger.info(f"Section split into {len(result_chunks)} chunks with overlap")
|
||||
return result_chunks
|
||||
|
||||
# Si le contenu est court, ne pas découper
|
||||
if word_count < target_chunk_size * 0.8:
|
||||
para_num: Optional[int] = extract_paragraph_number(content)
|
||||
chunk: SemanticChunk = {
|
||||
@@ -320,39 +361,66 @@ RÉPONDS avec un JSON entre <JSON></JSON>:
|
||||
|
||||
valid_chunks.append(chunk_data)
|
||||
|
||||
# Si aucun chunk valide, retourner le contenu complet
|
||||
# Si aucun chunk valide, utiliser simple chunking avec overlap
|
||||
if not valid_chunks:
|
||||
logger.warning(f"Aucun chunk valide pour '{section_title}', retour contenu complet")
|
||||
para_num = extract_paragraph_number(content)
|
||||
fallback: SemanticChunk = {
|
||||
"text": content,
|
||||
"summary": section_title,
|
||||
"concepts": [],
|
||||
"type": "main_content",
|
||||
"section_level": section_level,
|
||||
}
|
||||
if para_num is not None:
|
||||
fallback["paragraph_number"] = para_num
|
||||
return [fallback]
|
||||
logger.warning(
|
||||
f"Aucun chunk valide pour '{section_title}', "
|
||||
f"fallback vers simple chunking avec overlap"
|
||||
)
|
||||
simple_texts = simple_chunk_with_overlap(
|
||||
content,
|
||||
max_words=MAX_CHUNK_WORDS,
|
||||
overlap_words=OVERLAP_WORDS
|
||||
)
|
||||
|
||||
fallback_chunks: List[SemanticChunk] = []
|
||||
for i, text in enumerate(simple_texts):
|
||||
para_num = extract_paragraph_number(text)
|
||||
chunk_data: SemanticChunk = {
|
||||
"text": text,
|
||||
"summary": f"{section_title} (partie {i+1}/{len(simple_texts)})",
|
||||
"concepts": [],
|
||||
"type": "main_content",
|
||||
"section_level": section_level,
|
||||
}
|
||||
if para_num is not None:
|
||||
chunk_data["paragraph_number"] = para_num
|
||||
fallback_chunks.append(chunk_data)
|
||||
|
||||
logger.info(f"Fallback: section split into {len(fallback_chunks)} chunks")
|
||||
return fallback_chunks
|
||||
|
||||
logger.info(f"Section '{section_title}' découpée en {len(valid_chunks)} chunks")
|
||||
return valid_chunks
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Erreur chunking LLM: {e}")
|
||||
# Fallback: retourner le contenu complet
|
||||
para_num = extract_paragraph_number(content)
|
||||
fallback_err: SemanticChunk = {
|
||||
"text": content,
|
||||
"summary": section_title,
|
||||
"concepts": [],
|
||||
"type": "main_content",
|
||||
"section_level": section_level,
|
||||
"error": str(e),
|
||||
}
|
||||
if para_num is not None:
|
||||
fallback_err["paragraph_number"] = para_num
|
||||
return [fallback_err]
|
||||
# Fallback: utiliser simple chunking avec overlap
|
||||
logger.warning(f"Exception LLM, fallback vers simple chunking avec overlap")
|
||||
|
||||
simple_texts = simple_chunk_with_overlap(
|
||||
content,
|
||||
max_words=MAX_CHUNK_WORDS,
|
||||
overlap_words=OVERLAP_WORDS
|
||||
)
|
||||
|
||||
error_chunks: List[SemanticChunk] = []
|
||||
for i, text in enumerate(simple_texts):
|
||||
para_num = extract_paragraph_number(text)
|
||||
chunk_data: SemanticChunk = {
|
||||
"text": text,
|
||||
"summary": f"{section_title} (partie {i+1}/{len(simple_texts)})",
|
||||
"concepts": [],
|
||||
"type": "main_content",
|
||||
"section_level": section_level,
|
||||
"error": f"LLM failed: {str(e)}",
|
||||
}
|
||||
if para_num is not None:
|
||||
chunk_data["paragraph_number"] = para_num
|
||||
error_chunks.append(chunk_data)
|
||||
|
||||
logger.info(f"Error fallback: section split into {len(error_chunks)} chunks")
|
||||
return error_chunks
|
||||
|
||||
|
||||
def simple_chunk_by_paragraphs(
|
||||
|
||||
Reference in New Issue
Block a user