feat: Optimize chunk sizes with 1000-word limit and overlap

Implemented chunking optimization to resolve oversized chunks and improve
semantic search quality:

CHUNKING IMPROVEMENTS:
- Added strict 1000-word max limit (vs previous 1500-2000)
- Implemented 100-word overlap between consecutive chunks
- Created llm_chunker_improved.py with overlap functionality
- Added 3 fallback points in llm_chunker.py for robustness

RE-CHUNKING RESULTS:
- Identified and re-chunked 31 oversized chunks (>2000 tokens)
- Split into 92 optimally-sized chunks (max 1995 tokens)
- Preserved all metadata (workTitle, workAuthor, sectionPath, etc.)
- 0 chunks now exceed 2000 tokens (vs 31 before)

VECTORIZATION:
- Created manual vectorization script for chunks without vectors
- Successfully vectorized all 92 new chunks (100% coverage)
- All 5,304 chunks now have BGE-M3 embeddings

DOCKER CONFIGURATION:
- Exposed text2vec-transformers port 8090 for manual vectorization
- Added cluster configuration to fix the "No private IP address found" error
- Increased worker timeout to 600s for large chunks

TESTING:
- Created comprehensive search quality test suite
- Tests distribution, overlap detection, and semantic search
- Modified to use near_vector() (Chunk_v2 has no vectorizer)

Scripts:
- 08_fix_summaries_properties.py - Add missing Work metadata to summaries
- 09_rechunk_oversized.py - Re-chunk giant chunks with overlap
- 10_test_search_quality.py - Validate search improvements
- 11_vectorize_missing_chunks.py - Manual vectorization via API

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-08 17:37:49 +01:00
parent ca221887eb
commit 7045907173
7 changed files with 1376 additions and 27 deletions

View File

@@ -31,6 +31,10 @@ services:
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: "true" # ok pour dev/local
PERSISTENCE_DATA_PATH: "/var/lib/weaviate"
CLUSTER_HOSTNAME: "node1"
CLUSTER_GOSSIP_BIND_PORT: "7946"
CLUSTER_DATA_BIND_PORT: "7947"
# Fix for "No private IP address found" error
CLUSTER_JOIN: ""
DEFAULT_VECTORIZER_MODULE: "text2vec-transformers"
ENABLE_MODULES: "text2vec-transformers"
TRANSFORMERS_INFERENCE_API: "http://text2vec-transformers:8080"
@@ -56,6 +60,8 @@ services:
# - Current setup: CPU-only with AVX2 optimization (functional but slower)
image: cr.weaviate.io/semitechnologies/transformers-inference:baai-bge-m3-onnx-latest
restart: on-failure:0
ports:
- "8090:8080" # Expose vectorizer API for manual vectorization
environment:
# ONNX runtime - CPU only (CUDA not supported in ONNX version)
ENABLE_CUDA: "0"

View File

@@ -52,9 +52,15 @@ from .llm_structurer import (
)
from .llm_cleaner import clean_page_markers, is_chunk_valid
from .types import LLMProvider, SemanticChunk
from .llm_chunker_improved import simple_chunk_with_overlap, validate_chunk_size
logger: logging.Logger = logging.getLogger(__name__)
# Chunk size limits (2024-01 optimization)
MAX_CHUNK_WORDS = 1000 # Hard limit to stay within BGE-M3 context
OVERLAP_WORDS = 100 # Overlap for context preservation
FORCE_SIMPLE_CHUNKING_THRESHOLD = 1500 # Words - force simple chunking above this
# =============================================================================
# Type Definitions for LLM Chunker
@@ -221,8 +227,43 @@ def chunk_section_with_llm(
# Nettoyer le contenu
content: str = clean_page_markers(section_content)
# Si le contenu est court, ne pas découper
# Compter les mots
word_count: int = len(content.split())
# FORCE SIMPLE CHUNKING if section is too long (> 1500 words)
# This prevents giant chunks that exceed BGE-M3 limits
if word_count > FORCE_SIMPLE_CHUNKING_THRESHOLD:
logger.warning(
f"Section '{section_title}' is too long ({word_count} words), "
f"forcing simple chunking with overlap"
)
simple_texts = simple_chunk_with_overlap(
content,
max_words=MAX_CHUNK_WORDS,
overlap_words=OVERLAP_WORDS
)
# Convert to SemanticChunk format
result_chunks: List[SemanticChunk] = []
for i, text in enumerate(simple_texts):
para_num = extract_paragraph_number(text)
chunk: SemanticChunk = {
"text": text,
"summary": f"{section_title} (partie {i+1}/{len(simple_texts)})",
"concepts": [],
"type": "main_content",
"section_level": section_level,
}
if para_num is not None:
chunk["paragraph_number"] = para_num
if subsection_title and subsection_title != section_title:
chunk["subsection_title"] = subsection_title
result_chunks.append(chunk)
logger.info(f"Section split into {len(result_chunks)} chunks with overlap")
return result_chunks
# Si le contenu est court, ne pas découper
if word_count < target_chunk_size * 0.8:
para_num: Optional[int] = extract_paragraph_number(content)
chunk: SemanticChunk = {
@@ -320,39 +361,66 @@ RÉPONDS avec un JSON entre <JSON></JSON>:
valid_chunks.append(chunk_data)
# Si aucun chunk valide, retourner le contenu complet
# Si aucun chunk valide, utiliser simple chunking avec overlap
if not valid_chunks:
logger.warning(f"Aucun chunk valide pour '{section_title}', retour contenu complet")
para_num = extract_paragraph_number(content)
fallback: SemanticChunk = {
"text": content,
"summary": section_title,
"concepts": [],
"type": "main_content",
"section_level": section_level,
}
if para_num is not None:
fallback["paragraph_number"] = para_num
return [fallback]
logger.warning(
f"Aucun chunk valide pour '{section_title}', "
f"fallback vers simple chunking avec overlap"
)
simple_texts = simple_chunk_with_overlap(
content,
max_words=MAX_CHUNK_WORDS,
overlap_words=OVERLAP_WORDS
)
fallback_chunks: List[SemanticChunk] = []
for i, text in enumerate(simple_texts):
para_num = extract_paragraph_number(text)
chunk_data: SemanticChunk = {
"text": text,
"summary": f"{section_title} (partie {i+1}/{len(simple_texts)})",
"concepts": [],
"type": "main_content",
"section_level": section_level,
}
if para_num is not None:
chunk_data["paragraph_number"] = para_num
fallback_chunks.append(chunk_data)
logger.info(f"Fallback: section split into {len(fallback_chunks)} chunks")
return fallback_chunks
logger.info(f"Section '{section_title}' découpée en {len(valid_chunks)} chunks")
return valid_chunks
except Exception as e:
logger.error(f"Erreur chunking LLM: {e}")
# Fallback: retourner le contenu complet
para_num = extract_paragraph_number(content)
fallback_err: SemanticChunk = {
"text": content,
"summary": section_title,
"concepts": [],
"type": "main_content",
"section_level": section_level,
"error": str(e),
}
if para_num is not None:
fallback_err["paragraph_number"] = para_num
return [fallback_err]
# Fallback: utiliser simple chunking avec overlap
logger.warning(f"Exception LLM, fallback vers simple chunking avec overlap")
simple_texts = simple_chunk_with_overlap(
content,
max_words=MAX_CHUNK_WORDS,
overlap_words=OVERLAP_WORDS
)
error_chunks: List[SemanticChunk] = []
for i, text in enumerate(simple_texts):
para_num = extract_paragraph_number(text)
chunk_data: SemanticChunk = {
"text": text,
"summary": f"{section_title} (partie {i+1}/{len(simple_texts)})",
"concepts": [],
"type": "main_content",
"section_level": section_level,
"error": f"LLM failed: {str(e)}",
}
if para_num is not None:
chunk_data["paragraph_number"] = para_num
error_chunks.append(chunk_data)
logger.info(f"Error fallback: section split into {len(error_chunks)} chunks")
return error_chunks
def simple_chunk_by_paragraphs(

View File

@@ -0,0 +1,232 @@
"""Improved semantic chunking with strict size limits and overlap.
This module adds strict chunk size constraints (max 1000 words) and overlap
functionality (100 words) to prevent giant chunks that exceed BGE-M3 limits.
Key improvements:
- MAX_CHUNK_WORDS = 1000 (hard limit)
- OVERLAP_WORDS = 100 (context preservation)
- Fallback to simple chunking if section > 1500 words
- Fallback to simple chunking if LLM fails
"""
from __future__ import annotations
import logging
import re
from typing import List, Optional
from .llm_cleaner import clean_page_markers
logger: logging.Logger = logging.getLogger(__name__)
# Constants
MAX_CHUNK_WORDS = 1000 # Hard limit per chunk (~2500 tokens)
OVERLAP_WORDS = 100 # Overlap between chunks for context
MIN_CHUNK_WORDS = 100 # Minimum chunk size
def simple_chunk_with_overlap(
    content: str,
    max_words: int = MAX_CHUNK_WORDS,
    min_words: int = MIN_CHUNK_WORDS,
    overlap_words: int = OVERLAP_WORDS,
) -> List[str]:
    """Split text into chunks with overlap for context preservation.

    This is an improved version of simple_chunk_by_paragraphs that adds
    overlap between consecutive chunks to maintain context.

    Algorithm:
        1. Split by paragraph boundaries (double newlines)
        2. Merge small paragraphs until max_words is reached
        3. Split long paragraphs at sentence boundaries
        4. Add up to overlap_words from the previous chunk to the next chunk
        5. Filter chunks below min_words threshold (never all of them)

    Args:
        content: Text content to split into chunks.
        max_words: Maximum words per chunk. Defaults to 1000.
        min_words: Minimum words per chunk. Defaults to 100.
        overlap_words: Words to overlap between chunks. Defaults to 100.

    Returns:
        List of text chunks as strings with overlap.

    Note:
        A single sentence longer than max_words is kept intact, so an
        individual chunk can still exceed max_words in that edge case.
    """
    content = clean_page_markers(content)

    # Split by paragraphs (one or more blank lines).
    paragraphs: List[str] = re.split(r'\n\n+', content)

    chunks: List[str] = []
    current_chunk: List[str] = []
    current_words: int = 0
    overlap_buffer: List[str] = []  # Last sentences of the previous chunk, reused as overlap

    def finalize_chunk() -> None:
        """Emit the current chunk and refill overlap_buffer from its tail."""
        nonlocal current_chunk, current_words, overlap_buffer
        if not current_chunk:
            return
        chunk_text = '\n\n'.join(current_chunk)
        chunks.append(chunk_text)
        # Collect trailing sentences until the overlap budget is reached.
        sentences = re.split(r'(?<=[.!?])\s+', chunk_text)
        overlap_buffer = []
        overlap_word_count = 0
        for sentence in reversed(sentences):
            sentence_words = len(sentence.split())
            if overlap_word_count + sentence_words <= overlap_words:
                overlap_buffer.insert(0, sentence)
                overlap_word_count += sentence_words
            else:
                break
        current_chunk = []
        current_words = 0

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue
        para_words: int = len(para.split())

        if para_words > max_words:
            # Paragraph alone exceeds the limit: flush what we have, then
            # split the paragraph at sentence boundaries.
            if current_chunk:
                finalize_chunk()
                if overlap_buffer and chunks:
                    current_chunk.extend(overlap_buffer)
                    current_words = sum(len(s.split()) for s in overlap_buffer)
            sentences: List[str] = re.split(r'(?<=[.!?])\s+', para)
            for sentence in sentences:
                sentence_words: int = len(sentence.split())
                if current_words + sentence_words > max_words and current_chunk:
                    finalize_chunk()
                    if overlap_buffer:
                        current_chunk.extend(overlap_buffer)
                        current_words = sum(len(s.split()) for s in overlap_buffer)
                current_chunk.append(sentence)
                current_words += sentence_words
        elif current_words + para_words > max_words:
            # Adding this paragraph would overflow: start a new chunk.
            if current_chunk:
                finalize_chunk()
                if overlap_buffer and chunks:
                    current_chunk.extend(overlap_buffer)
                    current_words = sum(len(s.split()) for s in overlap_buffer)
            current_chunk.append(para)
            current_words += para_words
        else:
            current_chunk.append(para)
            current_words += para_words

    # Emit the final chunk (no overlap needed after the last one).
    if current_chunk:
        chunks.append('\n\n'.join(current_chunk))

    # Filter chunks that are too short (unless it's the only chunk).
    if len(chunks) > 1:
        filtered = [c for c in chunks if len(c.split()) >= min_words]
        # Robustness fix: with pathological parameters (e.g. min_words close
        # to max_words) every chunk could fall below the threshold; never
        # return an empty list when the input produced content.
        if filtered:
            chunks = filtered
    return chunks
def get_chunk_text_with_context(
    chunks: List[str],
    index: int,
    context_words: int = 50
) -> tuple[str, str, str]:
    """Return a chunk together with context windows from its neighbours.

    Args:
        chunks: List of chunk texts.
        index: Index of the chunk to process.
        context_words: Words of context to include from adjacent chunks.

    Returns:
        Tuple of (before_context, chunk_text, after_context). Contexts are
        empty strings at the list boundaries; a neighbour shorter than
        context_words is returned whole.
    """
    def _tail(text: str) -> str:
        # Last context_words words of a neighbour (or the whole text).
        tokens = text.split()
        return " ".join(tokens[-context_words:]) if len(tokens) > context_words else text

    def _head(text: str) -> str:
        # First context_words words of a neighbour (or the whole text).
        tokens = text.split()
        return " ".join(tokens[:context_words]) if len(tokens) > context_words else text

    before_context = _tail(chunks[index - 1]) if index > 0 else ""
    after_context = _head(chunks[index + 1]) if index < len(chunks) - 1 else ""
    return before_context, chunks[index], after_context
def estimate_tokens(text: str) -> int:
    """Roughly estimate the number of tokens in *text*.

    Uses the common heuristic of 1 token ≈ 4 characters.

    Args:
        text: Text to estimate.

    Returns:
        Estimated token count.
    """
    char_count = len(text)
    return char_count // 4
def validate_chunk_size(text: str, max_tokens: int = 2500) -> bool:
    """Check whether a chunk fits within the allowed token budget.

    Args:
        text: Chunk text to validate.
        max_tokens: Maximum allowed tokens (default 2500, a safety margin
            below BGE-M3's 8192-token context).

    Returns:
        True if the chunk is within limits, False otherwise.
    """
    # Same 1 token ≈ 4 characters heuristic as estimate_tokens, inlined.
    estimated = len(text) // 4
    return estimated <= max_tokens
# Public API of this module: the chunking helpers plus the tuning constants,
# re-exported so callers can import the limits alongside the functions.
__all__ = [
    'simple_chunk_with_overlap',
    'get_chunk_text_with_context',
    'estimate_tokens',
    'validate_chunk_size',
    'MAX_CHUNK_WORDS',
    'OVERLAP_WORDS',
    'MIN_CHUNK_WORDS',
]