feat: Optimize chunk sizes with 1000-word limit and overlap
Implemented chunking optimization to resolve oversized chunks and improve semantic search quality: CHUNKING IMPROVEMENTS: - Added strict 1000-word max limit (vs previous 1500-2000) - Implemented 100-word overlap between consecutive chunks - Created llm_chunker_improved.py with overlap functionality - Added 3 fallback points in llm_chunker.py for robustness RE-CHUNKING RESULTS: - Identified and re-chunked 31 oversized chunks (>2000 tokens) - Split into 92 optimally-sized chunks (max 1995 tokens) - Preserved all metadata (workTitle, workAuthor, sectionPath, etc.) - 0 chunks now exceed 2000 tokens (vs 31 before) VECTORIZATION: - Created manual vectorization script for chunks without vectors - Successfully vectorized all 92 new chunks (100% coverage) - All 5,304 chunks now have BGE-M3 embeddings DOCKER CONFIGURATION: - Exposed text2vec-transformers port 8090 for manual vectorization - Added cluster configuration to fix "No private IP address found" - Increased worker timeout to 600s for large chunks TESTING: - Created comprehensive search quality test suite - Tests distribution, overlap detection, and semantic search - Modified to use near_vector() (Chunk_v2 has no vectorizer) Scripts: - 08_fix_summaries_properties.py - Add missing Work metadata to summaries - 09_rechunk_oversized.py - Re-chunk giant chunks with overlap - 10_test_search_quality.py - Validate search improvements - 11_vectorize_missing_chunks.py - Manual vectorization via API Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -31,6 +31,10 @@ services:
|
||||
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: "true" # ok pour dev/local
|
||||
PERSISTENCE_DATA_PATH: "/var/lib/weaviate"
|
||||
CLUSTER_HOSTNAME: "node1"
|
||||
CLUSTER_GOSSIP_BIND_PORT: "7946"
|
||||
CLUSTER_DATA_BIND_PORT: "7947"
|
||||
# Fix for "No private IP address found" error
|
||||
CLUSTER_JOIN: ""
|
||||
DEFAULT_VECTORIZER_MODULE: "text2vec-transformers"
|
||||
ENABLE_MODULES: "text2vec-transformers"
|
||||
TRANSFORMERS_INFERENCE_API: "http://text2vec-transformers:8080"
|
||||
@@ -56,6 +60,8 @@ services:
|
||||
# - Current setup: CPU-only with AVX2 optimization (functional but slower)
|
||||
image: cr.weaviate.io/semitechnologies/transformers-inference:baai-bge-m3-onnx-latest
|
||||
restart: on-failure:0
|
||||
ports:
|
||||
- "8090:8080" # Expose vectorizer API for manual vectorization
|
||||
environment:
|
||||
# ONNX runtime - CPU only (CUDA not supported in ONNX version)
|
||||
ENABLE_CUDA: "0"
|
||||
|
||||
@@ -52,9 +52,15 @@ from .llm_structurer import (
|
||||
)
|
||||
from .llm_cleaner import clean_page_markers, is_chunk_valid
|
||||
from .types import LLMProvider, SemanticChunk
|
||||
from .llm_chunker_improved import simple_chunk_with_overlap, validate_chunk_size
|
||||
|
||||
logger: logging.Logger = logging.getLogger(__name__)
|
||||
|
||||
# Chunk size limits (2024-01 optimization)
|
||||
MAX_CHUNK_WORDS = 1000 # Hard limit to stay within BGE-M3 context
|
||||
OVERLAP_WORDS = 100 # Overlap for context preservation
|
||||
FORCE_SIMPLE_CHUNKING_THRESHOLD = 1500 # Words - force simple chunking above this
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Type Definitions for LLM Chunker
|
||||
@@ -221,8 +227,43 @@ def chunk_section_with_llm(
|
||||
# Nettoyer le contenu
|
||||
content: str = clean_page_markers(section_content)
|
||||
|
||||
# Si le contenu est court, ne pas découper
|
||||
# Compter les mots
|
||||
word_count: int = len(content.split())
|
||||
|
||||
# FORCE SIMPLE CHUNKING if section is too long (> 1500 words)
|
||||
# This prevents giant chunks that exceed BGE-M3 limits
|
||||
if word_count > FORCE_SIMPLE_CHUNKING_THRESHOLD:
|
||||
logger.warning(
|
||||
f"Section '{section_title}' is too long ({word_count} words), "
|
||||
f"forcing simple chunking with overlap"
|
||||
)
|
||||
simple_texts = simple_chunk_with_overlap(
|
||||
content,
|
||||
max_words=MAX_CHUNK_WORDS,
|
||||
overlap_words=OVERLAP_WORDS
|
||||
)
|
||||
|
||||
# Convert to SemanticChunk format
|
||||
result_chunks: List[SemanticChunk] = []
|
||||
for i, text in enumerate(simple_texts):
|
||||
para_num = extract_paragraph_number(text)
|
||||
chunk: SemanticChunk = {
|
||||
"text": text,
|
||||
"summary": f"{section_title} (partie {i+1}/{len(simple_texts)})",
|
||||
"concepts": [],
|
||||
"type": "main_content",
|
||||
"section_level": section_level,
|
||||
}
|
||||
if para_num is not None:
|
||||
chunk["paragraph_number"] = para_num
|
||||
if subsection_title and subsection_title != section_title:
|
||||
chunk["subsection_title"] = subsection_title
|
||||
result_chunks.append(chunk)
|
||||
|
||||
logger.info(f"Section split into {len(result_chunks)} chunks with overlap")
|
||||
return result_chunks
|
||||
|
||||
# Si le contenu est court, ne pas découper
|
||||
if word_count < target_chunk_size * 0.8:
|
||||
para_num: Optional[int] = extract_paragraph_number(content)
|
||||
chunk: SemanticChunk = {
|
||||
@@ -320,39 +361,66 @@ RÉPONDS avec un JSON entre <JSON></JSON>:
|
||||
|
||||
valid_chunks.append(chunk_data)
|
||||
|
||||
# Si aucun chunk valide, retourner le contenu complet
|
||||
# Si aucun chunk valide, utiliser simple chunking avec overlap
|
||||
if not valid_chunks:
|
||||
logger.warning(f"Aucun chunk valide pour '{section_title}', retour contenu complet")
|
||||
para_num = extract_paragraph_number(content)
|
||||
fallback: SemanticChunk = {
|
||||
"text": content,
|
||||
"summary": section_title,
|
||||
"concepts": [],
|
||||
"type": "main_content",
|
||||
"section_level": section_level,
|
||||
}
|
||||
if para_num is not None:
|
||||
fallback["paragraph_number"] = para_num
|
||||
return [fallback]
|
||||
logger.warning(
|
||||
f"Aucun chunk valide pour '{section_title}', "
|
||||
f"fallback vers simple chunking avec overlap"
|
||||
)
|
||||
simple_texts = simple_chunk_with_overlap(
|
||||
content,
|
||||
max_words=MAX_CHUNK_WORDS,
|
||||
overlap_words=OVERLAP_WORDS
|
||||
)
|
||||
|
||||
fallback_chunks: List[SemanticChunk] = []
|
||||
for i, text in enumerate(simple_texts):
|
||||
para_num = extract_paragraph_number(text)
|
||||
chunk_data: SemanticChunk = {
|
||||
"text": text,
|
||||
"summary": f"{section_title} (partie {i+1}/{len(simple_texts)})",
|
||||
"concepts": [],
|
||||
"type": "main_content",
|
||||
"section_level": section_level,
|
||||
}
|
||||
if para_num is not None:
|
||||
chunk_data["paragraph_number"] = para_num
|
||||
fallback_chunks.append(chunk_data)
|
||||
|
||||
logger.info(f"Fallback: section split into {len(fallback_chunks)} chunks")
|
||||
return fallback_chunks
|
||||
|
||||
logger.info(f"Section '{section_title}' découpée en {len(valid_chunks)} chunks")
|
||||
return valid_chunks
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Erreur chunking LLM: {e}")
|
||||
# Fallback: retourner le contenu complet
|
||||
para_num = extract_paragraph_number(content)
|
||||
fallback_err: SemanticChunk = {
|
||||
"text": content,
|
||||
"summary": section_title,
|
||||
"concepts": [],
|
||||
"type": "main_content",
|
||||
"section_level": section_level,
|
||||
"error": str(e),
|
||||
}
|
||||
if para_num is not None:
|
||||
fallback_err["paragraph_number"] = para_num
|
||||
return [fallback_err]
|
||||
# Fallback: utiliser simple chunking avec overlap
|
||||
logger.warning(f"Exception LLM, fallback vers simple chunking avec overlap")
|
||||
|
||||
simple_texts = simple_chunk_with_overlap(
|
||||
content,
|
||||
max_words=MAX_CHUNK_WORDS,
|
||||
overlap_words=OVERLAP_WORDS
|
||||
)
|
||||
|
||||
error_chunks: List[SemanticChunk] = []
|
||||
for i, text in enumerate(simple_texts):
|
||||
para_num = extract_paragraph_number(text)
|
||||
chunk_data: SemanticChunk = {
|
||||
"text": text,
|
||||
"summary": f"{section_title} (partie {i+1}/{len(simple_texts)})",
|
||||
"concepts": [],
|
||||
"type": "main_content",
|
||||
"section_level": section_level,
|
||||
"error": f"LLM failed: {str(e)}",
|
||||
}
|
||||
if para_num is not None:
|
||||
chunk_data["paragraph_number"] = para_num
|
||||
error_chunks.append(chunk_data)
|
||||
|
||||
logger.info(f"Error fallback: section split into {len(error_chunks)} chunks")
|
||||
return error_chunks
|
||||
|
||||
|
||||
def simple_chunk_by_paragraphs(
|
||||
|
||||
232
generations/library_rag/utils/llm_chunker_improved.py
Normal file
232
generations/library_rag/utils/llm_chunker_improved.py
Normal file
@@ -0,0 +1,232 @@
|
||||
"""Improved semantic chunking with strict size limits and overlap.
|
||||
|
||||
This module adds strict chunk size constraints (max 1000 words) and overlap
|
||||
functionality (100 words) to prevent giant chunks that exceed BGE-M3 limits.
|
||||
|
||||
Key improvements:
|
||||
- MAX_CHUNK_WORDS = 1000 (hard limit)
|
||||
- OVERLAP_WORDS = 100 (context preservation)
|
||||
- Fallback to simple chunking if section > 1500 words
|
||||
- Fallback to simple chunking if LLM fails
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import List, Optional
|
||||
|
||||
from .llm_cleaner import clean_page_markers
|
||||
|
||||
logger: logging.Logger = logging.getLogger(__name__)
|
||||
|
||||
# Constants
|
||||
MAX_CHUNK_WORDS = 1000 # Hard limit per chunk (~2500 tokens)
|
||||
OVERLAP_WORDS = 100 # Overlap between chunks for context
|
||||
MIN_CHUNK_WORDS = 100 # Minimum chunk size
|
||||
|
||||
|
||||
def simple_chunk_with_overlap(
    content: str,
    max_words: int = MAX_CHUNK_WORDS,
    min_words: int = MIN_CHUNK_WORDS,
    overlap_words: int = OVERLAP_WORDS,
) -> List[str]:
    """Split text into chunks with overlap for context preservation.

    This is an improved version of simple_chunk_by_paragraphs that adds
    overlap between consecutive chunks to maintain context.

    Algorithm:
        1. Split by paragraph boundaries (double newlines)
        2. Merge small paragraphs until max_words is reached
        3. Split long paragraphs at sentence boundaries
        4. Add overlap_words from previous chunk to next chunk
        5. Filter chunks below min_words threshold

    Args:
        content: Text content to split into chunks.
        max_words: Maximum words per chunk. Defaults to 1000.
        min_words: Minimum words per chunk. Defaults to 100.
        overlap_words: Words to overlap between chunks. Defaults to 100.

    Returns:
        List of text chunks as strings with overlap. Never empty when the
        cleaned content is non-empty: if the min_words filter would discard
        every chunk, the longest chunk is kept instead.

    Example:
        >>> chunks = simple_chunk_with_overlap(text, max_words=1000, overlap_words=100)
        >>> # Each chunk overlaps with 100 words from previous chunk
    """
    content = clean_page_markers(content)

    # Split by paragraphs (one or more blank lines).
    paragraphs: List[str] = re.split(r'\n\n+', content)

    chunks: List[str] = []
    current_chunk: List[str] = []
    current_words: int = 0
    overlap_buffer: List[str] = []  # Trailing sentences of the last emitted chunk

    def finalize_chunk() -> None:
        """Emit the current chunk and refill the overlap buffer from its tail."""
        nonlocal current_chunk, current_words, overlap_buffer

        if not current_chunk:
            return

        chunk_text = '\n\n'.join(current_chunk)
        chunks.append(chunk_text)

        # Collect trailing sentences, newest last, until overlap_words is reached.
        # A single sentence longer than overlap_words yields an empty buffer.
        sentences = re.split(r'(?<=[.!?])\s+', chunk_text)
        overlap_buffer = []
        overlap_word_count = 0
        for sentence in reversed(sentences):
            sentence_words = len(sentence.split())
            if overlap_word_count + sentence_words <= overlap_words:
                overlap_buffer.insert(0, sentence)
                overlap_word_count += sentence_words
            else:
                break

        current_chunk = []
        current_words = 0

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue

        para_words: int = len(para.split())

        if para_words > max_words:
            # Paragraph alone exceeds the limit: flush what we have, then
            # split the paragraph at sentence boundaries.
            if current_chunk:
                finalize_chunk()

            # Seed the new chunk with overlap from the previous one (if any).
            if overlap_buffer and chunks:
                current_chunk.extend(overlap_buffer)
                current_words = sum(len(s.split()) for s in overlap_buffer)

            sentences: List[str] = re.split(r'(?<=[.!?])\s+', para)
            for sentence in sentences:
                sentence_words: int = len(sentence.split())

                if current_words + sentence_words > max_words and current_chunk:
                    finalize_chunk()

                    # Same overlap seeding as above; `chunks` is guaranteed
                    # non-empty here since finalize_chunk just emitted one.
                    if overlap_buffer and chunks:
                        current_chunk.extend(overlap_buffer)
                        current_words = sum(len(s.split()) for s in overlap_buffer)

                    current_chunk.append(sentence)
                    current_words += sentence_words
                else:
                    current_chunk.append(sentence)
                    current_words += sentence_words

        elif current_words + para_words > max_words:
            # Adding this paragraph would exceed the limit: flush first.
            if current_chunk:
                finalize_chunk()

            if overlap_buffer and chunks:
                current_chunk.extend(overlap_buffer)
                current_words = sum(len(s.split()) for s in overlap_buffer)

            current_chunk.append(para)
            current_words += para_words

        else:
            current_chunk.append(para)
            current_words += para_words

    # Emit the trailing chunk (no overlap bookkeeping needed after the last one).
    if current_chunk:
        chunk_text = '\n\n'.join(current_chunk)
        chunks.append(chunk_text)

    # Filter chunks that are too short (unless it's the only chunk).
    if len(chunks) > 1:
        filtered = [c for c in chunks if len(c.split()) >= min_words]
        # BUGFIX: never return an empty list — if every chunk fell below
        # min_words, keep the longest one rather than silently losing content.
        chunks = filtered if filtered else [max(chunks, key=lambda c: len(c.split()))]

    return chunks
|
||||
|
||||
|
||||
def get_chunk_text_with_context(
    chunks: List[str],
    index: int,
    context_words: int = 50
) -> tuple[str, str, str]:
    """Get chunk with before/after context for better LLM processing.

    Args:
        chunks: List of chunk texts.
        index: Index of the chunk to process.
        context_words: Words of context to include from adjacent chunks.

    Returns:
        Tuple of (before_context, chunk_text, after_context). Context strings
        are empty at the list boundaries; a neighbor shorter than
        context_words is returned verbatim.
    """
    def _last_words(text: str) -> str:
        # Tail of the previous chunk, capped at context_words.
        tokens = text.split()
        if len(tokens) > context_words:
            return " ".join(tokens[-context_words:])
        return text

    def _first_words(text: str) -> str:
        # Head of the next chunk, capped at context_words.
        tokens = text.split()
        if len(tokens) > context_words:
            return " ".join(tokens[:context_words])
        return text

    before_context = _last_words(chunks[index - 1]) if index > 0 else ""
    after_context = (
        _first_words(chunks[index + 1]) if index < len(chunks) - 1 else ""
    )

    return before_context, chunks[index], after_context
|
||||
|
||||
|
||||
def estimate_tokens(text: str) -> int:
    """Estimate token count from text.

    Uses approximation of 1 token ≈ 4 characters.

    Args:
        text: Text to estimate.

    Returns:
        Estimated token count (floor division, so short strings may yield 0).
    """
    chars_per_token = 4  # Rough heuristic for BPE-style tokenizers
    return len(text) // chars_per_token
|
||||
|
||||
|
||||
def validate_chunk_size(text: str, max_tokens: int = 2500) -> bool:
    """Validate that chunk size is within acceptable limits.

    Args:
        text: Chunk text to validate.
        max_tokens: Maximum allowed tokens (default 2500 for safety margin
            below BGE-M3's 8192).

    Returns:
        True if chunk is valid size, False otherwise.
    """
    # Single expression: compare the heuristic estimate against the budget.
    return estimate_tokens(text) <= max_tokens
|
||||
|
||||
|
||||
# Export key functions
|
||||
__all__ = [
|
||||
'simple_chunk_with_overlap',
|
||||
'get_chunk_text_with_context',
|
||||
'estimate_tokens',
|
||||
'validate_chunk_size',
|
||||
'MAX_CHUNK_WORDS',
|
||||
'OVERLAP_WORDS',
|
||||
'MIN_CHUNK_WORDS',
|
||||
]
|
||||
Reference in New Issue
Block a user