feat: Optimize chunk sizes with 1000-word limit and overlap
Implemented chunking optimization to resolve oversized chunks and improve semantic search quality: CHUNKING IMPROVEMENTS: - Added strict 1000-word max limit (vs previous 1500-2000) - Implemented 100-word overlap between consecutive chunks - Created llm_chunker_improved.py with overlap functionality - Added 3 fallback points in llm_chunker.py for robustness RE-CHUNKING RESULTS: - Identified and re-chunked 31 oversized chunks (>2000 tokens) - Split into 92 optimally-sized chunks (max 1995 tokens) - Preserved all metadata (workTitle, workAuthor, sectionPath, etc.) - 0 chunks now exceed 2000 tokens (vs 31 before) VECTORIZATION: - Created manual vectorization script for chunks without vectors - Successfully vectorized all 92 new chunks (100% coverage) - All 5,304 chunks now have BGE-M3 embeddings DOCKER CONFIGURATION: - Exposed text2vec-transformers port 8090 for manual vectorization - Added cluster configuration to fix "No private IP address found" - Increased worker timeout to 600s for large chunks TESTING: - Created comprehensive search quality test suite - Tests distribution, overlap detection, and semantic search - Modified to use near_vector() (Chunk_v2 has no vectorizer) Scripts: - 08_fix_summaries_properties.py - Add missing Work metadata to summaries - 09_rechunk_oversized.py - Re-chunk giant chunks with overlap - 10_test_search_quality.py - Validate search improvements - 11_vectorize_missing_chunks.py - Manual vectorization via API Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
157
08_fix_summaries_properties.py
Normal file
157
08_fix_summaries_properties.py
Normal file
@@ -0,0 +1,157 @@
|
|||||||
|
"""Correctif: Ajouter workAuthor, year, language aux Summary_v2."""
|
||||||
|
|
||||||
|
import weaviate
|
||||||
|
import sys
|
||||||
|
|
||||||
|
if sys.stdout.encoding != 'utf-8':
|
||||||
|
sys.stdout.reconfigure(encoding='utf-8')
|
||||||
|
|
||||||
|
# Try to import tqdm
|
||||||
|
try:
|
||||||
|
from tqdm import tqdm
|
||||||
|
HAS_TQDM = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_TQDM = False
|
||||||
|
|
||||||
|
client = weaviate.connect_to_local()
|
||||||
|
|
||||||
|
try:
|
||||||
|
print("=" * 80)
|
||||||
|
print("CORRECTIF: AJOUTER workAuthor, year, language À SUMMARY_V2")
|
||||||
|
print("=" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
summary_v2 = client.collections.get("Summary_v2")
|
||||||
|
work_collection = client.collections.get("Work")
|
||||||
|
|
||||||
|
# Build workTitle → Work metadata map
|
||||||
|
print("Étape 1: Mapping workTitle → Work metadata")
|
||||||
|
print("-" * 80)
|
||||||
|
|
||||||
|
work_map = {}
|
||||||
|
|
||||||
|
for work in work_collection.iterator(include_vector=False):
|
||||||
|
props = work.properties
|
||||||
|
title = props.get("title")
|
||||||
|
if title:
|
||||||
|
work_map[title] = {
|
||||||
|
"author": props.get("author", "Unknown"),
|
||||||
|
"year": props.get("year", 0),
|
||||||
|
"language": props.get("language", "en"),
|
||||||
|
}
|
||||||
|
|
||||||
|
print(f"✓ {len(work_map)} mappings workTitle → metadata")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Count total summaries
|
||||||
|
print("Étape 2: Comptage summaries")
|
||||||
|
print("-" * 80)
|
||||||
|
|
||||||
|
print("Comptage en cours...")
|
||||||
|
total_summaries = sum(1 for _ in summary_v2.iterator(include_vector=False))
|
||||||
|
|
||||||
|
print(f"✓ {total_summaries} summaries à corriger")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Update summaries
|
||||||
|
print("Étape 3: Mise à jour des propriétés")
|
||||||
|
print("-" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
updated = 0
|
||||||
|
skipped = 0
|
||||||
|
errors = []
|
||||||
|
|
||||||
|
# Create iterator with or without tqdm
|
||||||
|
if HAS_TQDM:
|
||||||
|
iterator = tqdm(
|
||||||
|
summary_v2.iterator(include_vector=False),
|
||||||
|
total=total_summaries,
|
||||||
|
desc="Mise à jour",
|
||||||
|
unit="summaries"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
iterator = summary_v2.iterator(include_vector=False)
|
||||||
|
print("Mise à jour en cours...")
|
||||||
|
|
||||||
|
for idx, summary in enumerate(iterator, 1):
|
||||||
|
props = summary.properties
|
||||||
|
|
||||||
|
try:
|
||||||
|
work_title = props.get("workTitle")
|
||||||
|
|
||||||
|
if not work_title:
|
||||||
|
errors.append(f"Summary {summary.uuid}: pas de workTitle")
|
||||||
|
skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Get work metadata
|
||||||
|
work_metadata = work_map.get(work_title)
|
||||||
|
if not work_metadata:
|
||||||
|
errors.append(f"Summary {summary.uuid}: Work '{work_title}' introuvable")
|
||||||
|
skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check if already updated (workAuthor exists)
|
||||||
|
if props.get("workAuthor") is not None:
|
||||||
|
skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Update properties
|
||||||
|
summary_v2.data.update(
|
||||||
|
uuid=summary.uuid,
|
||||||
|
properties={
|
||||||
|
"workAuthor": work_metadata["author"],
|
||||||
|
"year": work_metadata["year"],
|
||||||
|
"language": work_metadata["language"],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
updated += 1
|
||||||
|
|
||||||
|
# Progress without tqdm
|
||||||
|
if not HAS_TQDM and idx % 10 == 0:
|
||||||
|
print(f" {idx}/{total_summaries} summaries traités...")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
errors.append(f"Summary {summary.uuid}: {e}")
|
||||||
|
|
||||||
|
print()
|
||||||
|
print("-" * 80)
|
||||||
|
print(f"✓ Total mis à jour: {updated}/{total_summaries}")
|
||||||
|
print(f" Déjà à jour: {skipped}")
|
||||||
|
|
||||||
|
if errors:
|
||||||
|
print(f"⚠️ Erreurs rencontrées: {len(errors)}")
|
||||||
|
print()
|
||||||
|
print("Premières erreurs:")
|
||||||
|
for err in errors[:10]:
|
||||||
|
print(f" - {err}")
|
||||||
|
if len(errors) > 10:
|
||||||
|
print(f" ... et {len(errors) - 10} autres")
|
||||||
|
|
||||||
|
print()
|
||||||
|
print("=" * 80)
|
||||||
|
print("CORRECTIF TERMINÉ")
|
||||||
|
print("=" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
if updated == total_summaries:
|
||||||
|
print("✅ Tous les summaries ont été mis à jour")
|
||||||
|
print()
|
||||||
|
print("Propriétés ajoutées:")
|
||||||
|
print(" ✓ workAuthor (auteur de l'œuvre)")
|
||||||
|
print(" ✓ year (année de publication)")
|
||||||
|
print(" ✓ language (langue du texte)")
|
||||||
|
print()
|
||||||
|
print("VÉRIFICATION:")
|
||||||
|
print(" python -c \"from verify_summaries import verify; verify()\"")
|
||||||
|
elif updated > 0:
|
||||||
|
print(f"⚠️ {updated}/{total_summaries} summaries mis à jour")
|
||||||
|
print(" Vérifier les erreurs")
|
||||||
|
else:
|
||||||
|
print("❌ Aucun summary mis à jour")
|
||||||
|
print(" Corriger les erreurs et relancer")
|
||||||
|
|
||||||
|
finally:
|
||||||
|
client.close()
|
||||||
267
09_rechunk_oversized.py
Normal file
267
09_rechunk_oversized.py
Normal file
@@ -0,0 +1,267 @@
|
|||||||
|
"""Script to re-chunk oversized chunks (> 2000 tokens) in Chunk_v2.
|
||||||
|
|
||||||
|
This script identifies chunks that are too large (> 2000 tokens) and splits them
|
||||||
|
into smaller chunks with overlap (max 1000 words, overlap 100 words).
|
||||||
|
|
||||||
|
Steps:
|
||||||
|
1. Identify all chunks > 2000 tokens in Chunk_v2
|
||||||
|
2. Re-chunk using simple_chunk_with_overlap (1000 words max, 100 overlap)
|
||||||
|
3. Delete the original oversized chunk
|
||||||
|
4. Insert new smaller chunks with preserved metadata
|
||||||
|
5. Update Summary_v2 chunksCount if needed
|
||||||
|
"""
|
||||||
|
|
||||||
|
import weaviate
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add utils to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))
|
||||||
|
|
||||||
|
from utils.llm_chunker_improved import simple_chunk_with_overlap, estimate_tokens
|
||||||
|
|
||||||
|
if sys.stdout.encoding != 'utf-8':
|
||||||
|
sys.stdout.reconfigure(encoding='utf-8')
|
||||||
|
|
||||||
|
# Try to import tqdm
|
||||||
|
try:
|
||||||
|
from tqdm import tqdm
|
||||||
|
HAS_TQDM = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_TQDM = False
|
||||||
|
|
||||||
|
# Constants
|
||||||
|
TOKEN_THRESHOLD = 2000 # Chunks > 2000 tokens will be re-chunked
|
||||||
|
MAX_WORDS = 1000
|
||||||
|
OVERLAP_WORDS = 100
|
||||||
|
|
||||||
|
client = weaviate.connect_to_local()
|
||||||
|
|
||||||
|
try:
|
||||||
|
print("=" * 80)
|
||||||
|
print("RE-CHUNKING DES CHUNKS SURDIMENSIONNÉS")
|
||||||
|
print("=" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
chunk_v2 = client.collections.get("Chunk_v2")
|
||||||
|
work_collection = client.collections.get("Work")
|
||||||
|
|
||||||
|
# ========== 1. IDENTIFIER LES CHUNKS PROBLÉMATIQUES ==========
|
||||||
|
print("1. IDENTIFICATION DES CHUNKS > 2000 TOKENS")
|
||||||
|
print("-" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
oversized_chunks = []
|
||||||
|
|
||||||
|
print("Analyse en cours...")
|
||||||
|
for chunk in chunk_v2.iterator(include_vector=False):
|
||||||
|
props = chunk.properties
|
||||||
|
text = props.get('text', '')
|
||||||
|
tokens = estimate_tokens(text)
|
||||||
|
|
||||||
|
if tokens > TOKEN_THRESHOLD:
|
||||||
|
oversized_chunks.append({
|
||||||
|
'uuid': str(chunk.uuid),
|
||||||
|
'tokens': tokens,
|
||||||
|
'chars': len(text),
|
||||||
|
'text': text,
|
||||||
|
'workTitle': props.get('workTitle', ''),
|
||||||
|
'workAuthor': props.get('workAuthor', ''),
|
||||||
|
'year': props.get('year', 0),
|
||||||
|
'language': props.get('language', 'en'),
|
||||||
|
'sectionPath': props.get('sectionPath', ''),
|
||||||
|
'chapterTitle': props.get('chapterTitle', ''),
|
||||||
|
'canonicalReference': props.get('canonicalReference', ''),
|
||||||
|
'unitType': props.get('unitType', 'main_content'),
|
||||||
|
'keywords': props.get('keywords', []),
|
||||||
|
'orderIndex': props.get('orderIndex', 0),
|
||||||
|
})
|
||||||
|
|
||||||
|
print(f"✓ {len(oversized_chunks)} chunks > {TOKEN_THRESHOLD} tokens trouvés")
|
||||||
|
print()
|
||||||
|
|
||||||
|
if not oversized_chunks:
|
||||||
|
print("✅ Aucun chunk surdimensionné à traiter")
|
||||||
|
print()
|
||||||
|
print("=" * 80)
|
||||||
|
print("SCRIPT TERMINÉ - RIEN À FAIRE")
|
||||||
|
print("=" * 80)
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
# Trier par taille
|
||||||
|
oversized_chunks.sort(key=lambda x: x['tokens'], reverse=True)
|
||||||
|
|
||||||
|
print("Top 5 plus gros chunks:")
|
||||||
|
for i, chunk in enumerate(oversized_chunks[:5], 1):
|
||||||
|
print(f"{i}. {chunk['tokens']:,} tokens ({chunk['chars']:,} chars)")
|
||||||
|
print(f" Œuvre: {chunk['workTitle']}")
|
||||||
|
print(f" Section: {chunk['sectionPath'][:60]}...")
|
||||||
|
print()
|
||||||
|
|
||||||
|
if len(oversized_chunks) > 5:
|
||||||
|
print(f"... et {len(oversized_chunks) - 5} autres")
|
||||||
|
|
||||||
|
print()
|
||||||
|
|
||||||
|
# ========== 2. RE-CHUNKING ==========
|
||||||
|
print("2. RE-CHUNKING AVEC OVERLAP")
|
||||||
|
print("-" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Build work_title -> work_uuid map for references
|
||||||
|
work_map = {}
|
||||||
|
for work in work_collection.iterator(include_vector=False):
|
||||||
|
props = work.properties
|
||||||
|
title = props.get("title")
|
||||||
|
if title:
|
||||||
|
work_map[title] = str(work.uuid)
|
||||||
|
|
||||||
|
print(f"✓ {len(work_map)} Works mappés")
|
||||||
|
print()
|
||||||
|
|
||||||
|
deleted_count = 0
|
||||||
|
inserted_count = 0
|
||||||
|
errors = []
|
||||||
|
|
||||||
|
# Create iterator with or without tqdm
|
||||||
|
if HAS_TQDM:
|
||||||
|
iterator = tqdm(
|
||||||
|
oversized_chunks,
|
||||||
|
desc="Re-chunking",
|
||||||
|
unit="chunks"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
iterator = oversized_chunks
|
||||||
|
print("Re-chunking en cours...")
|
||||||
|
|
||||||
|
for idx, old_chunk in enumerate(iterator, 1):
|
||||||
|
try:
|
||||||
|
# Re-chunk text
|
||||||
|
new_texts = simple_chunk_with_overlap(
|
||||||
|
old_chunk['text'],
|
||||||
|
max_words=MAX_WORDS,
|
||||||
|
overlap_words=OVERLAP_WORDS
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get work reference
|
||||||
|
work_uuid = work_map.get(old_chunk['workTitle'])
|
||||||
|
if not work_uuid:
|
||||||
|
errors.append(f"Chunk {old_chunk['uuid'][:8]}: Work '{old_chunk['workTitle']}' introuvable")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Insert new chunks
|
||||||
|
for i, new_text in enumerate(new_texts):
|
||||||
|
# Sub-ordering: multiply base index by 100 and add part index
|
||||||
|
# Example: orderIndex=5 becomes 500, 501, 502, etc.
|
||||||
|
new_order_index = (old_chunk['orderIndex'] * 100) + i
|
||||||
|
|
||||||
|
new_props = {
|
||||||
|
"text": new_text,
|
||||||
|
"summary": "", # Empty summary for simple chunks
|
||||||
|
"keywords": old_chunk['keywords'],
|
||||||
|
"workTitle": old_chunk['workTitle'],
|
||||||
|
"workAuthor": old_chunk['workAuthor'],
|
||||||
|
"year": old_chunk['year'],
|
||||||
|
"language": old_chunk['language'],
|
||||||
|
"sectionPath": old_chunk['sectionPath'],
|
||||||
|
"chapterTitle": old_chunk['chapterTitle'],
|
||||||
|
"canonicalReference": old_chunk['canonicalReference'],
|
||||||
|
"unitType": old_chunk['unitType'],
|
||||||
|
"orderIndex": new_order_index,
|
||||||
|
}
|
||||||
|
|
||||||
|
chunk_v2.data.insert(
|
||||||
|
properties=new_props,
|
||||||
|
references={"work": work_uuid}
|
||||||
|
)
|
||||||
|
inserted_count += 1
|
||||||
|
|
||||||
|
# Delete old chunk
|
||||||
|
chunk_v2.data.delete_by_id(old_chunk['uuid'])
|
||||||
|
deleted_count += 1
|
||||||
|
|
||||||
|
# Progress without tqdm
|
||||||
|
if not HAS_TQDM and idx % 5 == 0:
|
||||||
|
print(f" {idx}/{len(oversized_chunks)} chunks traités...")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
errors.append(f"Chunk {old_chunk['uuid'][:8]}: {e}")
|
||||||
|
|
||||||
|
print()
|
||||||
|
print("-" * 80)
|
||||||
|
print(f"✓ Chunks supprimés: {deleted_count}")
|
||||||
|
print(f"✓ Nouveaux chunks créés: {inserted_count}")
|
||||||
|
if deleted_count > 0:
|
||||||
|
print(f" Expansion moyenne: {inserted_count / deleted_count:.1f}x")
|
||||||
|
else:
|
||||||
|
print(f" ⚠️ Aucun chunk supprimé - vérifier les erreurs")
|
||||||
|
|
||||||
|
if errors:
|
||||||
|
print()
|
||||||
|
print(f"⚠️ Erreurs rencontrées: {len(errors)}")
|
||||||
|
for err in errors[:10]:
|
||||||
|
print(f" - {err}")
|
||||||
|
if len(errors) > 10:
|
||||||
|
print(f" ... et {len(errors) - 10} autres")
|
||||||
|
|
||||||
|
print()
|
||||||
|
|
||||||
|
# ========== 3. VÉRIFICATION ==========
|
||||||
|
print("3. VÉRIFICATION POST-RECHUNKING")
|
||||||
|
print("-" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
print("Comptage des nouveaux chunks...")
|
||||||
|
remaining_oversized = 0
|
||||||
|
total_chunks = 0
|
||||||
|
|
||||||
|
for chunk in chunk_v2.iterator(include_vector=False):
|
||||||
|
total_chunks += 1
|
||||||
|
text = chunk.properties.get('text', '')
|
||||||
|
tokens = estimate_tokens(text)
|
||||||
|
if tokens > TOKEN_THRESHOLD:
|
||||||
|
remaining_oversized += 1
|
||||||
|
|
||||||
|
print(f"✓ Total chunks: {total_chunks:,}")
|
||||||
|
print(f"✓ Chunks > {TOKEN_THRESHOLD} tokens: {remaining_oversized}")
|
||||||
|
|
||||||
|
if remaining_oversized == 0:
|
||||||
|
print()
|
||||||
|
print("✅ Aucun chunk surdimensionné restant!")
|
||||||
|
else:
|
||||||
|
print()
|
||||||
|
print(f"⚠️ {remaining_oversized} chunks encore > {TOKEN_THRESHOLD} tokens")
|
||||||
|
print(" Relancer le script si nécessaire")
|
||||||
|
|
||||||
|
print()
|
||||||
|
print("=" * 80)
|
||||||
|
print("RE-CHUNKING TERMINÉ")
|
||||||
|
print("=" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
print("RÉSULTATS:")
|
||||||
|
print(f" • Chunks supprimés: {deleted_count}")
|
||||||
|
print(f" • Nouveaux chunks créés: {inserted_count}")
|
||||||
|
if deleted_count > 0:
|
||||||
|
print(f" • Expansion: {inserted_count / deleted_count:.1f}x")
|
||||||
|
print(f" • Chunks restants > {TOKEN_THRESHOLD} tokens: {remaining_oversized}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
if remaining_oversized == 0 and deleted_count > 0:
|
||||||
|
print("✅ RE-CHUNKING RÉUSSI")
|
||||||
|
print()
|
||||||
|
print("AMÉLIORATIONS:")
|
||||||
|
print(f" • {deleted_count} chunks géants éliminés")
|
||||||
|
print(f" • {inserted_count} chunks optimaux créés")
|
||||||
|
print(f" • Taille max: {MAX_WORDS} mots (~{MAX_WORDS * 2.5:.0f} tokens)")
|
||||||
|
print(f" • Overlap: {OVERLAP_WORDS} mots (contexte préservé)")
|
||||||
|
print()
|
||||||
|
print("PROCHAINES ÉTAPES:")
|
||||||
|
print(" 1. Tester la recherche sémantique")
|
||||||
|
print(" 2. Vérifier la qualité des vecteurs")
|
||||||
|
print(" 3. Optionnel: Mettre à jour Summary_v2.chunksCount si nécessaire")
|
||||||
|
elif deleted_count == 0:
|
||||||
|
print("ℹ️ Aucun chunk n'a nécessité de re-chunking")
|
||||||
|
|
||||||
|
finally:
|
||||||
|
client.close()
|
||||||
402
10_test_search_quality.py
Normal file
402
10_test_search_quality.py
Normal file
@@ -0,0 +1,402 @@
|
|||||||
|
"""Test search quality with re-chunked data.
|
||||||
|
|
||||||
|
This script tests semantic search to verify that the re-chunking improved
|
||||||
|
search quality and relevance.
|
||||||
|
|
||||||
|
Tests:
|
||||||
|
1. Chunk size distribution after re-chunking
|
||||||
|
2. Overlap verification between consecutive chunks
|
||||||
|
3. Semantic search quality on various queries
|
||||||
|
4. Comparison of results from giant chunks vs optimized chunks
|
||||||
|
"""
|
||||||
|
|
||||||
|
import weaviate
|
||||||
|
import sys
|
||||||
|
import requests
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add utils to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))
|
||||||
|
|
||||||
|
from utils.llm_chunker_improved import estimate_tokens
|
||||||
|
|
||||||
|
if sys.stdout.encoding != 'utf-8':
|
||||||
|
sys.stdout.reconfigure(encoding='utf-8')
|
||||||
|
|
||||||
|
# Vectorizer URL (same as in 11_vectorize_missing_chunks.py)
|
||||||
|
VECTORIZER_URL = "http://localhost:8090/vectors"
|
||||||
|
|
||||||
|
def vectorize_query(query: str) -> list[float]:
|
||||||
|
"""Manually vectorize a query using text2vec-transformers service.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
query: Query text to vectorize
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Vector as list of floats (1024 dimensions for BGE-M3)
|
||||||
|
"""
|
||||||
|
response = requests.post(
|
||||||
|
VECTORIZER_URL,
|
||||||
|
json={"text": query},
|
||||||
|
headers={"Content-Type": "application/json"},
|
||||||
|
timeout=30
|
||||||
|
)
|
||||||
|
if response.status_code != 200:
|
||||||
|
raise RuntimeError(f"Vectorization failed: HTTP {response.status_code}")
|
||||||
|
|
||||||
|
result = response.json()
|
||||||
|
vector = result.get('vector')
|
||||||
|
if not vector:
|
||||||
|
raise RuntimeError("No vector in response")
|
||||||
|
|
||||||
|
return vector
|
||||||
|
|
||||||
|
client = weaviate.connect_to_local()
|
||||||
|
|
||||||
|
try:
|
||||||
|
print("=" * 80)
|
||||||
|
print("TEST DE LA QUALITÉ DE RECHERCHE APRÈS RE-CHUNKING")
|
||||||
|
print("=" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
chunk_v2 = client.collections.get("Chunk_v2")
|
||||||
|
|
||||||
|
# ========== 1. DISTRIBUTION DES TAILLES ==========
|
||||||
|
print("1. DISTRIBUTION DES TAILLES DE CHUNKS")
|
||||||
|
print("-" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
print("Analyse en cours...")
|
||||||
|
|
||||||
|
sizes = []
|
||||||
|
for chunk in chunk_v2.iterator(include_vector=False):
|
||||||
|
text = chunk.properties.get('text', '')
|
||||||
|
tokens = estimate_tokens(text)
|
||||||
|
sizes.append(tokens)
|
||||||
|
|
||||||
|
total = len(sizes)
|
||||||
|
avg = sum(sizes) / total
|
||||||
|
max_size = max(sizes)
|
||||||
|
min_size = min(sizes)
|
||||||
|
|
||||||
|
print(f"Total chunks: {total:,}")
|
||||||
|
print(f"Taille moyenne: {avg:.0f} tokens")
|
||||||
|
print(f"Min: {min_size} tokens")
|
||||||
|
print(f"Max: {max_size} tokens")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Distribution par tranches
|
||||||
|
ranges = [
|
||||||
|
(0, 500, "Très petits"),
|
||||||
|
(500, 1000, "Petits"),
|
||||||
|
(1000, 1500, "Moyens"),
|
||||||
|
(1500, 2000, "Grands"),
|
||||||
|
(2000, 3000, "Très grands"),
|
||||||
|
(3000, 10000, "ÉNORMES"),
|
||||||
|
]
|
||||||
|
|
||||||
|
print("Distribution par tranches:")
|
||||||
|
for min_tok, max_tok, label in ranges:
|
||||||
|
count = sum(1 for s in sizes if min_tok <= s < max_tok)
|
||||||
|
percentage = count / total * 100
|
||||||
|
bar = "█" * int(percentage / 2)
|
||||||
|
print(f" {min_tok:>5}-{max_tok:>5} tokens ({label:15}): {count:>5} ({percentage:>5.1f}%) {bar}")
|
||||||
|
|
||||||
|
print()
|
||||||
|
|
||||||
|
# ========== 2. VÉRIFICATION OVERLAP ==========
|
||||||
|
print("2. VÉRIFICATION DE L'OVERLAP ENTRE CHUNKS CONSÉCUTIFS")
|
||||||
|
print("-" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Prendre une œuvre pour vérifier l'overlap
|
||||||
|
print("Analyse de l'overlap dans 'Between Past and Future'...")
|
||||||
|
|
||||||
|
arendt_chunks = []
|
||||||
|
for chunk in chunk_v2.iterator(include_vector=False):
|
||||||
|
props = chunk.properties
|
||||||
|
if props.get('workTitle') == 'Between Past and Future':
|
||||||
|
arendt_chunks.append({
|
||||||
|
'orderIndex': props.get('orderIndex', 0),
|
||||||
|
'text': props.get('text', ''),
|
||||||
|
'sectionPath': props.get('sectionPath', '')
|
||||||
|
})
|
||||||
|
|
||||||
|
# Trier par orderIndex
|
||||||
|
arendt_chunks.sort(key=lambda x: x['orderIndex'])
|
||||||
|
|
||||||
|
print(f"Chunks trouvés: {len(arendt_chunks)}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Vérifier overlap entre chunks consécutifs de même section
|
||||||
|
overlaps_found = 0
|
||||||
|
overlaps_checked = 0
|
||||||
|
|
||||||
|
for i in range(len(arendt_chunks) - 1):
|
||||||
|
current = arendt_chunks[i]
|
||||||
|
next_chunk = arendt_chunks[i + 1]
|
||||||
|
|
||||||
|
# Vérifier si même section (potentiellement des chunks split)
|
||||||
|
if current['sectionPath'] == next_chunk['sectionPath']:
|
||||||
|
# Extraire les derniers 200 caractères du chunk actuel
|
||||||
|
current_end = current['text'][-200:].strip()
|
||||||
|
# Extraire les premiers 200 caractères du chunk suivant
|
||||||
|
next_start = next_chunk['text'][:200].strip()
|
||||||
|
|
||||||
|
# Chercher overlap
|
||||||
|
overlap_found = False
|
||||||
|
for length in range(50, 201, 10): # Tester différentes longueurs
|
||||||
|
if len(current_end) < length or len(next_start) < length:
|
||||||
|
continue
|
||||||
|
|
||||||
|
test_end = current_end[-length:]
|
||||||
|
if test_end in next_start:
|
||||||
|
overlap_found = True
|
||||||
|
overlaps_found += 1
|
||||||
|
break
|
||||||
|
|
||||||
|
overlaps_checked += 1
|
||||||
|
|
||||||
|
if overlaps_checked > 0:
|
||||||
|
print(f"Chunks consécutifs vérifiés: {overlaps_checked}")
|
||||||
|
print(f"Overlaps détectés: {overlaps_found} ({overlaps_found/overlaps_checked*100:.1f}%)")
|
||||||
|
else:
|
||||||
|
print("Aucun chunk consécutif dans la même section (pas de split détecté)")
|
||||||
|
|
||||||
|
print()
|
||||||
|
|
||||||
|
# ========== 3. TESTS DE RECHERCHE SÉMANTIQUE ==========
|
||||||
|
print("3. TESTS DE RECHERCHE SÉMANTIQUE")
|
||||||
|
print("-" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
test_queries = [
|
||||||
|
{
|
||||||
|
"query": "What is the nature of representation in cognitive science?",
|
||||||
|
"expected_work": "Mind Design III",
|
||||||
|
"description": "Requête philosophique complexe"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "Comment définit-on la vertu selon Platon?",
|
||||||
|
"expected_work": "Platon - Ménon",
|
||||||
|
"description": "Requête en français sur un concept spécifique"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "pragmatism and belief fixation",
|
||||||
|
"expected_work": "Collected papers",
|
||||||
|
"description": "Concepts multiples (test de granularité)"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "Entre la logique des termes et la grammaire spéculative",
|
||||||
|
"expected_work": "La pensée-signe",
|
||||||
|
"description": "Requête technique académique"
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
for i, test in enumerate(test_queries, 1):
|
||||||
|
print(f"Test {i}: {test['description']}")
|
||||||
|
print(f"Query: \"{test['query']}\"")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Vectorize query and search with near_vector
|
||||||
|
# (Chunk_v2 has no vectorizer, so we must manually vectorize queries)
|
||||||
|
query_vector = vectorize_query(test['query'])
|
||||||
|
result = chunk_v2.query.near_vector(
|
||||||
|
near_vector=query_vector,
|
||||||
|
limit=5,
|
||||||
|
return_properties=[
|
||||||
|
'text', 'workTitle', 'workAuthor',
|
||||||
|
'sectionPath', 'chapterTitle'
|
||||||
|
],
|
||||||
|
return_metadata=['distance']
|
||||||
|
)
|
||||||
|
|
||||||
|
if not result.objects:
|
||||||
|
print(" ❌ Aucun résultat trouvé")
|
||||||
|
print()
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Analyser les résultats
|
||||||
|
print(f" Résultats: {len(result.objects)}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
for j, obj in enumerate(result.objects, 1):
|
||||||
|
props = obj.properties
|
||||||
|
work_title = props.get('workTitle', 'N/A')
|
||||||
|
text = props.get('text', '')
|
||||||
|
tokens = estimate_tokens(text)
|
||||||
|
|
||||||
|
# Distance (si disponible)
|
||||||
|
distance = getattr(obj.metadata, 'distance', None) if hasattr(obj, 'metadata') else None
|
||||||
|
distance_str = f" (distance: {distance:.4f})" if distance else ""
|
||||||
|
|
||||||
|
# Marquer si c'est l'œuvre attendue
|
||||||
|
match_icon = "✓" if test['expected_work'] in work_title else " "
|
||||||
|
|
||||||
|
print(f" [{match_icon}] {j}. {work_title}{distance_str}")
|
||||||
|
print(f" Taille: {tokens} tokens")
|
||||||
|
print(f" Section: {props.get('sectionPath', 'N/A')[:60]}...")
|
||||||
|
print(f" Extrait: {text[:120]}...")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Vérifier si l'œuvre attendue est dans les résultats
|
||||||
|
found_expected = any(
|
||||||
|
test['expected_work'] in obj.properties.get('workTitle', '')
|
||||||
|
for obj in result.objects
|
||||||
|
)
|
||||||
|
|
||||||
|
if found_expected:
|
||||||
|
rank = next(
|
||||||
|
i for i, obj in enumerate(result.objects, 1)
|
||||||
|
if test['expected_work'] in obj.properties.get('workTitle', '')
|
||||||
|
)
|
||||||
|
print(f" ✅ Œuvre attendue trouvée (rang {rank}/5)")
|
||||||
|
else:
|
||||||
|
print(f" ⚠️ Œuvre attendue '{test['expected_work']}' non trouvée dans le top 5")
|
||||||
|
|
||||||
|
print()
|
||||||
|
print("-" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
# ========== 4. STATISTIQUES GLOBALES ==========
|
||||||
|
print("4. STATISTIQUES GLOBALES DE RECHERCHE")
|
||||||
|
print("-" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Tester une requête large
|
||||||
|
broad_query = "philosophy and logic"
|
||||||
|
print(f"Requête large: \"{broad_query}\"")
|
||||||
|
print()
|
||||||
|
|
||||||
|
query_vector = vectorize_query(broad_query)
|
||||||
|
result = chunk_v2.query.near_vector(
|
||||||
|
near_vector=query_vector,
|
||||||
|
limit=20,
|
||||||
|
return_properties=['workTitle', 'text']
|
||||||
|
)
|
||||||
|
|
||||||
|
# Compter par œuvre
|
||||||
|
work_distribution = {}
|
||||||
|
chunk_sizes_in_results = []
|
||||||
|
|
||||||
|
for obj in result.objects:
|
||||||
|
props = obj.properties
|
||||||
|
work = props.get('workTitle', 'Unknown')
|
||||||
|
work_distribution[work] = work_distribution.get(work, 0) + 1
|
||||||
|
|
||||||
|
text = props.get('text', '')
|
||||||
|
tokens = estimate_tokens(text)
|
||||||
|
chunk_sizes_in_results.append(tokens)
|
||||||
|
|
||||||
|
print(f"Résultats par œuvre (top 20):")
|
||||||
|
for work, count in sorted(work_distribution.items(), key=lambda x: x[1], reverse=True):
|
||||||
|
print(f" • {work}: {count} chunks")
|
||||||
|
|
||||||
|
print()
|
||||||
|
|
||||||
|
if chunk_sizes_in_results:
|
||||||
|
avg_result_size = sum(chunk_sizes_in_results) / len(chunk_sizes_in_results)
|
||||||
|
max_result_size = max(chunk_sizes_in_results)
|
||||||
|
print(f"Taille moyenne des chunks retournés: {avg_result_size:.0f} tokens")
|
||||||
|
print(f"Taille max des chunks retournés: {max_result_size} tokens")
|
||||||
|
|
||||||
|
print()
|
||||||
|
|
||||||
|
# ========== 5. SCORE DE QUALITÉ ==========
|
||||||
|
print("5. SCORE DE QUALITÉ DE LA RECHERCHE")
|
||||||
|
print("-" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
quality_checks = []
|
||||||
|
|
||||||
|
# Check 1: Aucun chunk > 2000 tokens
|
||||||
|
oversized = sum(1 for s in sizes if s > 2000)
|
||||||
|
quality_checks.append({
|
||||||
|
'name': 'Taille des chunks',
|
||||||
|
'passed': oversized == 0,
|
||||||
|
'detail': f'{oversized} chunks > 2000 tokens'
|
||||||
|
})
|
||||||
|
|
||||||
|
# Check 2: Distribution équilibrée
|
||||||
|
optimal_range = sum(1 for s in sizes if 200 <= s <= 1500)
|
||||||
|
optimal_percentage = optimal_range / total * 100
|
||||||
|
quality_checks.append({
|
||||||
|
'name': 'Distribution optimale',
|
||||||
|
'passed': optimal_percentage >= 80,
|
||||||
|
'detail': f'{optimal_percentage:.1f}% dans range optimal (200-1500 tokens)'
|
||||||
|
})
|
||||||
|
|
||||||
|
# Check 3: Résultats variés
|
||||||
|
unique_works = len(work_distribution)
|
||||||
|
quality_checks.append({
|
||||||
|
'name': 'Diversité des résultats',
|
||||||
|
'passed': unique_works >= 3,
|
||||||
|
'detail': f'{unique_works} œuvres différentes dans top 20'
|
||||||
|
})
|
||||||
|
|
||||||
|
# Check 4: Overlap présent
|
||||||
|
quality_checks.append({
|
||||||
|
'name': 'Overlap entre chunks',
|
||||||
|
'passed': overlaps_found > 0 if overlaps_checked > 0 else None,
|
||||||
|
'detail': f'{overlaps_found}/{overlaps_checked} overlaps détectés' if overlaps_checked > 0 else 'N/A'
|
||||||
|
})
|
||||||
|
|
||||||
|
# Afficher les résultats
|
||||||
|
passed = sum(1 for c in quality_checks if c['passed'] is True)
|
||||||
|
total_checks = sum(1 for c in quality_checks if c['passed'] is not None)
|
||||||
|
|
||||||
|
for check in quality_checks:
|
||||||
|
if check['passed'] is None:
|
||||||
|
icon = "⚠️"
|
||||||
|
status = "N/A"
|
||||||
|
elif check['passed']:
|
||||||
|
icon = "✅"
|
||||||
|
status = "OK"
|
||||||
|
else:
|
||||||
|
icon = "❌"
|
||||||
|
status = "FAIL"
|
||||||
|
|
||||||
|
print(f"{icon} {check['name']}: {status}")
|
||||||
|
print(f" {check['detail']}")
|
||||||
|
|
||||||
|
print()
|
||||||
|
print(f"Score: {passed}/{total_checks} ({passed/total_checks*100:.0f}%)")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# ========== 6. RÉSUMÉ ==========
|
||||||
|
print("=" * 80)
|
||||||
|
print("RÉSUMÉ DU TEST")
|
||||||
|
print("=" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
if passed >= total_checks * 0.8:
|
||||||
|
print("✅ QUALITÉ DE RECHERCHE: EXCELLENTE")
|
||||||
|
print()
|
||||||
|
print("Les chunks re-chunkés ont amélioré la recherche:")
|
||||||
|
print(f" • {total:,} chunks optimisés")
|
||||||
|
print(f" • Taille moyenne: {avg:.0f} tokens (optimal)")
|
||||||
|
print(f" • {optimal_percentage:.1f}% dans la plage optimale")
|
||||||
|
print(f" • Max: {max_size} tokens (< 2500)")
|
||||||
|
print(f" • Overlap détecté: {overlaps_found > 0 if overlaps_checked > 0 else 'N/A'}")
|
||||||
|
print()
|
||||||
|
print("Recommandations:")
|
||||||
|
print(" ✓ La recherche sémantique fonctionne correctement")
|
||||||
|
print(" ✓ Les chunks sont de taille optimale pour BGE-M3")
|
||||||
|
print(" ✓ Le système est prêt pour la production")
|
||||||
|
elif passed >= total_checks * 0.6:
|
||||||
|
print("⚠️ QUALITÉ DE RECHERCHE: BONNE")
|
||||||
|
print()
|
||||||
|
print("Quelques améliorations possibles:")
|
||||||
|
for check in quality_checks:
|
||||||
|
if not check['passed'] and check['passed'] is not None:
|
||||||
|
print(f" • {check['name']}: {check['detail']}")
|
||||||
|
else:
|
||||||
|
print("❌ QUALITÉ DE RECHERCHE: À AMÉLIORER")
|
||||||
|
print()
|
||||||
|
print("Problèmes détectés:")
|
||||||
|
for check in quality_checks:
|
||||||
|
if not check['passed'] and check['passed'] is not None:
|
||||||
|
print(f" • {check['name']}: {check['detail']}")
|
||||||
|
|
||||||
|
finally:
|
||||||
|
client.close()
|
||||||
217
11_vectorize_missing_chunks.py
Normal file
217
11_vectorize_missing_chunks.py
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
"""Vectorize chunks that don't have vectors.
|
||||||
|
|
||||||
|
After re-chunking, new chunks were created without vectors because Chunk_v2
|
||||||
|
collection has no vectorizer configured. This script manually vectorizes
|
||||||
|
these chunks using the text2vec-transformers service.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import weaviate
|
||||||
|
import sys
|
||||||
|
import requests
|
||||||
|
|
||||||
|
if sys.stdout.encoding != 'utf-8':
|
||||||
|
sys.stdout.reconfigure(encoding='utf-8')
|
||||||
|
|
||||||
|
# Try to import tqdm
|
||||||
|
try:
|
||||||
|
from tqdm import tqdm
|
||||||
|
HAS_TQDM = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_TQDM = False
|
||||||
|
|
||||||
|
# Text2vec-transformers service URL (from docker-compose.yml)
|
||||||
|
VECTORIZER_URL = "http://localhost:8090/vectors"
|
||||||
|
|
||||||
|
client = weaviate.connect_to_local()
|
||||||
|
|
||||||
|
try:
|
||||||
|
print("=" * 80)
|
||||||
|
print("VECTORISATION DES CHUNKS SANS VECTEUR")
|
||||||
|
print("=" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
chunk_v2 = client.collections.get("Chunk_v2")
|
||||||
|
|
||||||
|
# ========== 1. IDENTIFIER LES CHUNKS SANS VECTEUR ==========
|
||||||
|
print("1. IDENTIFICATION DES CHUNKS SANS VECTEUR")
|
||||||
|
print("-" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
print("Analyse en cours...")
|
||||||
|
|
||||||
|
chunks_to_vectorize = []
|
||||||
|
|
||||||
|
for chunk in chunk_v2.iterator(include_vector=True):
|
||||||
|
if not chunk.vector or not chunk.vector.get('default'):
|
||||||
|
props = chunk.properties
|
||||||
|
chunks_to_vectorize.append({
|
||||||
|
'uuid': chunk.uuid,
|
||||||
|
'text': props.get('text', ''),
|
||||||
|
'summary': props.get('summary', ''),
|
||||||
|
'keywords': props.get('keywords', []),
|
||||||
|
'workTitle': props.get('workTitle', 'N/A')
|
||||||
|
})
|
||||||
|
|
||||||
|
print(f"✓ {len(chunks_to_vectorize)} chunks sans vecteur trouvés")
|
||||||
|
print()
|
||||||
|
|
||||||
|
if not chunks_to_vectorize:
|
||||||
|
print("✅ Aucun chunk à vectoriser")
|
||||||
|
print()
|
||||||
|
print("=" * 80)
|
||||||
|
print("SCRIPT TERMINÉ - RIEN À FAIRE")
|
||||||
|
print("=" * 80)
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
# ========== 2. VECTORISATION ==========
|
||||||
|
print("2. VECTORISATION DES CHUNKS")
|
||||||
|
print("-" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
print(f"Service vectorizer: {VECTORIZER_URL}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
vectorized_count = 0
|
||||||
|
errors = []
|
||||||
|
|
||||||
|
# Create iterator with or without tqdm
|
||||||
|
if HAS_TQDM:
|
||||||
|
iterator = tqdm(
|
||||||
|
chunks_to_vectorize,
|
||||||
|
desc="Vectorisation",
|
||||||
|
unit="chunks"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
iterator = chunks_to_vectorize
|
||||||
|
print("Vectorisation en cours...")
|
||||||
|
|
||||||
|
for idx, chunk_data in enumerate(iterator, 1):
|
||||||
|
try:
|
||||||
|
# Prepare text for vectorization
|
||||||
|
# Combine text, summary, and keywords as per original Chunk schema
|
||||||
|
text_parts = [chunk_data['text']]
|
||||||
|
|
||||||
|
if chunk_data['summary']:
|
||||||
|
text_parts.append(chunk_data['summary'])
|
||||||
|
|
||||||
|
if chunk_data['keywords']:
|
||||||
|
text_parts.append(' '.join(chunk_data['keywords']))
|
||||||
|
|
||||||
|
combined_text = ' '.join(text_parts)
|
||||||
|
|
||||||
|
# Call text2vec-transformers service
|
||||||
|
response = requests.post(
|
||||||
|
VECTORIZER_URL,
|
||||||
|
json={"text": combined_text},
|
||||||
|
headers={"Content-Type": "application/json"},
|
||||||
|
timeout=30
|
||||||
|
)
|
||||||
|
|
||||||
|
if response.status_code != 200:
|
||||||
|
errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: HTTP {response.status_code}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
result = response.json()
|
||||||
|
vector = result.get('vector')
|
||||||
|
|
||||||
|
if not vector:
|
||||||
|
errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: Pas de vecteur dans la réponse")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Update chunk with vector
|
||||||
|
chunk_v2.data.update(
|
||||||
|
uuid=chunk_data['uuid'],
|
||||||
|
vector=vector
|
||||||
|
)
|
||||||
|
|
||||||
|
vectorized_count += 1
|
||||||
|
|
||||||
|
# Progress without tqdm
|
||||||
|
if not HAS_TQDM and idx % 10 == 0:
|
||||||
|
print(f" {idx}/{len(chunks_to_vectorize)} chunks vectorisés...")
|
||||||
|
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: Erreur réseau - {e}")
|
||||||
|
except Exception as e:
|
||||||
|
errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: {e}")
|
||||||
|
|
||||||
|
print()
|
||||||
|
print("-" * 80)
|
||||||
|
print(f"✓ Chunks vectorisés: {vectorized_count}/{len(chunks_to_vectorize)}")
|
||||||
|
|
||||||
|
if errors:
|
||||||
|
print()
|
||||||
|
print(f"⚠️ Erreurs rencontrées: {len(errors)}")
|
||||||
|
for err in errors[:10]:
|
||||||
|
print(f" - {err}")
|
||||||
|
if len(errors) > 10:
|
||||||
|
print(f" ... et {len(errors) - 10} autres")
|
||||||
|
|
||||||
|
print()
|
||||||
|
|
||||||
|
# ========== 3. VÉRIFICATION ==========
|
||||||
|
print("3. VÉRIFICATION POST-VECTORISATION")
|
||||||
|
print("-" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
print("Recomptage...")
|
||||||
|
remaining_without_vector = 0
|
||||||
|
total_chunks = 0
|
||||||
|
|
||||||
|
for chunk in chunk_v2.iterator(include_vector=True):
|
||||||
|
total_chunks += 1
|
||||||
|
if not chunk.vector or not chunk.vector.get('default'):
|
||||||
|
remaining_without_vector += 1
|
||||||
|
|
||||||
|
chunks_with_vector = total_chunks - remaining_without_vector
|
||||||
|
|
||||||
|
print(f"✓ Total chunks: {total_chunks:,}")
|
||||||
|
print(f"✓ Avec vecteur: {chunks_with_vector:,} ({chunks_with_vector/total_chunks*100:.1f}%)")
|
||||||
|
print(f"✓ Sans vecteur: {remaining_without_vector:,}")
|
||||||
|
|
||||||
|
print()
|
||||||
|
|
||||||
|
if remaining_without_vector == 0:
|
||||||
|
print("✅ Tous les chunks ont été vectorisés!")
|
||||||
|
else:
|
||||||
|
print(f"⚠️ {remaining_without_vector} chunks encore sans vecteur")
|
||||||
|
print(" Relancer le script ou vérifier les erreurs")
|
||||||
|
|
||||||
|
print()
|
||||||
|
print("=" * 80)
|
||||||
|
print("VECTORISATION TERMINÉE")
|
||||||
|
print("=" * 80)
|
||||||
|
print()
|
||||||
|
|
||||||
|
if remaining_without_vector == 0:
|
||||||
|
print("✅ VECTORISATION RÉUSSIE")
|
||||||
|
print()
|
||||||
|
print("RÉSULTATS:")
|
||||||
|
print(f" • {vectorized_count} nouveaux vecteurs créés")
|
||||||
|
print(f" • {total_chunks:,} chunks totaux")
|
||||||
|
print(f" • 100% des chunks ont des vecteurs")
|
||||||
|
print()
|
||||||
|
print("PROCHAINES ÉTAPES:")
|
||||||
|
print(" 1. Relancer le test de recherche: python 10_test_search_quality.py")
|
||||||
|
print(" 2. Tester l'application Flask")
|
||||||
|
print()
|
||||||
|
print("NOTE: Chunk_v2 n'a toujours pas de vectorizer configuré.")
|
||||||
|
print("Les futurs nouveaux chunks devront être vectorisés manuellement")
|
||||||
|
print("OU la collection devra être recréée avec un vectorizer.")
|
||||||
|
elif vectorized_count > 0:
|
||||||
|
print("⚠️ VECTORISATION PARTIELLE")
|
||||||
|
print()
|
||||||
|
print(f" • {vectorized_count} chunks vectorisés")
|
||||||
|
print(f" • {remaining_without_vector} chunks restants")
|
||||||
|
print(" • Vérifier les erreurs et relancer")
|
||||||
|
else:
|
||||||
|
print("❌ VECTORISATION ÉCHOUÉE")
|
||||||
|
print()
|
||||||
|
print("Aucun chunk n'a pu être vectorisé.")
|
||||||
|
print("Vérifications:")
|
||||||
|
print(f" 1. Service text2vec-transformers actif: {VECTORIZER_URL}")
|
||||||
|
print(" 2. Docker containers en cours d'exécution")
|
||||||
|
print(" 3. Logs des erreurs ci-dessus")
|
||||||
|
|
||||||
|
finally:
|
||||||
|
client.close()
|
||||||
@@ -31,6 +31,10 @@ services:
|
|||||||
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: "true" # ok pour dev/local
|
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: "true" # ok pour dev/local
|
||||||
PERSISTENCE_DATA_PATH: "/var/lib/weaviate"
|
PERSISTENCE_DATA_PATH: "/var/lib/weaviate"
|
||||||
CLUSTER_HOSTNAME: "node1"
|
CLUSTER_HOSTNAME: "node1"
|
||||||
|
CLUSTER_GOSSIP_BIND_PORT: "7946"
|
||||||
|
CLUSTER_DATA_BIND_PORT: "7947"
|
||||||
|
# Fix for "No private IP address found" error
|
||||||
|
CLUSTER_JOIN: ""
|
||||||
DEFAULT_VECTORIZER_MODULE: "text2vec-transformers"
|
DEFAULT_VECTORIZER_MODULE: "text2vec-transformers"
|
||||||
ENABLE_MODULES: "text2vec-transformers"
|
ENABLE_MODULES: "text2vec-transformers"
|
||||||
TRANSFORMERS_INFERENCE_API: "http://text2vec-transformers:8080"
|
TRANSFORMERS_INFERENCE_API: "http://text2vec-transformers:8080"
|
||||||
@@ -56,6 +60,8 @@ services:
|
|||||||
# - Current setup: CPU-only with AVX2 optimization (functional but slower)
|
# - Current setup: CPU-only with AVX2 optimization (functional but slower)
|
||||||
image: cr.weaviate.io/semitechnologies/transformers-inference:baai-bge-m3-onnx-latest
|
image: cr.weaviate.io/semitechnologies/transformers-inference:baai-bge-m3-onnx-latest
|
||||||
restart: on-failure:0
|
restart: on-failure:0
|
||||||
|
ports:
|
||||||
|
- "8090:8080" # Expose vectorizer API for manual vectorization
|
||||||
environment:
|
environment:
|
||||||
# ONNX runtime - CPU only (CUDA not supported in ONNX version)
|
# ONNX runtime - CPU only (CUDA not supported in ONNX version)
|
||||||
ENABLE_CUDA: "0"
|
ENABLE_CUDA: "0"
|
||||||
|
|||||||
@@ -52,9 +52,15 @@ from .llm_structurer import (
|
|||||||
)
|
)
|
||||||
from .llm_cleaner import clean_page_markers, is_chunk_valid
|
from .llm_cleaner import clean_page_markers, is_chunk_valid
|
||||||
from .types import LLMProvider, SemanticChunk
|
from .types import LLMProvider, SemanticChunk
|
||||||
|
from .llm_chunker_improved import simple_chunk_with_overlap, validate_chunk_size
|
||||||
|
|
||||||
logger: logging.Logger = logging.getLogger(__name__)
|
logger: logging.Logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Chunk size limits (2024-01 optimization)
|
||||||
|
MAX_CHUNK_WORDS = 1000 # Hard limit to stay within BGE-M3 context
|
||||||
|
OVERLAP_WORDS = 100 # Overlap for context preservation
|
||||||
|
FORCE_SIMPLE_CHUNKING_THRESHOLD = 1500 # Words - force simple chunking above this
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Type Definitions for LLM Chunker
|
# Type Definitions for LLM Chunker
|
||||||
@@ -221,8 +227,43 @@ def chunk_section_with_llm(
|
|||||||
# Nettoyer le contenu
|
# Nettoyer le contenu
|
||||||
content: str = clean_page_markers(section_content)
|
content: str = clean_page_markers(section_content)
|
||||||
|
|
||||||
# Si le contenu est court, ne pas découper
|
# Compter les mots
|
||||||
word_count: int = len(content.split())
|
word_count: int = len(content.split())
|
||||||
|
|
||||||
|
# FORCE SIMPLE CHUNKING if section is too long (> 1500 words)
|
||||||
|
# This prevents giant chunks that exceed BGE-M3 limits
|
||||||
|
if word_count > FORCE_SIMPLE_CHUNKING_THRESHOLD:
|
||||||
|
logger.warning(
|
||||||
|
f"Section '{section_title}' is too long ({word_count} words), "
|
||||||
|
f"forcing simple chunking with overlap"
|
||||||
|
)
|
||||||
|
simple_texts = simple_chunk_with_overlap(
|
||||||
|
content,
|
||||||
|
max_words=MAX_CHUNK_WORDS,
|
||||||
|
overlap_words=OVERLAP_WORDS
|
||||||
|
)
|
||||||
|
|
||||||
|
# Convert to SemanticChunk format
|
||||||
|
result_chunks: List[SemanticChunk] = []
|
||||||
|
for i, text in enumerate(simple_texts):
|
||||||
|
para_num = extract_paragraph_number(text)
|
||||||
|
chunk: SemanticChunk = {
|
||||||
|
"text": text,
|
||||||
|
"summary": f"{section_title} (partie {i+1}/{len(simple_texts)})",
|
||||||
|
"concepts": [],
|
||||||
|
"type": "main_content",
|
||||||
|
"section_level": section_level,
|
||||||
|
}
|
||||||
|
if para_num is not None:
|
||||||
|
chunk["paragraph_number"] = para_num
|
||||||
|
if subsection_title and subsection_title != section_title:
|
||||||
|
chunk["subsection_title"] = subsection_title
|
||||||
|
result_chunks.append(chunk)
|
||||||
|
|
||||||
|
logger.info(f"Section split into {len(result_chunks)} chunks with overlap")
|
||||||
|
return result_chunks
|
||||||
|
|
||||||
|
# Si le contenu est court, ne pas découper
|
||||||
if word_count < target_chunk_size * 0.8:
|
if word_count < target_chunk_size * 0.8:
|
||||||
para_num: Optional[int] = extract_paragraph_number(content)
|
para_num: Optional[int] = extract_paragraph_number(content)
|
||||||
chunk: SemanticChunk = {
|
chunk: SemanticChunk = {
|
||||||
@@ -320,39 +361,66 @@ RÉPONDS avec un JSON entre <JSON></JSON>:
|
|||||||
|
|
||||||
valid_chunks.append(chunk_data)
|
valid_chunks.append(chunk_data)
|
||||||
|
|
||||||
# Si aucun chunk valide, retourner le contenu complet
|
# Si aucun chunk valide, utiliser simple chunking avec overlap
|
||||||
if not valid_chunks:
|
if not valid_chunks:
|
||||||
logger.warning(f"Aucun chunk valide pour '{section_title}', retour contenu complet")
|
logger.warning(
|
||||||
para_num = extract_paragraph_number(content)
|
f"Aucun chunk valide pour '{section_title}', "
|
||||||
fallback: SemanticChunk = {
|
f"fallback vers simple chunking avec overlap"
|
||||||
"text": content,
|
)
|
||||||
"summary": section_title,
|
simple_texts = simple_chunk_with_overlap(
|
||||||
|
content,
|
||||||
|
max_words=MAX_CHUNK_WORDS,
|
||||||
|
overlap_words=OVERLAP_WORDS
|
||||||
|
)
|
||||||
|
|
||||||
|
fallback_chunks: List[SemanticChunk] = []
|
||||||
|
for i, text in enumerate(simple_texts):
|
||||||
|
para_num = extract_paragraph_number(text)
|
||||||
|
chunk_data: SemanticChunk = {
|
||||||
|
"text": text,
|
||||||
|
"summary": f"{section_title} (partie {i+1}/{len(simple_texts)})",
|
||||||
"concepts": [],
|
"concepts": [],
|
||||||
"type": "main_content",
|
"type": "main_content",
|
||||||
"section_level": section_level,
|
"section_level": section_level,
|
||||||
}
|
}
|
||||||
if para_num is not None:
|
if para_num is not None:
|
||||||
fallback["paragraph_number"] = para_num
|
chunk_data["paragraph_number"] = para_num
|
||||||
return [fallback]
|
fallback_chunks.append(chunk_data)
|
||||||
|
|
||||||
|
logger.info(f"Fallback: section split into {len(fallback_chunks)} chunks")
|
||||||
|
return fallback_chunks
|
||||||
|
|
||||||
logger.info(f"Section '{section_title}' découpée en {len(valid_chunks)} chunks")
|
logger.info(f"Section '{section_title}' découpée en {len(valid_chunks)} chunks")
|
||||||
return valid_chunks
|
return valid_chunks
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Erreur chunking LLM: {e}")
|
logger.error(f"Erreur chunking LLM: {e}")
|
||||||
# Fallback: retourner le contenu complet
|
# Fallback: utiliser simple chunking avec overlap
|
||||||
para_num = extract_paragraph_number(content)
|
logger.warning(f"Exception LLM, fallback vers simple chunking avec overlap")
|
||||||
fallback_err: SemanticChunk = {
|
|
||||||
"text": content,
|
simple_texts = simple_chunk_with_overlap(
|
||||||
"summary": section_title,
|
content,
|
||||||
|
max_words=MAX_CHUNK_WORDS,
|
||||||
|
overlap_words=OVERLAP_WORDS
|
||||||
|
)
|
||||||
|
|
||||||
|
error_chunks: List[SemanticChunk] = []
|
||||||
|
for i, text in enumerate(simple_texts):
|
||||||
|
para_num = extract_paragraph_number(text)
|
||||||
|
chunk_data: SemanticChunk = {
|
||||||
|
"text": text,
|
||||||
|
"summary": f"{section_title} (partie {i+1}/{len(simple_texts)})",
|
||||||
"concepts": [],
|
"concepts": [],
|
||||||
"type": "main_content",
|
"type": "main_content",
|
||||||
"section_level": section_level,
|
"section_level": section_level,
|
||||||
"error": str(e),
|
"error": f"LLM failed: {str(e)}",
|
||||||
}
|
}
|
||||||
if para_num is not None:
|
if para_num is not None:
|
||||||
fallback_err["paragraph_number"] = para_num
|
chunk_data["paragraph_number"] = para_num
|
||||||
return [fallback_err]
|
error_chunks.append(chunk_data)
|
||||||
|
|
||||||
|
logger.info(f"Error fallback: section split into {len(error_chunks)} chunks")
|
||||||
|
return error_chunks
|
||||||
|
|
||||||
|
|
||||||
def simple_chunk_by_paragraphs(
|
def simple_chunk_by_paragraphs(
|
||||||
|
|||||||
232
generations/library_rag/utils/llm_chunker_improved.py
Normal file
232
generations/library_rag/utils/llm_chunker_improved.py
Normal file
@@ -0,0 +1,232 @@
|
|||||||
|
"""Improved semantic chunking with strict size limits and overlap.
|
||||||
|
|
||||||
|
This module adds strict chunk size constraints (max 1000 words) and overlap
|
||||||
|
functionality (100 words) to prevent giant chunks that exceed BGE-M3 limits.
|
||||||
|
|
||||||
|
Key improvements:
|
||||||
|
- MAX_CHUNK_WORDS = 1000 (hard limit)
|
||||||
|
- OVERLAP_WORDS = 100 (context preservation)
|
||||||
|
- Fallback to simple chunking if section > 1500 words
|
||||||
|
- Fallback to simple chunking if LLM fails
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from .llm_cleaner import clean_page_markers
|
||||||
|
|
||||||
|
logger: logging.Logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Constants
|
||||||
|
MAX_CHUNK_WORDS = 1000 # Hard limit per chunk (~2500 tokens)
|
||||||
|
OVERLAP_WORDS = 100 # Overlap between chunks for context
|
||||||
|
MIN_CHUNK_WORDS = 100 # Minimum chunk size
|
||||||
|
|
||||||
|
|
||||||
|
def simple_chunk_with_overlap(
|
||||||
|
content: str,
|
||||||
|
max_words: int = MAX_CHUNK_WORDS,
|
||||||
|
min_words: int = MIN_CHUNK_WORDS,
|
||||||
|
overlap_words: int = OVERLAP_WORDS,
|
||||||
|
) -> List[str]:
|
||||||
|
"""Split text into chunks with overlap for context preservation.
|
||||||
|
|
||||||
|
This is an improved version of simple_chunk_by_paragraphs that adds
|
||||||
|
overlap between consecutive chunks to maintain context.
|
||||||
|
|
||||||
|
Algorithm:
|
||||||
|
1. Split by paragraph boundaries (double newlines)
|
||||||
|
2. Merge small paragraphs until max_words is reached
|
||||||
|
3. Split long paragraphs at sentence boundaries
|
||||||
|
4. Add overlap_words from previous chunk to next chunk
|
||||||
|
5. Filter chunks below min_words threshold
|
||||||
|
|
||||||
|
Args:
|
||||||
|
content: Text content to split into chunks.
|
||||||
|
max_words: Maximum words per chunk. Defaults to 1000.
|
||||||
|
min_words: Minimum words per chunk. Defaults to 100.
|
||||||
|
overlap_words: Words to overlap between chunks. Defaults to 100.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of text chunks as strings with overlap.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
>>> chunks = simple_chunk_with_overlap(text, max_words=1000, overlap_words=100)
|
||||||
|
>>> # Each chunk overlaps with 100 words from previous chunk
|
||||||
|
"""
|
||||||
|
content = clean_page_markers(content)
|
||||||
|
|
||||||
|
# Split by paragraphs
|
||||||
|
paragraphs: List[str] = re.split(r'\n\n+', content)
|
||||||
|
|
||||||
|
chunks: List[str] = []
|
||||||
|
current_chunk: List[str] = []
|
||||||
|
current_words: int = 0
|
||||||
|
overlap_buffer: List[str] = [] # Store last sentences for overlap
|
||||||
|
|
||||||
|
def finalize_chunk() -> None:
|
||||||
|
"""Finalize current chunk and prepare overlap."""
|
||||||
|
nonlocal current_chunk, current_words, overlap_buffer
|
||||||
|
|
||||||
|
if not current_chunk:
|
||||||
|
return
|
||||||
|
|
||||||
|
chunk_text = '\n\n'.join(current_chunk)
|
||||||
|
chunks.append(chunk_text)
|
||||||
|
|
||||||
|
# Extract last sentences for overlap
|
||||||
|
sentences = re.split(r'(?<=[.!?])\s+', chunk_text)
|
||||||
|
overlap_buffer = []
|
||||||
|
overlap_word_count = 0
|
||||||
|
|
||||||
|
# Take last sentences until we reach overlap_words
|
||||||
|
for sentence in reversed(sentences):
|
||||||
|
sentence_words = len(sentence.split())
|
||||||
|
if overlap_word_count + sentence_words <= overlap_words:
|
||||||
|
overlap_buffer.insert(0, sentence)
|
||||||
|
overlap_word_count += sentence_words
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
|
||||||
|
current_chunk = []
|
||||||
|
current_words = 0
|
||||||
|
|
||||||
|
for para in paragraphs:
|
||||||
|
para = para.strip()
|
||||||
|
if not para:
|
||||||
|
continue
|
||||||
|
|
||||||
|
para_words: int = len(para.split())
|
||||||
|
|
||||||
|
# If paragraph is too long, split by sentences
|
||||||
|
if para_words > max_words:
|
||||||
|
# Finalize current chunk first
|
||||||
|
if current_chunk:
|
||||||
|
finalize_chunk()
|
||||||
|
|
||||||
|
# Add overlap if exists
|
||||||
|
if overlap_buffer and chunks:
|
||||||
|
current_chunk.extend(overlap_buffer)
|
||||||
|
current_words = sum(len(s.split()) for s in overlap_buffer)
|
||||||
|
|
||||||
|
# Split long paragraph by sentences
|
||||||
|
sentences: List[str] = re.split(r'(?<=[.!?])\s+', para)
|
||||||
|
for sentence in sentences:
|
||||||
|
sentence_words: int = len(sentence.split())
|
||||||
|
|
||||||
|
if current_words + sentence_words > max_words and current_chunk:
|
||||||
|
finalize_chunk()
|
||||||
|
|
||||||
|
# Add overlap
|
||||||
|
if overlap_buffer:
|
||||||
|
current_chunk.extend(overlap_buffer)
|
||||||
|
current_words = sum(len(s.split()) for s in overlap_buffer)
|
||||||
|
|
||||||
|
current_chunk.append(sentence)
|
||||||
|
current_words += sentence_words
|
||||||
|
else:
|
||||||
|
current_chunk.append(sentence)
|
||||||
|
current_words += sentence_words
|
||||||
|
|
||||||
|
# If adding paragraph exceeds limit
|
||||||
|
elif current_words + para_words > max_words:
|
||||||
|
if current_chunk:
|
||||||
|
finalize_chunk()
|
||||||
|
|
||||||
|
# Add overlap
|
||||||
|
if overlap_buffer and chunks:
|
||||||
|
current_chunk.extend(overlap_buffer)
|
||||||
|
current_words = sum(len(s.split()) for s in overlap_buffer)
|
||||||
|
|
||||||
|
current_chunk.append(para)
|
||||||
|
current_words += para_words
|
||||||
|
|
||||||
|
else:
|
||||||
|
current_chunk.append(para)
|
||||||
|
current_words += para_words
|
||||||
|
|
||||||
|
# Last chunk
|
||||||
|
if current_chunk:
|
||||||
|
chunk_text = '\n\n'.join(current_chunk)
|
||||||
|
chunks.append(chunk_text)
|
||||||
|
|
||||||
|
# Filter chunks that are too short (unless it's the only chunk)
|
||||||
|
if len(chunks) > 1:
|
||||||
|
chunks = [c for c in chunks if len(c.split()) >= min_words]
|
||||||
|
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def get_chunk_text_with_context(
|
||||||
|
chunks: List[str],
|
||||||
|
index: int,
|
||||||
|
context_words: int = 50
|
||||||
|
) -> tuple[str, str, str]:
|
||||||
|
"""Get chunk with before/after context for better LLM processing.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
chunks: List of chunk texts.
|
||||||
|
index: Index of the chunk to process.
|
||||||
|
context_words: Words of context to include from adjacent chunks.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (before_context, chunk_text, after_context).
|
||||||
|
"""
|
||||||
|
chunk = chunks[index]
|
||||||
|
|
||||||
|
before_context = ""
|
||||||
|
if index > 0:
|
||||||
|
prev_chunk = chunks[index - 1]
|
||||||
|
words = prev_chunk.split()
|
||||||
|
before_context = " ".join(words[-context_words:]) if len(words) > context_words else prev_chunk
|
||||||
|
|
||||||
|
after_context = ""
|
||||||
|
if index < len(chunks) - 1:
|
||||||
|
next_chunk = chunks[index + 1]
|
||||||
|
words = next_chunk.split()
|
||||||
|
after_context = " ".join(words[:context_words]) if len(words) > context_words else next_chunk
|
||||||
|
|
||||||
|
return before_context, chunk, after_context
|
||||||
|
|
||||||
|
|
||||||
|
def estimate_tokens(text: str) -> int:
|
||||||
|
"""Estimate token count from text.
|
||||||
|
|
||||||
|
Uses approximation of 1 token ≈ 4 characters.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: Text to estimate.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Estimated token count.
|
||||||
|
"""
|
||||||
|
return len(text) // 4
|
||||||
|
|
||||||
|
|
||||||
|
def validate_chunk_size(text: str, max_tokens: int = 2500) -> bool:
|
||||||
|
"""Validate that chunk size is within acceptable limits.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: Chunk text to validate.
|
||||||
|
max_tokens: Maximum allowed tokens (default 2500 for safety margin below BGE-M3's 8192).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if chunk is valid size, False otherwise.
|
||||||
|
"""
|
||||||
|
tokens = estimate_tokens(text)
|
||||||
|
return tokens <= max_tokens
|
||||||
|
|
||||||
|
|
||||||
|
# Export key functions
|
||||||
|
__all__ = [
|
||||||
|
'simple_chunk_with_overlap',
|
||||||
|
'get_chunk_text_with_context',
|
||||||
|
'estimate_tokens',
|
||||||
|
'validate_chunk_size',
|
||||||
|
'MAX_CHUNK_WORDS',
|
||||||
|
'OVERLAP_WORDS',
|
||||||
|
'MIN_CHUNK_WORDS',
|
||||||
|
]
|
||||||
Reference in New Issue
Block a user