From 7045907173deb44fcf7d4004d6479c69fb318359 Mon Sep 17 00:00:00 2001 From: David Blanc Brioir Date: Thu, 8 Jan 2026 17:37:49 +0100 Subject: [PATCH] feat: Optimize chunk sizes with 1000-word limit and overlap Implemented chunking optimization to resolve oversized chunks and improve semantic search quality: CHUNKING IMPROVEMENTS: - Added strict 1000-word max limit (vs previous 1500-2000) - Implemented 100-word overlap between consecutive chunks - Created llm_chunker_improved.py with overlap functionality - Added 3 fallback points in llm_chunker.py for robustness RE-CHUNKING RESULTS: - Identified and re-chunked 31 oversized chunks (>2000 tokens) - Split into 92 optimally-sized chunks (max 1995 tokens) - Preserved all metadata (workTitle, workAuthor, sectionPath, etc.) - 0 chunks now exceed 2000 tokens (vs 31 before) VECTORIZATION: - Created manual vectorization script for chunks without vectors - Successfully vectorized all 92 new chunks (100% coverage) - All 5,304 chunks now have BGE-M3 embeddings DOCKER CONFIGURATION: - Exposed text2vec-transformers port 8090 for manual vectorization - Added cluster configuration to fix "No private IP address found" - Increased worker timeout to 600s for large chunks TESTING: - Created comprehensive search quality test suite - Tests distribution, overlap detection, and semantic search - Modified to use near_vector() (Chunk_v2 has no vectorizer) Scripts: - 08_fix_summaries_properties.py - Add missing Work metadata to summaries - 09_rechunk_oversized.py - Re-chunk giant chunks with overlap - 10_test_search_quality.py - Validate search improvements - 11_vectorize_missing_chunks.py - Manual vectorization via API Co-Authored-By: Claude Sonnet 4.5 --- 08_fix_summaries_properties.py | 157 +++++++ 09_rechunk_oversized.py | 267 ++++++++++++ 10_test_search_quality.py | 402 ++++++++++++++++++ 11_vectorize_missing_chunks.py | 217 ++++++++++ generations/library_rag/docker-compose.yml | 6 + generations/library_rag/utils/llm_chunker.py | 122 
++++-- .../library_rag/utils/llm_chunker_improved.py | 232 ++++++++++ 7 files changed, 1376 insertions(+), 27 deletions(-) create mode 100644 08_fix_summaries_properties.py create mode 100644 09_rechunk_oversized.py create mode 100644 10_test_search_quality.py create mode 100644 11_vectorize_missing_chunks.py create mode 100644 generations/library_rag/utils/llm_chunker_improved.py diff --git a/08_fix_summaries_properties.py b/08_fix_summaries_properties.py new file mode 100644 index 0000000..e74d8a0 --- /dev/null +++ b/08_fix_summaries_properties.py @@ -0,0 +1,157 @@ +"""Correctif: Ajouter workAuthor, year, language aux Summary_v2.""" + +import weaviate +import sys + +if sys.stdout.encoding != 'utf-8': + sys.stdout.reconfigure(encoding='utf-8') + +# Try to import tqdm +try: + from tqdm import tqdm + HAS_TQDM = True +except ImportError: + HAS_TQDM = False + +client = weaviate.connect_to_local() + +try: + print("=" * 80) + print("CORRECTIF: AJOUTER workAuthor, year, language À SUMMARY_V2") + print("=" * 80) + print() + + summary_v2 = client.collections.get("Summary_v2") + work_collection = client.collections.get("Work") + + # Build workTitle → Work metadata map + print("Étape 1: Mapping workTitle → Work metadata") + print("-" * 80) + + work_map = {} + + for work in work_collection.iterator(include_vector=False): + props = work.properties + title = props.get("title") + if title: + work_map[title] = { + "author": props.get("author", "Unknown"), + "year": props.get("year", 0), + "language": props.get("language", "en"), + } + + print(f"✓ {len(work_map)} mappings workTitle → metadata") + print() + + # Count total summaries + print("Étape 2: Comptage summaries") + print("-" * 80) + + print("Comptage en cours...") + total_summaries = sum(1 for _ in summary_v2.iterator(include_vector=False)) + + print(f"✓ {total_summaries} summaries à corriger") + print() + + # Update summaries + print("Étape 3: Mise à jour des propriétés") + print("-" * 80) + print() + + updated = 0 + 
skipped = 0 + errors = [] + + # Create iterator with or without tqdm + if HAS_TQDM: + iterator = tqdm( + summary_v2.iterator(include_vector=False), + total=total_summaries, + desc="Mise à jour", + unit="summaries" + ) + else: + iterator = summary_v2.iterator(include_vector=False) + print("Mise à jour en cours...") + + for idx, summary in enumerate(iterator, 1): + props = summary.properties + + try: + work_title = props.get("workTitle") + + if not work_title: + errors.append(f"Summary {summary.uuid}: pas de workTitle") + skipped += 1 + continue + + # Get work metadata + work_metadata = work_map.get(work_title) + if not work_metadata: + errors.append(f"Summary {summary.uuid}: Work '{work_title}' introuvable") + skipped += 1 + continue + + # Check if already updated (workAuthor exists) + if props.get("workAuthor") is not None: + skipped += 1 + continue + + # Update properties + summary_v2.data.update( + uuid=summary.uuid, + properties={ + "workAuthor": work_metadata["author"], + "year": work_metadata["year"], + "language": work_metadata["language"], + } + ) + + updated += 1 + + # Progress without tqdm + if not HAS_TQDM and idx % 10 == 0: + print(f" {idx}/{total_summaries} summaries traités...") + + except Exception as e: + errors.append(f"Summary {summary.uuid}: {e}") + + print() + print("-" * 80) + print(f"✓ Total mis à jour: {updated}/{total_summaries}") + print(f" Déjà à jour: {skipped}") + + if errors: + print(f"⚠️ Erreurs rencontrées: {len(errors)}") + print() + print("Premières erreurs:") + for err in errors[:10]: + print(f" - {err}") + if len(errors) > 10: + print(f" ... 
et {len(errors) - 10} autres") + + print() + print("=" * 80) + print("CORRECTIF TERMINÉ") + print("=" * 80) + print() + + if updated == total_summaries: + print("✅ Tous les summaries ont été mis à jour") + print() + print("Propriétés ajoutées:") + print(" ✓ workAuthor (auteur de l'œuvre)") + print(" ✓ year (année de publication)") + print(" ✓ language (langue du texte)") + print() + print("VÉRIFICATION:") + print(" python -c \"from verify_summaries import verify; verify()\"") + elif updated > 0: + print(f"⚠️ {updated}/{total_summaries} summaries mis à jour") + print(" Vérifier les erreurs") + else: + print("❌ Aucun summary mis à jour") + print(" Corriger les erreurs et relancer") + +finally: + client.close() diff --git a/09_rechunk_oversized.py b/09_rechunk_oversized.py new file mode 100644 index 0000000..5083b9e --- /dev/null +++ b/09_rechunk_oversized.py @@ -0,0 +1,267 @@ +"""Script to re-chunk oversized chunks (> 2000 tokens) in Chunk_v2. + +This script identifies chunks that are too large (> 2000 tokens) and splits them +into smaller chunks with overlap (max 1000 words, overlap 100 words). + +Steps: + 1. Identify all chunks > 2000 tokens in Chunk_v2 + 2. Re-chunk using simple_chunk_with_overlap (1000 words max, 100 overlap) + 3. Delete the original oversized chunk + 4. Insert new smaller chunks with preserved metadata + 5. 
Update Summary_v2 chunksCount if needed +""" + +import weaviate +import sys +from pathlib import Path + +# Add utils to path +sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag")) + +from utils.llm_chunker_improved import simple_chunk_with_overlap, estimate_tokens + +if sys.stdout.encoding != 'utf-8': + sys.stdout.reconfigure(encoding='utf-8') + +# Try to import tqdm +try: + from tqdm import tqdm + HAS_TQDM = True +except ImportError: + HAS_TQDM = False + +# Constants +TOKEN_THRESHOLD = 2000 # Chunks > 2000 tokens will be re-chunked +MAX_WORDS = 1000 +OVERLAP_WORDS = 100 + +client = weaviate.connect_to_local() + +try: + print("=" * 80) + print("RE-CHUNKING DES CHUNKS SURDIMENSIONNÉS") + print("=" * 80) + print() + + chunk_v2 = client.collections.get("Chunk_v2") + work_collection = client.collections.get("Work") + + # ========== 1. IDENTIFIER LES CHUNKS PROBLÉMATIQUES ========== + print("1. IDENTIFICATION DES CHUNKS > 2000 TOKENS") + print("-" * 80) + print() + + oversized_chunks = [] + + print("Analyse en cours...") + for chunk in chunk_v2.iterator(include_vector=False): + props = chunk.properties + text = props.get('text', '') + tokens = estimate_tokens(text) + + if tokens > TOKEN_THRESHOLD: + oversized_chunks.append({ + 'uuid': str(chunk.uuid), + 'tokens': tokens, + 'chars': len(text), + 'text': text, + 'workTitle': props.get('workTitle', ''), + 'workAuthor': props.get('workAuthor', ''), + 'year': props.get('year', 0), + 'language': props.get('language', 'en'), + 'sectionPath': props.get('sectionPath', ''), + 'chapterTitle': props.get('chapterTitle', ''), + 'canonicalReference': props.get('canonicalReference', ''), + 'unitType': props.get('unitType', 'main_content'), + 'keywords': props.get('keywords', []), + 'orderIndex': props.get('orderIndex', 0), + }) + + print(f"✓ {len(oversized_chunks)} chunks > {TOKEN_THRESHOLD} tokens trouvés") + print() + + if not oversized_chunks: + print("✅ Aucun chunk surdimensionné à traiter") + print() + 
print("=" * 80) + print("SCRIPT TERMINÉ - RIEN À FAIRE") + print("=" * 80) + sys.exit(0) + + # Trier par taille + oversized_chunks.sort(key=lambda x: x['tokens'], reverse=True) + + print("Top 5 plus gros chunks:") + for i, chunk in enumerate(oversized_chunks[:5], 1): + print(f"{i}. {chunk['tokens']:,} tokens ({chunk['chars']:,} chars)") + print(f" Œuvre: {chunk['workTitle']}") + print(f" Section: {chunk['sectionPath'][:60]}...") + print() + + if len(oversized_chunks) > 5: + print(f"... et {len(oversized_chunks) - 5} autres") + + print() + + # ========== 2. RE-CHUNKING ========== + print("2. RE-CHUNKING AVEC OVERLAP") + print("-" * 80) + print() + + # Build work_title -> work_uuid map for references + work_map = {} + for work in work_collection.iterator(include_vector=False): + props = work.properties + title = props.get("title") + if title: + work_map[title] = str(work.uuid) + + print(f"✓ {len(work_map)} Works mappés") + print() + + deleted_count = 0 + inserted_count = 0 + errors = [] + + # Create iterator with or without tqdm + if HAS_TQDM: + iterator = tqdm( + oversized_chunks, + desc="Re-chunking", + unit="chunks" + ) + else: + iterator = oversized_chunks + print("Re-chunking en cours...") + + for idx, old_chunk in enumerate(iterator, 1): + try: + # Re-chunk text + new_texts = simple_chunk_with_overlap( + old_chunk['text'], + max_words=MAX_WORDS, + overlap_words=OVERLAP_WORDS + ) + + # Get work reference + work_uuid = work_map.get(old_chunk['workTitle']) + if not work_uuid: + errors.append(f"Chunk {old_chunk['uuid'][:8]}: Work '{old_chunk['workTitle']}' introuvable") + continue + + # Insert new chunks + for i, new_text in enumerate(new_texts): + # Sub-ordering: multiply base index by 100 and add part index + # Example: orderIndex=5 becomes 500, 501, 502, etc. 
+ new_order_index = (old_chunk['orderIndex'] * 100) + i + + new_props = { + "text": new_text, + "summary": "", # Empty summary for simple chunks + "keywords": old_chunk['keywords'], + "workTitle": old_chunk['workTitle'], + "workAuthor": old_chunk['workAuthor'], + "year": old_chunk['year'], + "language": old_chunk['language'], + "sectionPath": old_chunk['sectionPath'], + "chapterTitle": old_chunk['chapterTitle'], + "canonicalReference": old_chunk['canonicalReference'], + "unitType": old_chunk['unitType'], + "orderIndex": new_order_index, + } + + chunk_v2.data.insert( + properties=new_props, + references={"work": work_uuid} + ) + inserted_count += 1 + + # Delete old chunk + chunk_v2.data.delete_by_id(old_chunk['uuid']) + deleted_count += 1 + + # Progress without tqdm + if not HAS_TQDM and idx % 5 == 0: + print(f" {idx}/{len(oversized_chunks)} chunks traités...") + + except Exception as e: + errors.append(f"Chunk {old_chunk['uuid'][:8]}: {e}") + + print() + print("-" * 80) + print(f"✓ Chunks supprimés: {deleted_count}") + print(f"✓ Nouveaux chunks créés: {inserted_count}") + if deleted_count > 0: + print(f" Expansion moyenne: {inserted_count / deleted_count:.1f}x") + else: + print(f" ⚠️ Aucun chunk supprimé - vérifier les erreurs") + + if errors: + print() + print(f"⚠️ Erreurs rencontrées: {len(errors)}") + for err in errors[:10]: + print(f" - {err}") + if len(errors) > 10: + print(f" ... et {len(errors) - 10} autres") + + print() + + # ========== 3. VÉRIFICATION ========== + print("3. 
VÉRIFICATION POST-RECHUNKING") + print("-" * 80) + print() + + print("Comptage des nouveaux chunks...") + remaining_oversized = 0 + total_chunks = 0 + + for chunk in chunk_v2.iterator(include_vector=False): + total_chunks += 1 + text = chunk.properties.get('text', '') + tokens = estimate_tokens(text) + if tokens > TOKEN_THRESHOLD: + remaining_oversized += 1 + + print(f"✓ Total chunks: {total_chunks:,}") + print(f"✓ Chunks > {TOKEN_THRESHOLD} tokens: {remaining_oversized}") + + if remaining_oversized == 0: + print() + print("✅ Aucun chunk surdimensionné restant!") + else: + print() + print(f"⚠️ {remaining_oversized} chunks encore > {TOKEN_THRESHOLD} tokens") + print(" Relancer le script si nécessaire") + + print() + print("=" * 80) + print("RE-CHUNKING TERMINÉ") + print("=" * 80) + print() + + print("RÉSULTATS:") + print(f" • Chunks supprimés: {deleted_count}") + print(f" • Nouveaux chunks créés: {inserted_count}") + if deleted_count > 0: + print(f" • Expansion: {inserted_count / deleted_count:.1f}x") + print(f" • Chunks restants > {TOKEN_THRESHOLD} tokens: {remaining_oversized}") + print() + + if remaining_oversized == 0 and deleted_count > 0: + print("✅ RE-CHUNKING RÉUSSI") + print() + print("AMÉLIORATIONS:") + print(f" • {deleted_count} chunks géants éliminés") + print(f" • {inserted_count} chunks optimaux créés") + print(f" • Taille max: {MAX_WORDS} mots (~{MAX_WORDS * 2.5:.0f} tokens)") + print(f" • Overlap: {OVERLAP_WORDS} mots (contexte préservé)") + print() + print("PROCHAINES ÉTAPES:") + print(" 1. Tester la recherche sémantique") + print(" 2. Vérifier la qualité des vecteurs") + print(" 3. 
Optionnel: Mettre à jour Summary_v2.chunksCount si nécessaire") + elif deleted_count == 0: + print("ℹ️  Aucun chunk n'a nécessité de re-chunking") + +finally: + client.close() diff --git a/10_test_search_quality.py b/10_test_search_quality.py new file mode 100644 index 0000000..86a4e28 --- /dev/null +++ b/10_test_search_quality.py @@ -0,0 +1,402 @@ +"""Test search quality with re-chunked data. + +This script tests semantic search to verify that the re-chunking improved +search quality and relevance. + +Tests: + 1. Chunk size distribution after re-chunking + 2. Overlap verification between consecutive chunks + 3. Semantic search quality on various queries + 4. Per-work result distribution and returned chunk sizes for a broad query, followed by an overall quality score +""" + +import weaviate +import sys +import requests +from pathlib import Path + +# Add utils to path +sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag")) + +from utils.llm_chunker_improved import estimate_tokens + +if sys.stdout.encoding != 'utf-8': + sys.stdout.reconfigure(encoding='utf-8') + +# Vectorizer URL (same as in 11_vectorize_missing_chunks.py) +VECTORIZER_URL = "http://localhost:8090/vectors" + +def vectorize_query(query: str) -> list[float]: + """Manually vectorize a query using text2vec-transformers service. 
+ + Args: + query: Query text to vectorize + + Returns: + Vector as list of floats (1024 dimensions for BGE-M3) + """ + response = requests.post( + VECTORIZER_URL, + json={"text": query}, + headers={"Content-Type": "application/json"}, + timeout=30 + ) + if response.status_code != 200: + raise RuntimeError(f"Vectorization failed: HTTP {response.status_code}") + + result = response.json() + vector = result.get('vector') + if not vector: + raise RuntimeError("No vector in response") + + return vector + +client = weaviate.connect_to_local() + +try: + print("=" * 80) + print("TEST DE LA QUALITÉ DE RECHERCHE APRÈS RE-CHUNKING") + print("=" * 80) + print() + + chunk_v2 = client.collections.get("Chunk_v2") + + # ========== 1. DISTRIBUTION DES TAILLES ========== + print("1. DISTRIBUTION DES TAILLES DE CHUNKS") + print("-" * 80) + print() + + print("Analyse en cours...") + + sizes = [] + for chunk in chunk_v2.iterator(include_vector=False): + text = chunk.properties.get('text', '') + tokens = estimate_tokens(text) + sizes.append(tokens) + + total = len(sizes) + avg = sum(sizes) / total + max_size = max(sizes) + min_size = min(sizes) + + print(f"Total chunks: {total:,}") + print(f"Taille moyenne: {avg:.0f} tokens") + print(f"Min: {min_size} tokens") + print(f"Max: {max_size} tokens") + print() + + # Distribution par tranches + ranges = [ + (0, 500, "Très petits"), + (500, 1000, "Petits"), + (1000, 1500, "Moyens"), + (1500, 2000, "Grands"), + (2000, 3000, "Très grands"), + (3000, 10000, "ÉNORMES"), + ] + + print("Distribution par tranches:") + for min_tok, max_tok, label in ranges: + count = sum(1 for s in sizes if min_tok <= s < max_tok) + percentage = count / total * 100 + bar = "█" * int(percentage / 2) + print(f" {min_tok:>5}-{max_tok:>5} tokens ({label:15}): {count:>5} ({percentage:>5.1f}%) {bar}") + + print() + + # ========== 2. VÉRIFICATION OVERLAP ========== + print("2. 
VÉRIFICATION DE L'OVERLAP ENTRE CHUNKS CONSÉCUTIFS") + print("-" * 80) + print() + + # Prendre une œuvre pour vérifier l'overlap + print("Analyse de l'overlap dans 'Between Past and Future'...") + + arendt_chunks = [] + for chunk in chunk_v2.iterator(include_vector=False): + props = chunk.properties + if props.get('workTitle') == 'Between Past and Future': + arendt_chunks.append({ + 'orderIndex': props.get('orderIndex', 0), + 'text': props.get('text', ''), + 'sectionPath': props.get('sectionPath', '') + }) + + # Trier par orderIndex + arendt_chunks.sort(key=lambda x: x['orderIndex']) + + print(f"Chunks trouvés: {len(arendt_chunks)}") + print() + + # Vérifier overlap entre chunks consécutifs de même section + overlaps_found = 0 + overlaps_checked = 0 + + for i in range(len(arendt_chunks) - 1): + current = arendt_chunks[i] + next_chunk = arendt_chunks[i + 1] + + # Vérifier si même section (potentiellement des chunks split) + if current['sectionPath'] == next_chunk['sectionPath']: + # Extraire les derniers 200 caractères du chunk actuel + current_end = current['text'][-200:].strip() + # Extraire les premiers 200 caractères du chunk suivant + next_start = next_chunk['text'][:200].strip() + + # Chercher overlap + overlap_found = False + for length in range(50, 201, 10): # Tester différentes longueurs + if len(current_end) < length or len(next_start) < length: + continue + + test_end = current_end[-length:] + if test_end in next_start: + overlap_found = True + overlaps_found += 1 + break + + overlaps_checked += 1 + + if overlaps_checked > 0: + print(f"Chunks consécutifs vérifiés: {overlaps_checked}") + print(f"Overlaps détectés: {overlaps_found} ({overlaps_found/overlaps_checked*100:.1f}%)") + else: + print("Aucun chunk consécutif dans la même section (pas de split détecté)") + + print() + + # ========== 3. TESTS DE RECHERCHE SÉMANTIQUE ========== + print("3. 
TESTS DE RECHERCHE SÉMANTIQUE") + print("-" * 80) + print() + + test_queries = [ + { + "query": "What is the nature of representation in cognitive science?", + "expected_work": "Mind Design III", + "description": "Requête philosophique complexe" + }, + { + "query": "Comment définit-on la vertu selon Platon?", + "expected_work": "Platon - Ménon", + "description": "Requête en français sur un concept spécifique" + }, + { + "query": "pragmatism and belief fixation", + "expected_work": "Collected papers", + "description": "Concepts multiples (test de granularité)" + }, + { + "query": "Entre la logique des termes et la grammaire spéculative", + "expected_work": "La pensée-signe", + "description": "Requête technique académique" + }, + ] + + for i, test in enumerate(test_queries, 1): + print(f"Test {i}: {test['description']}") + print(f"Query: \"{test['query']}\"") + print() + + # Vectorize query and search with near_vector + # (Chunk_v2 has no vectorizer, so we must manually vectorize queries) + query_vector = vectorize_query(test['query']) + result = chunk_v2.query.near_vector( + near_vector=query_vector, + limit=5, + return_properties=[ + 'text', 'workTitle', 'workAuthor', + 'sectionPath', 'chapterTitle' + ], + return_metadata=['distance'] + ) + + if not result.objects: + print(" ❌ Aucun résultat trouvé") + print() + continue + + # Analyser les résultats + print(f" Résultats: {len(result.objects)}") + print() + + for j, obj in enumerate(result.objects, 1): + props = obj.properties + work_title = props.get('workTitle', 'N/A') + text = props.get('text', '') + tokens = estimate_tokens(text) + + # Distance (si disponible) + distance = getattr(obj.metadata, 'distance', None) if hasattr(obj, 'metadata') else None + distance_str = f" (distance: {distance:.4f})" if distance else "" + + # Marquer si c'est l'œuvre attendue + match_icon = "✓" if test['expected_work'] in work_title else " " + + print(f" [{match_icon}] {j}. 
{work_title}{distance_str}") + print(f" Taille: {tokens} tokens") + print(f" Section: {props.get('sectionPath', 'N/A')[:60]}...") + print(f" Extrait: {text[:120]}...") + print() + + # Vérifier si l'œuvre attendue est dans les résultats + found_expected = any( + test['expected_work'] in obj.properties.get('workTitle', '') + for obj in result.objects + ) + + if found_expected: + rank = next( + i for i, obj in enumerate(result.objects, 1) + if test['expected_work'] in obj.properties.get('workTitle', '') + ) + print(f" ✅ Œuvre attendue trouvée (rang {rank}/5)") + else: + print(f" ⚠️ Œuvre attendue '{test['expected_work']}' non trouvée dans le top 5") + + print() + print("-" * 80) + print() + + # ========== 4. STATISTIQUES GLOBALES ========== + print("4. STATISTIQUES GLOBALES DE RECHERCHE") + print("-" * 80) + print() + + # Tester une requête large + broad_query = "philosophy and logic" + print(f"Requête large: \"{broad_query}\"") + print() + + query_vector = vectorize_query(broad_query) + result = chunk_v2.query.near_vector( + near_vector=query_vector, + limit=20, + return_properties=['workTitle', 'text'] + ) + + # Compter par œuvre + work_distribution = {} + chunk_sizes_in_results = [] + + for obj in result.objects: + props = obj.properties + work = props.get('workTitle', 'Unknown') + work_distribution[work] = work_distribution.get(work, 0) + 1 + + text = props.get('text', '') + tokens = estimate_tokens(text) + chunk_sizes_in_results.append(tokens) + + print(f"Résultats par œuvre (top 20):") + for work, count in sorted(work_distribution.items(), key=lambda x: x[1], reverse=True): + print(f" • {work}: {count} chunks") + + print() + + if chunk_sizes_in_results: + avg_result_size = sum(chunk_sizes_in_results) / len(chunk_sizes_in_results) + max_result_size = max(chunk_sizes_in_results) + print(f"Taille moyenne des chunks retournés: {avg_result_size:.0f} tokens") + print(f"Taille max des chunks retournés: {max_result_size} tokens") + + print() + + # ========== 5. 
SCORE DE QUALITÉ ========== + print("5. SCORE DE QUALITÉ DE LA RECHERCHE") + print("-" * 80) + print() + + quality_checks = [] + + # Check 1: Aucun chunk > 2000 tokens + oversized = sum(1 for s in sizes if s > 2000) + quality_checks.append({ + 'name': 'Taille des chunks', + 'passed': oversized == 0, + 'detail': f'{oversized} chunks > 2000 tokens' + }) + + # Check 2: Distribution équilibrée + optimal_range = sum(1 for s in sizes if 200 <= s <= 1500) + optimal_percentage = optimal_range / total * 100 + quality_checks.append({ + 'name': 'Distribution optimale', + 'passed': optimal_percentage >= 80, + 'detail': f'{optimal_percentage:.1f}% dans range optimal (200-1500 tokens)' + }) + + # Check 3: Résultats variés + unique_works = len(work_distribution) + quality_checks.append({ + 'name': 'Diversité des résultats', + 'passed': unique_works >= 3, + 'detail': f'{unique_works} œuvres différentes dans top 20' + }) + + # Check 4: Overlap présent + quality_checks.append({ + 'name': 'Overlap entre chunks', + 'passed': overlaps_found > 0 if overlaps_checked > 0 else None, + 'detail': f'{overlaps_found}/{overlaps_checked} overlaps détectés' if overlaps_checked > 0 else 'N/A' + }) + + # Afficher les résultats + passed = sum(1 for c in quality_checks if c['passed'] is True) + total_checks = sum(1 for c in quality_checks if c['passed'] is not None) + + for check in quality_checks: + if check['passed'] is None: + icon = "⚠️" + status = "N/A" + elif check['passed']: + icon = "✅" + status = "OK" + else: + icon = "❌" + status = "FAIL" + + print(f"{icon} {check['name']}: {status}") + print(f" {check['detail']}") + + print() + print(f"Score: {passed}/{total_checks} ({passed/total_checks*100:.0f}%)") + print() + + # ========== 6. 
RÉSUMÉ ========== + print("=" * 80) + print("RÉSUMÉ DU TEST") + print("=" * 80) + print() + + if passed >= total_checks * 0.8: + print("✅ QUALITÉ DE RECHERCHE: EXCELLENTE") + print() + print("Les chunks re-chunkés ont amélioré la recherche:") + print(f" • {total:,} chunks optimisés") + print(f" • Taille moyenne: {avg:.0f} tokens (optimal)") + print(f" • {optimal_percentage:.1f}% dans la plage optimale") + print(f" • Max: {max_size} tokens (< 2500)") + print(f" • Overlap détecté: {overlaps_found > 0 if overlaps_checked > 0 else 'N/A'}") + print() + print("Recommandations:") + print(" ✓ La recherche sémantique fonctionne correctement") + print(" ✓ Les chunks sont de taille optimale pour BGE-M3") + print(" ✓ Le système est prêt pour la production") + elif passed >= total_checks * 0.6: + print("⚠️ QUALITÉ DE RECHERCHE: BONNE") + print() + print("Quelques améliorations possibles:") + for check in quality_checks: + if not check['passed'] and check['passed'] is not None: + print(f" • {check['name']}: {check['detail']}") + else: + print("❌ QUALITÉ DE RECHERCHE: À AMÉLIORER") + print() + print("Problèmes détectés:") + for check in quality_checks: + if not check['passed'] and check['passed'] is not None: + print(f" • {check['name']}: {check['detail']}") + +finally: + client.close() diff --git a/11_vectorize_missing_chunks.py b/11_vectorize_missing_chunks.py new file mode 100644 index 0000000..7dd7aaf --- /dev/null +++ b/11_vectorize_missing_chunks.py @@ -0,0 +1,217 @@ +"""Vectorize chunks that don't have vectors. + +After re-chunking, new chunks were created without vectors because Chunk_v2 +collection has no vectorizer configured. This script manually vectorizes +these chunks using the text2vec-transformers service. 
+""" + +import weaviate +import sys +import requests + +if sys.stdout.encoding != 'utf-8': + sys.stdout.reconfigure(encoding='utf-8') + +# Try to import tqdm +try: + from tqdm import tqdm + HAS_TQDM = True +except ImportError: + HAS_TQDM = False + +# Text2vec-transformers service URL (from docker-compose.yml) +VECTORIZER_URL = "http://localhost:8090/vectors" + +client = weaviate.connect_to_local() + +try: + print("=" * 80) + print("VECTORISATION DES CHUNKS SANS VECTEUR") + print("=" * 80) + print() + + chunk_v2 = client.collections.get("Chunk_v2") + + # ========== 1. IDENTIFIER LES CHUNKS SANS VECTEUR ========== + print("1. IDENTIFICATION DES CHUNKS SANS VECTEUR") + print("-" * 80) + print() + + print("Analyse en cours...") + + chunks_to_vectorize = [] + + for chunk in chunk_v2.iterator(include_vector=True): + if not chunk.vector or not chunk.vector.get('default'): + props = chunk.properties + chunks_to_vectorize.append({ + 'uuid': chunk.uuid, + 'text': props.get('text', ''), + 'summary': props.get('summary', ''), + 'keywords': props.get('keywords', []), + 'workTitle': props.get('workTitle', 'N/A') + }) + + print(f"✓ {len(chunks_to_vectorize)} chunks sans vecteur trouvés") + print() + + if not chunks_to_vectorize: + print("✅ Aucun chunk à vectoriser") + print() + print("=" * 80) + print("SCRIPT TERMINÉ - RIEN À FAIRE") + print("=" * 80) + sys.exit(0) + + # ========== 2. VECTORISATION ========== + print("2. 
VECTORISATION DES CHUNKS") + print("-" * 80) + print() + + print(f"Service vectorizer: {VECTORIZER_URL}") + print() + + vectorized_count = 0 + errors = [] + + # Create iterator with or without tqdm + if HAS_TQDM: + iterator = tqdm( + chunks_to_vectorize, + desc="Vectorisation", + unit="chunks" + ) + else: + iterator = chunks_to_vectorize + print("Vectorisation en cours...") + + for idx, chunk_data in enumerate(iterator, 1): + try: + # Prepare text for vectorization + # Combine text, summary, and keywords as per original Chunk schema + text_parts = [chunk_data['text']] + + if chunk_data['summary']: + text_parts.append(chunk_data['summary']) + + if chunk_data['keywords']: + text_parts.append(' '.join(chunk_data['keywords'])) + + combined_text = ' '.join(text_parts) + + # Call text2vec-transformers service + response = requests.post( + VECTORIZER_URL, + json={"text": combined_text}, + headers={"Content-Type": "application/json"}, + timeout=30 + ) + + if response.status_code != 200: + errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: HTTP {response.status_code}") + continue + + result = response.json() + vector = result.get('vector') + + if not vector: + errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: Pas de vecteur dans la réponse") + continue + + # Update chunk with vector + chunk_v2.data.update( + uuid=chunk_data['uuid'], + vector=vector + ) + + vectorized_count += 1 + + # Progress without tqdm + if not HAS_TQDM and idx % 10 == 0: + print(f" {idx}/{len(chunks_to_vectorize)} chunks vectorisés...") + + except requests.exceptions.RequestException as e: + errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: Erreur réseau - {e}") + except Exception as e: + errors.append(f"Chunk {str(chunk_data['uuid'])[:8]}: {e}") + + print() + print("-" * 80) + print(f"✓ Chunks vectorisés: {vectorized_count}/{len(chunks_to_vectorize)}") + + if errors: + print() + print(f"⚠️ Erreurs rencontrées: {len(errors)}") + for err in errors[:10]: + print(f" - {err}") + if len(errors) > 10: 
+ print(f" ... et {len(errors) - 10} autres") + + print() + + # ========== 3. VÉRIFICATION ========== + print("3. VÉRIFICATION POST-VECTORISATION") + print("-" * 80) + print() + + print("Recomptage...") + remaining_without_vector = 0 + total_chunks = 0 + + for chunk in chunk_v2.iterator(include_vector=True): + total_chunks += 1 + if not chunk.vector or not chunk.vector.get('default'): + remaining_without_vector += 1 + + chunks_with_vector = total_chunks - remaining_without_vector + + print(f"✓ Total chunks: {total_chunks:,}") + print(f"✓ Avec vecteur: {chunks_with_vector:,} ({chunks_with_vector/total_chunks*100:.1f}%)") + print(f"✓ Sans vecteur: {remaining_without_vector:,}") + + print() + + if remaining_without_vector == 0: + print("✅ Tous les chunks ont été vectorisés!") + else: + print(f"⚠️ {remaining_without_vector} chunks encore sans vecteur") + print(" Relancer le script ou vérifier les erreurs") + + print() + print("=" * 80) + print("VECTORISATION TERMINÉE") + print("=" * 80) + print() + + if remaining_without_vector == 0: + print("✅ VECTORISATION RÉUSSIE") + print() + print("RÉSULTATS:") + print(f" • {vectorized_count} nouveaux vecteurs créés") + print(f" • {total_chunks:,} chunks totaux") + print(f" • 100% des chunks ont des vecteurs") + print() + print("PROCHAINES ÉTAPES:") + print(" 1. Relancer le test de recherche: python 10_test_search_quality.py") + print(" 2. 
Tester l'application Flask") + print() + print("NOTE: Chunk_v2 n'a toujours pas de vectorizer configuré.") + print("Les futurs nouveaux chunks devront être vectorisés manuellement") + print("OU la collection devra être recréée avec un vectorizer.") + elif vectorized_count > 0: + print("⚠️ VECTORISATION PARTIELLE") + print() + print(f" • {vectorized_count} chunks vectorisés") + print(f" • {remaining_without_vector} chunks restants") + print(" • Vérifier les erreurs et relancer") + else: + print("❌ VECTORISATION ÉCHOUÉE") + print() + print("Aucun chunk n'a pu être vectorisé.") + print("Vérifications:") + print(f" 1. Service text2vec-transformers actif: {VECTORIZER_URL}") + print(" 2. Docker containers en cours d'exécution") + print(" 3. Logs des erreurs ci-dessus") + +finally: + client.close() diff --git a/generations/library_rag/docker-compose.yml b/generations/library_rag/docker-compose.yml index 812e1ee..5c5f8f4 100644 --- a/generations/library_rag/docker-compose.yml +++ b/generations/library_rag/docker-compose.yml @@ -31,6 +31,10 @@ services: AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: "true" # ok pour dev/local PERSISTENCE_DATA_PATH: "/var/lib/weaviate" CLUSTER_HOSTNAME: "node1" + CLUSTER_GOSSIP_BIND_PORT: "7946" + CLUSTER_DATA_BIND_PORT: "7947" + # Fix for "No private IP address found" error + CLUSTER_JOIN: "" DEFAULT_VECTORIZER_MODULE: "text2vec-transformers" ENABLE_MODULES: "text2vec-transformers" TRANSFORMERS_INFERENCE_API: "http://text2vec-transformers:8080" @@ -56,6 +60,8 @@ services: # - Current setup: CPU-only with AVX2 optimization (functional but slower) image: cr.weaviate.io/semitechnologies/transformers-inference:baai-bge-m3-onnx-latest restart: on-failure:0 + ports: + - "8090:8080" # Expose vectorizer API for manual vectorization environment: # ONNX runtime - CPU only (CUDA not supported in ONNX version) ENABLE_CUDA: "0" diff --git a/generations/library_rag/utils/llm_chunker.py b/generations/library_rag/utils/llm_chunker.py index d12b5cb..b361c8e 
100644 --- a/generations/library_rag/utils/llm_chunker.py +++ b/generations/library_rag/utils/llm_chunker.py @@ -52,9 +52,15 @@ from .llm_structurer import ( ) from .llm_cleaner import clean_page_markers, is_chunk_valid from .types import LLMProvider, SemanticChunk +from .llm_chunker_improved import simple_chunk_with_overlap, validate_chunk_size logger: logging.Logger = logging.getLogger(__name__) +# Chunk size limits (2024-01 optimization) +MAX_CHUNK_WORDS = 1000 # Hard limit to stay within BGE-M3 context +OVERLAP_WORDS = 100 # Overlap for context preservation +FORCE_SIMPLE_CHUNKING_THRESHOLD = 1500 # Words - force simple chunking above this + # ============================================================================= # Type Definitions for LLM Chunker @@ -221,8 +227,43 @@ def chunk_section_with_llm( # Nettoyer le contenu content: str = clean_page_markers(section_content) - # Si le contenu est court, ne pas découper + # Compter les mots word_count: int = len(content.split()) + + # FORCE SIMPLE CHUNKING if section is too long (> 1500 words) + # This prevents giant chunks that exceed BGE-M3 limits + if word_count > FORCE_SIMPLE_CHUNKING_THRESHOLD: + logger.warning( + f"Section '{section_title}' is too long ({word_count} words), " + f"forcing simple chunking with overlap" + ) + simple_texts = simple_chunk_with_overlap( + content, + max_words=MAX_CHUNK_WORDS, + overlap_words=OVERLAP_WORDS + ) + + # Convert to SemanticChunk format + result_chunks: List[SemanticChunk] = [] + for i, text in enumerate(simple_texts): + para_num = extract_paragraph_number(text) + chunk: SemanticChunk = { + "text": text, + "summary": f"{section_title} (partie {i+1}/{len(simple_texts)})", + "concepts": [], + "type": "main_content", + "section_level": section_level, + } + if para_num is not None: + chunk["paragraph_number"] = para_num + if subsection_title and subsection_title != section_title: + chunk["subsection_title"] = subsection_title + result_chunks.append(chunk) + + 
logger.info(f"Section split into {len(result_chunks)} chunks with overlap") + return result_chunks + + # Si le contenu est court, ne pas découper if word_count < target_chunk_size * 0.8: para_num: Optional[int] = extract_paragraph_number(content) chunk: SemanticChunk = { @@ -320,39 +361,66 @@ RÉPONDS avec un JSON entre : valid_chunks.append(chunk_data) - # Si aucun chunk valide, retourner le contenu complet + # Si aucun chunk valide, utiliser simple chunking avec overlap if not valid_chunks: - logger.warning(f"Aucun chunk valide pour '{section_title}', retour contenu complet") - para_num = extract_paragraph_number(content) - fallback: SemanticChunk = { - "text": content, - "summary": section_title, - "concepts": [], - "type": "main_content", - "section_level": section_level, - } - if para_num is not None: - fallback["paragraph_number"] = para_num - return [fallback] + logger.warning( + f"Aucun chunk valide pour '{section_title}', " + f"fallback vers simple chunking avec overlap" + ) + simple_texts = simple_chunk_with_overlap( + content, + max_words=MAX_CHUNK_WORDS, + overlap_words=OVERLAP_WORDS + ) + + fallback_chunks: List[SemanticChunk] = [] + for i, text in enumerate(simple_texts): + para_num = extract_paragraph_number(text) + chunk_data: SemanticChunk = { + "text": text, + "summary": f"{section_title} (partie {i+1}/{len(simple_texts)})", + "concepts": [], + "type": "main_content", + "section_level": section_level, + } + if para_num is not None: + chunk_data["paragraph_number"] = para_num + fallback_chunks.append(chunk_data) + + logger.info(f"Fallback: section split into {len(fallback_chunks)} chunks") + return fallback_chunks logger.info(f"Section '{section_title}' découpée en {len(valid_chunks)} chunks") return valid_chunks except Exception as e: logger.error(f"Erreur chunking LLM: {e}") - # Fallback: retourner le contenu complet - para_num = extract_paragraph_number(content) - fallback_err: SemanticChunk = { - "text": content, - "summary": section_title, - 
"concepts": [], - "type": "main_content", - "section_level": section_level, - "error": str(e), - } - if para_num is not None: - fallback_err["paragraph_number"] = para_num - return [fallback_err] + # Fallback: utiliser simple chunking avec overlap + logger.warning(f"Exception LLM, fallback vers simple chunking avec overlap") + + simple_texts = simple_chunk_with_overlap( + content, + max_words=MAX_CHUNK_WORDS, + overlap_words=OVERLAP_WORDS + ) + + error_chunks: List[SemanticChunk] = [] + for i, text in enumerate(simple_texts): + para_num = extract_paragraph_number(text) + chunk_data: SemanticChunk = { + "text": text, + "summary": f"{section_title} (partie {i+1}/{len(simple_texts)})", + "concepts": [], + "type": "main_content", + "section_level": section_level, + "error": f"LLM failed: {str(e)}", + } + if para_num is not None: + chunk_data["paragraph_number"] = para_num + error_chunks.append(chunk_data) + + logger.info(f"Error fallback: section split into {len(error_chunks)} chunks") + return error_chunks def simple_chunk_by_paragraphs( diff --git a/generations/library_rag/utils/llm_chunker_improved.py b/generations/library_rag/utils/llm_chunker_improved.py new file mode 100644 index 0000000..97ae46a --- /dev/null +++ b/generations/library_rag/utils/llm_chunker_improved.py @@ -0,0 +1,232 @@ +"""Improved semantic chunking with strict size limits and overlap. + +This module adds strict chunk size constraints (max 1000 words) and overlap +functionality (100 words) to prevent giant chunks that exceed BGE-M3 limits. 
# Remainder of utils/llm_chunker_improved.py.
#
# Notes carried over from the module docstring:
#   - MAX_CHUNK_WORDS = 1000 (hard limit)
#   - OVERLAP_WORDS = 100 (context preservation)
#   - llm_chunker falls back to simple chunking if a section is > 1500 words
#   - llm_chunker falls back to simple chunking if the LLM fails

import logging
import re
from typing import List, Optional, Tuple

logger: logging.Logger = logging.getLogger(__name__)

# Constants
MAX_CHUNK_WORDS = 1000  # Hard limit per chunk (author's estimate: ~2500 tokens)
OVERLAP_WORDS = 100     # Overlap between chunks for context
MIN_CHUNK_WORDS = 100   # Minimum chunk size

# Compiled once: both patterns are applied to every paragraph / chunk.
_PARAGRAPH_BREAK = re.compile(r"\n\n+")
_SENTENCE_BREAK = re.compile(r"(?<=[.!?])\s+")

# (text, word count, True when the unit opens a new paragraph)
_Unit = Tuple[str, int, bool]
# (overlap text, separator between overlap and fresh text, fresh text, fresh word count)
_Record = Tuple[str, str, str, int]


def _split_into_units(content: str, max_words: int) -> List[_Unit]:
    """Break *content* into units that each hold at most *max_words* words.

    Paragraphs that fit are kept whole. Longer paragraphs are split at sentence
    boundaries, and a sentence that is still too long is cut into fixed-size
    word windows so that no unit can ever exceed the limit.
    """
    units: List[_Unit] = []
    for raw in _PARAGRAPH_BREAK.split(content):
        paragraph = raw.strip()
        if not paragraph:
            continue

        paragraph_words = len(paragraph.split())
        if paragraph_words <= max_words:
            units.append((paragraph, paragraph_words, True))
            continue

        opens_paragraph = True
        for sentence in _SENTENCE_BREAK.split(paragraph):
            words = sentence.split()
            if not words:
                continue
            if len(words) <= max_words:
                pieces = [sentence]  # keep the sentence text verbatim
            else:
                # No sentence boundary to use: fall back to word windows.
                pieces = [
                    " ".join(words[start:start + max_words])
                    for start in range(0, len(words), max_words)
                ]
            for piece in pieces:
                units.append((piece, len(piece.split()), opens_paragraph))
                opens_paragraph = False
    return units


def _trailing_sentences(text: str, budget: int) -> Tuple[str, int]:
    """Return the last whole sentences of *text* totalling at most *budget* words.

    Collection stops at the first sentence (walking backwards) that does not
    fit, so the result is always a contiguous tail of *text*.
    """
    if budget <= 0:
        return "", 0

    picked: List[str] = []
    total = 0
    for sentence in reversed(_SENTENCE_BREAK.split(text)):
        sentence_words = len(sentence.split())
        if sentence_words == 0:
            continue
        if total + sentence_words > budget:
            break
        picked.append(sentence)
        total += sentence_words
    picked.reverse()
    return " ".join(picked), total


def _pack_units(units: List[_Unit], max_words: int, overlap_words: int) -> List[_Record]:
    """Greedily pack *units* into chunks, seeding each new chunk with overlap.

    The overlap budget is capped by the room left next to the first unit of the
    new chunk, which is what keeps ``max_words`` a real upper bound.
    """
    records: List[_Record] = []
    overlap = ""
    overlap_count = 0
    lead_separator = ""
    fresh: List[str] = []
    fresh_count = 0

    for text, count, opens_paragraph in units:
        if fresh and overlap_count + fresh_count + count > max_words:
            fresh_text = "".join(fresh)
            records.append((overlap, lead_separator, fresh_text, fresh_count))
            finished = f"{overlap}{lead_separator}{fresh_text}" if overlap else fresh_text
            overlap, overlap_count = _trailing_sentences(
                finished, min(overlap_words, max_words - count)
            )
            fresh, fresh_count = [], 0

        # Paragraphs are separated by a blank line; sentences of one split
        # paragraph are re-joined with a single space.
        separator = "\n\n" if opens_paragraph else " "
        if fresh:
            fresh.append(separator + text)
        else:
            lead_separator = separator
            fresh.append(text)
        fresh_count += count

    if fresh:
        records.append((overlap, lead_separator, "".join(fresh), fresh_count))
    return records


def _pad_short_chunks(records: List[_Record], min_words: int, max_words: int) -> List[str]:
    """Render *records* to text, widening the overlap of chunks that are too short.

    A chunk below *min_words* is never discarded (that would lose source text).
    Instead it borrows more trailing words from the chunk before it. The very
    first chunk has nothing to borrow from and is kept as is.
    """
    target = min(min_words, max_words)
    rendered: List[str] = []
    for overlap, separator, fresh_text, fresh_count in records:
        if rendered and len(overlap.split()) + fresh_count < target:
            previous_words = rendered[-1].split()
            overlap = " ".join(previous_words[-(target - fresh_count):])
        rendered.append(f"{overlap}{separator}{fresh_text}" if overlap else fresh_text)
    return rendered


def simple_chunk_with_overlap(
    content: str,
    max_words: int = MAX_CHUNK_WORDS,
    min_words: int = MIN_CHUNK_WORDS,
    overlap_words: int = OVERLAP_WORDS,
    *,
    clean_markers: bool = True,
) -> List[str]:
    """Split text into size-bounded chunks that overlap with their predecessor.

    Algorithm:
        1. Split on paragraph boundaries (blank lines).
        2. Split paragraphs longer than ``max_words`` at sentence boundaries,
           and sentences that are still too long into word windows.
        3. Pack the resulting units greedily up to ``max_words``.
        4. Start every chunk after the first with the trailing sentences of
           the previous chunk (at most ``overlap_words`` words, and never more
           than the room left, so the overlap cannot push a chunk past
           ``max_words``).
        5. Chunks shorter than ``min_words`` are kept and padded with extra
           trailing words from the previous chunk rather than dropped, so no
           source text is lost.

    Args:
        content: Text content to split into chunks.
        max_words: Maximum words per chunk (whitespace-separated). Defaults to 1000.
        min_words: Preferred minimum words per chunk. Defaults to 100.
        overlap_words: Words to overlap between chunks. Defaults to 100.
        clean_markers: When True (default) the text is first passed through
            ``clean_page_markers``. Pass False if the caller already cleaned it.

    Returns:
        List of chunk texts, each at most ``max_words`` words long. Empty when
        ``content`` contains no text.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.

    Example:
        >>> chunks = simple_chunk_with_overlap(text, max_words=1000, overlap_words=100)
        >>> # Each chunk after the first starts with the tail of the previous one
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")

    if clean_markers:
        # Imported here so the pure chunking logic stays usable (and testable)
        # without the rest of the package.
        from .llm_cleaner import clean_page_markers

        content = clean_page_markers(content)

    units = _split_into_units(content, max_words)
    records = _pack_units(units, max_words, overlap_words)
    return _pad_short_chunks(records, min_words, max_words)


def get_chunk_text_with_context(
    chunks: List[str],
    index: int,
    context_words: int = 50
) -> Tuple[str, str, str]:
    """Get chunk with before/after context for better LLM processing.

    Args:
        chunks: List of chunk texts.
        index: Index of the chunk to process (negative indices count from the end).
        context_words: Words of context to include from adjacent chunks.
            Zero or a negative value yields empty context strings.

    Returns:
        Tuple of (before_context, chunk_text, after_context). A neighbour that
        has no more than ``context_words`` words is returned verbatim.

    Raises:
        IndexError: If ``index`` is out of range for ``chunks``.
    """
    chunk = chunks[index]  # raises IndexError before any neighbour lookup
    if index < 0:
        # Normalise so the neighbour checks below see the real position.
        index += len(chunks)

    def excerpt(neighbour: str, from_end: bool) -> str:
        # Guard explicitly: words[-0:] would otherwise return every word.
        if context_words <= 0:
            return ""
        words = neighbour.split()
        if len(words) <= context_words:
            return neighbour
        picked = words[-context_words:] if from_end else words[:context_words]
        return " ".join(picked)

    before_context = excerpt(chunks[index - 1], True) if index > 0 else ""
    after_context = excerpt(chunks[index + 1], False) if index < len(chunks) - 1 else ""

    return before_context, chunk, after_context


def estimate_tokens(text: str) -> int:
    """Estimate token count from text.

    Uses approximation of 1 token ≈ 4 characters.

    Args:
        text: Text to estimate.

    Returns:
        Estimated token count (character count floor-divided by 4).
    """
    return len(text) // 4


def validate_chunk_size(text: str, max_tokens: int = 2500) -> bool:
    """Validate that chunk size is within acceptable limits.

    Args:
        text: Chunk text to validate.
        max_tokens: Maximum allowed estimated tokens (default 2500, chosen by
            the author as a safety margin below BGE-M3's 8192-token limit).

    Returns:
        True if the estimate from ``estimate_tokens`` does not exceed
        ``max_tokens``, False otherwise.
    """
    return estimate_tokens(text) <= max_tokens


# Export key functions
__all__ = [
    'simple_chunk_with_overlap',
    'get_chunk_text_with_context',
    'estimate_tokens',
    'validate_chunk_size',
    'MAX_CHUNK_WORDS',
    'OVERLAP_WORDS',
    'MIN_CHUNK_WORDS',
]