Files
linear-coding-agent/09_rechunk_oversized.py
David Blanc Brioir 7045907173 feat: Optimize chunk sizes with 1000-word limit and overlap
Implemented chunking optimization to resolve oversized chunks and improve
semantic search quality:

CHUNKING IMPROVEMENTS:
- Added strict 1000-word max limit (vs previous 1500-2000)
- Implemented 100-word overlap between consecutive chunks
- Created llm_chunker_improved.py with overlap functionality
- Added 3 fallback points in llm_chunker.py for robustness

RE-CHUNKING RESULTS:
- Identified and re-chunked 31 oversized chunks (>2000 tokens)
- Split into 92 optimally-sized chunks (max 1995 tokens)
- Preserved all metadata (workTitle, workAuthor, sectionPath, etc.)
- 0 chunks now exceed 2000 tokens (vs 31 before)

VECTORIZATION:
- Created manual vectorization script for chunks without vectors
- Successfully vectorized all 92 new chunks (100% coverage)
- All 5,304 chunks now have BGE-M3 embeddings

DOCKER CONFIGURATION:
- Exposed text2vec-transformers port 8090 for manual vectorization
- Added cluster configuration to fix the "No private IP address found" error
- Increased worker timeout to 600s for large chunks

TESTING:
- Created comprehensive search quality test suite
- Tests distribution, overlap detection, and semantic search
- Modified to use near_vector() (Chunk_v2 has no vectorizer)

Scripts:
- 08_fix_summaries_properties.py - Add missing Work metadata to summaries
- 09_rechunk_oversized.py - Re-chunk giant chunks with overlap
- 10_test_search_quality.py - Validate search improvements
- 11_vectorize_missing_chunks.py - Manual vectorization via API

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-08 17:37:49 +01:00

268 lines
8.8 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Script to re-chunk oversized chunks (> 2000 tokens) in Chunk_v2.

This script identifies chunks that are too large (> 2000 tokens) and splits them
into smaller chunks with overlap (max 1000 words, overlap 100 words).

Steps:
1. Identify all chunks > 2000 tokens in Chunk_v2
2. Re-chunk using simple_chunk_with_overlap (1000 words max, 100 overlap)
3. Delete the original oversized chunk
4. Insert new smaller chunks with preserved metadata
5. Update Summary_v2 chunksCount if needed
"""
import sys
from pathlib import Path

import weaviate

# Make the project-local chunking utilities importable; this sys.path edit
# must happen before the `utils` import below.
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))
from utils.llm_chunker_improved import simple_chunk_with_overlap, estimate_tokens

# Force UTF-8 output so the accented/emoji progress messages don't crash on
# consoles with a legacy default encoding (e.g. cp1252 on Windows).
# Fix: encoding names are case-insensitive ('UTF-8' is common) and
# sys.stdout.encoding can be None when stdout is replaced, so the original
# case-sensitive `!= 'utf-8'` check could reconfigure needlessly or miscompare.
if (sys.stdout.encoding or '').lower() != 'utf-8':
    sys.stdout.reconfigure(encoding='utf-8')

# tqdm is optional: when absent, the main loop falls back to periodic prints.
try:
    from tqdm import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False

# Constants
TOKEN_THRESHOLD = 2000  # Chunks > 2000 tokens will be re-chunked
MAX_WORDS = 1000        # Hard upper bound on words per new chunk
OVERLAP_WORDS = 100     # Words shared between consecutive new chunks
# Connect to the local Weaviate instance; the try/finally below guarantees
# the connection is closed even on error or early sys.exit().
client = weaviate.connect_to_local()
try:
    print("=" * 80)
    print("RE-CHUNKING DES CHUNKS SURDIMENSIONNÉS")
    print("=" * 80)
    print()
    chunk_v2 = client.collections.get("Chunk_v2")
    work_collection = client.collections.get("Work")

    # ========== 1. IDENTIFY THE PROBLEMATIC CHUNKS ==========
    print("1. IDENTIFICATION DES CHUNKS > 2000 TOKENS")
    print("-" * 80)
    print()
    oversized_chunks = []
    print("Analyse en cours...")
    # Full collection scan; vectors are excluded to keep the scan lightweight.
    for chunk in chunk_v2.iterator(include_vector=False):
        props = chunk.properties
        text = props.get('text', '')
        tokens = estimate_tokens(text)
        if tokens > TOKEN_THRESHOLD:
            # Snapshot every property needed to re-create the chunk later,
            # so the object never has to be read back before deletion.
            oversized_chunks.append({
                'uuid': str(chunk.uuid),
                'tokens': tokens,
                'chars': len(text),
                'text': text,
                'workTitle': props.get('workTitle', ''),
                'workAuthor': props.get('workAuthor', ''),
                'year': props.get('year', 0),
                'language': props.get('language', 'en'),
                'sectionPath': props.get('sectionPath', ''),
                'chapterTitle': props.get('chapterTitle', ''),
                'canonicalReference': props.get('canonicalReference', ''),
                'unitType': props.get('unitType', 'main_content'),
                'keywords': props.get('keywords', []),
                'orderIndex': props.get('orderIndex', 0),
            })
    print(f"{len(oversized_chunks)} chunks > {TOKEN_THRESHOLD} tokens trouvés")
    print()
    if not oversized_chunks:
        # Nothing to do: report and exit (SystemExit still runs the finally).
        print("✅ Aucun chunk surdimensionné à traiter")
        print()
        print("=" * 80)
        print("SCRIPT TERMINÉ - RIEN À FAIRE")
        print("=" * 80)
        sys.exit(0)

    # Sort by size, largest first, for the report below
    oversized_chunks.sort(key=lambda x: x['tokens'], reverse=True)
    print("Top 5 plus gros chunks:")
    for i, chunk in enumerate(oversized_chunks[:5], 1):
        print(f"{i}. {chunk['tokens']:,} tokens ({chunk['chars']:,} chars)")
        print(f" Œuvre: {chunk['workTitle']}")
        print(f" Section: {chunk['sectionPath'][:60]}...")
        print()
    if len(oversized_chunks) > 5:
        print(f"... et {len(oversized_chunks) - 5} autres")
    print()

    # ========== 2. RE-CHUNKING ==========
    print("2. RE-CHUNKING AVEC OVERLAP")
    print("-" * 80)
    print()
    # Build work_title -> work_uuid map for references
    work_map = {}
    for work in work_collection.iterator(include_vector=False):
        props = work.properties
        title = props.get("title")
        if title:
            work_map[title] = str(work.uuid)
    print(f"{len(work_map)} Works mappés")
    print()
    deleted_count = 0
    inserted_count = 0
    errors = []
    # Create iterator with or without tqdm
    if HAS_TQDM:
        iterator = tqdm(
            oversized_chunks,
            desc="Re-chunking",
            unit="chunks"
        )
    else:
        iterator = oversized_chunks
        print("Re-chunking en cours...")
    for idx, old_chunk in enumerate(iterator, 1):
        # Each chunk is processed independently; failures are collected in
        # `errors` so one bad chunk cannot abort the whole run.
        try:
            # Re-chunk text
            new_texts = simple_chunk_with_overlap(
                old_chunk['text'],
                max_words=MAX_WORDS,
                overlap_words=OVERLAP_WORDS
            )
            # Get work reference
            work_uuid = work_map.get(old_chunk['workTitle'])
            if not work_uuid:
                errors.append(f"Chunk {old_chunk['uuid'][:8]}: Work '{old_chunk['workTitle']}' introuvable")
                continue
            # Insert new chunks BEFORE deleting the original, so a failure
            # mid-loop can only leave duplicates, never lose text.
            for i, new_text in enumerate(new_texts):
                # Sub-ordering: multiply base index by 100 and add part index
                # Example: orderIndex=5 becomes 500, 501, 502, etc.
                # NOTE(review): collides if a chunk ever splits into >100
                # parts — presumably impossible at these sizes; confirm.
                new_order_index = (old_chunk['orderIndex'] * 100) + i
                new_props = {
                    "text": new_text,
                    "summary": "",  # Empty summary for simple chunks
                    "keywords": old_chunk['keywords'],
                    "workTitle": old_chunk['workTitle'],
                    "workAuthor": old_chunk['workAuthor'],
                    "year": old_chunk['year'],
                    "language": old_chunk['language'],
                    "sectionPath": old_chunk['sectionPath'],
                    "chapterTitle": old_chunk['chapterTitle'],
                    "canonicalReference": old_chunk['canonicalReference'],
                    "unitType": old_chunk['unitType'],
                    "orderIndex": new_order_index,
                }
                chunk_v2.data.insert(
                    properties=new_props,
                    references={"work": work_uuid}
                )
                inserted_count += 1
            # Delete old chunk
            chunk_v2.data.delete_by_id(old_chunk['uuid'])
            deleted_count += 1
            # Progress without tqdm
            if not HAS_TQDM and idx % 5 == 0:
                print(f" {idx}/{len(oversized_chunks)} chunks traités...")
        except Exception as e:
            errors.append(f"Chunk {old_chunk['uuid'][:8]}: {e}")
    print()
    print("-" * 80)
    print(f"✓ Chunks supprimés: {deleted_count}")
    print(f"✓ Nouveaux chunks créés: {inserted_count}")
    if deleted_count > 0:
        print(f" Expansion moyenne: {inserted_count / deleted_count:.1f}x")
    else:
        print(f" ⚠️ Aucun chunk supprimé - vérifier les erreurs")
    if errors:
        print()
        print(f"⚠️ Erreurs rencontrées: {len(errors)}")
        # Cap the error listing at 10 entries to keep the report readable
        for err in errors[:10]:
            print(f" - {err}")
        if len(errors) > 10:
            print(f" ... et {len(errors) - 10} autres")
    print()

    # ========== 3. POST-RECHUNKING VERIFICATION ==========
    print("3. VÉRIFICATION POST-RECHUNKING")
    print("-" * 80)
    print()
    print("Comptage des nouveaux chunks...")
    remaining_oversized = 0
    total_chunks = 0
    # Second full scan to confirm no chunk still exceeds the threshold
    for chunk in chunk_v2.iterator(include_vector=False):
        total_chunks += 1
        text = chunk.properties.get('text', '')
        tokens = estimate_tokens(text)
        if tokens > TOKEN_THRESHOLD:
            remaining_oversized += 1
    print(f"✓ Total chunks: {total_chunks:,}")
    print(f"✓ Chunks > {TOKEN_THRESHOLD} tokens: {remaining_oversized}")
    if remaining_oversized == 0:
        print()
        print("✅ Aucun chunk surdimensionné restant!")
    else:
        print()
        print(f"⚠️ {remaining_oversized} chunks encore > {TOKEN_THRESHOLD} tokens")
        print(" Relancer le script si nécessaire")
    print()
    print("=" * 80)
    print("RE-CHUNKING TERMINÉ")
    print("=" * 80)
    print()
    print("RÉSULTATS:")
    print(f" • Chunks supprimés: {deleted_count}")
    print(f" • Nouveaux chunks créés: {inserted_count}")
    if deleted_count > 0:
        print(f" • Expansion: {inserted_count / deleted_count:.1f}x")
    print(f" • Chunks restants > {TOKEN_THRESHOLD} tokens: {remaining_oversized}")
    print()
    if remaining_oversized == 0 and deleted_count > 0:
        print("✅ RE-CHUNKING RÉUSSI")
        print()
        print("AMÉLIORATIONS:")
        print(f"{deleted_count} chunks géants éliminés")
        print(f"{inserted_count} chunks optimaux créés")
        print(f" • Taille max: {MAX_WORDS} mots (~{MAX_WORDS * 2.5:.0f} tokens)")
        print(f" • Overlap: {OVERLAP_WORDS} mots (contexte préservé)")
        print()
        print("PROCHAINES ÉTAPES:")
        print(" 1. Tester la recherche sémantique")
        print(" 2. Vérifier la qualité des vecteurs")
        print(" 3. Optionnel: Mettre à jour Summary_v2.chunksCount si nécessaire")
    elif deleted_count == 0:
        print(" Aucun chunk n'a nécessité de re-chunking")
finally:
    # Always release the Weaviate connection, including on sys.exit(0) above.
    client.close()