- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
496 lines
16 KiB
Python
496 lines
16 KiB
Python
"""Semantic chunking of documents via LLM.
|
|
|
|
This module provides intelligent semantic chunking capabilities for academic and
|
|
philosophical texts, using Large Language Models (LLM) to identify coherent units
|
|
of meaning (argumentative units, definitions, examples, citations, etc.).
|
|
|
|
Overview:
|
|
The module offers two chunking strategies:
|
|
|
|
1. **LLM-based semantic chunking** (chunk_section_with_llm):
|
|
Uses an LLM to identify semantic boundaries and create chunks that preserve
|
|
argumentative coherence. Each chunk is annotated with summary, concepts, type.
|
|
|
|
2. **Simple paragraph-based chunking** (simple_chunk_by_paragraphs):
|
|
A fast fallback that splits text by paragraph boundaries.
|
|
|
|
Semantic Unit Types:
|
|
- argument: A logical argument or reasoning sequence
|
|
- definition: A definition or conceptual clarification
|
|
- example: An illustrative example or case study
|
|
- citation: A quoted passage from another source
|
|
- exposition: Expository content presenting ideas
|
|
- transition: Transitional text between sections
|
|
|
|
Chunk Size Guidelines:
|
|
- Target size: 300-500 words per chunk (configurable)
|
|
- Chunks are never split mid-sentence or mid-paragraph
|
|
- Short sections (< 80% of target) are kept as single chunks
|
|
|
|
LLM Provider Support:
|
|
- ollama: Local LLM (free, slower, default)
|
|
- mistral: Mistral API (faster, requires API key)
|
|
|
|
See Also:
|
|
utils.llm_cleaner: Chunk cleaning and validation
|
|
utils.llm_classifier: Section type classification
|
|
utils.pdf_pipeline: Main pipeline orchestration
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List, Literal, Optional, TypedDict
|
|
|
|
from .llm_structurer import (
|
|
_clean_json_string,
|
|
_get_default_mistral_model,
|
|
_get_default_model,
|
|
call_llm,
|
|
)
|
|
from .llm_cleaner import clean_page_markers, is_chunk_valid
|
|
from .types import LLMProvider, SemanticChunk
|
|
|
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
|
|
|
|
# =============================================================================
|
|
# Type Definitions for LLM Chunker
|
|
# =============================================================================
|
|
|
|
#: Unit type for semantic chunking (specific to this module's LLM output).
#: "main_content" is the fallback label used when a section is returned as a
#: single chunk without LLM processing, or when the LLM omits the type field.
ChunkUnitType = Literal[
    "argument",
    "definition",
    "example",
    "citation",
    "exposition",
    "transition",
    "main_content",
]
|
|
|
|
|
|
class LLMChunkResponse(TypedDict, total=False):
    """Individual chunk structure as returned by LLM.

    Declared with ``total=False`` because the LLM may omit any field;
    consumers read these keys with ``.get()`` and a default.

    Attributes:
        text: Chunk text content (exact copy from source)
        summary: Brief one-sentence summary
        concepts: Key concepts extracted (3-5 items)
        type: Semantic unit type
    """

    # Exact text copied from the source section (the prompt forbids rephrasing).
    text: str
    # One-sentence summary of the chunk.
    summary: str
    # 3-5 key concepts (words or short expressions).
    concepts: List[str]
    # Semantic unit label; kept as plain str because the LLM output is unvalidated.
    type: str
|
|
|
|
|
|
class LLMChunksResult(TypedDict):
    """Complete response structure from LLM chunking.

    This is the top-level shape expected from ``_extract_json_from_response``.

    Attributes:
        chunks: List of chunk objects
    """

    # All chunks produced for one section, in source order.
    chunks: List[LLMChunkResponse]
|
|
|
|
|
|
# Note: SemanticChunk is imported from utils.types
|
|
|
|
|
|
def extract_paragraph_number(text: str) -> Optional[int]:
    """Detect a leading paragraph number at the start of *text*.

    Many philosophical texts number their paragraphs; several common
    numbering formats are recognized (bare number before a capital,
    number glued to a capital, "§ N", "[N]", "N.", "N)").

    Args:
        text: Text content that may start with a paragraph number.

    Returns:
        The paragraph number if detected, None otherwise.

    Example:
        >>> extract_paragraph_number("9 On presente...")
        9
        >>> extract_paragraph_number("Normal text")
        None
    """
    stripped = text.strip()

    # Recognized numbering formats, tried in order.
    numbering_formats: List[str] = [
        r'^(\d+)\s+[A-ZÀ-Ü]',  # "9 On présente..."
        r'^(\d+)[A-ZÀ-Ü]',     # "10Dans la classification..."
        r'^§\s*(\d+)',         # "§ 15 ..."
        r'^\[(\d+)\]',         # "[9] ..."
        r'^(\d+)\.',           # "9. ..."
        r'^(\d+)\)',           # "9) ..."
    ]

    for fmt in numbering_formats:
        hit: Optional[re.Match[str]] = re.match(fmt, stripped)
        if hit is None:
            continue
        try:
            return int(hit.group(1))
        except ValueError:
            # Captured digits did not form a valid int; try the next format.
            pass

    return None
|
|
|
|
|
|
def _extract_json_from_response(text: str) -> Dict[str, Any]:
    """Recover the JSON payload embedded in an LLM response.

    Two strategies are tried in order: first the content wrapped in
    ``<JSON></JSON>`` tags, then the outermost ``{...}`` span of the raw
    text. Both candidates go through ``_clean_json_string`` before parsing.

    Args:
        text: Response text from LLM containing JSON.

    Returns:
        Parsed JSON as dictionary with 'chunks' key. Returns
        {"chunks": []} if parsing fails.
    """
    tagged: Optional[re.Match[str]] = re.search(
        r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL
    )
    if tagged:
        candidate: str = _clean_json_string(tagged.group(1))
        try:
            parsed: Dict[str, Any] = json.loads(candidate)
            return parsed
        except json.JSONDecodeError:
            # Tagged payload was malformed; fall back to brace extraction.
            pass

    opening: int = text.find("{")
    closing: int = text.rfind("}")
    if opening != -1 and closing > opening:
        candidate = _clean_json_string(text[opening:closing + 1])
        try:
            parsed = json.loads(candidate)
            return parsed
        except json.JSONDecodeError as e:
            logger.warning(f"JSON invalide: {e}")

    return {"chunks": []}
|
|
|
|
|
|
def chunk_section_with_llm(
    section_content: str,
    section_title: str,
    chapter_title: Optional[str] = None,
    subsection_title: Optional[str] = None,
    section_level: int = 1,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.2,
    target_chunk_size: int = 400,
) -> List[SemanticChunk]:
    """Split a section into semantically coherent chunks using an LLM.

    This is the main semantic chunking function. It uses an LLM to identify
    natural semantic boundaries in academic/philosophical texts, preserving
    argumentative coherence and annotating each chunk with metadata.

    Args:
        section_content: The text content of the section to chunk.
        section_title: Title of the current section being chunked.
        chapter_title: Title of the parent chapter (level 1) for context.
        subsection_title: Title of parent subsection (level 2) if applicable.
        section_level: Hierarchy level (1=chapter, 2=section, etc.).
        model: LLM model name. If None, uses provider default.
        provider: LLM provider ("ollama" for local, "mistral" for API).
        temperature: LLM temperature (lower = more deterministic).
        target_chunk_size: Target number of words per chunk.

    Returns:
        List of SemanticChunk dictionaries containing text, summary,
        concepts, type, section_level, and optionally paragraph_number.

    Note:
        If section is shorter than 80% of target_chunk_size, it is returned
        as a single chunk. If LLM fails, returns section with error field.
    """
    def _single_chunk(text: str, error: Optional[str] = None) -> SemanticChunk:
        """Build a one-chunk fallback covering the whole section.

        Previously this dict was built in three places with diverging
        fields (the short-section path attached subsection_title, the two
        fallback paths did not); centralizing it makes them consistent.
        """
        chunk: SemanticChunk = {
            "text": text,
            "summary": section_title,
            "concepts": [],
            "type": "main_content",
            "section_level": section_level,
        }
        para_num: Optional[int] = extract_paragraph_number(text)
        if para_num is not None:
            chunk["paragraph_number"] = para_num
        if subsection_title and subsection_title != section_title:
            chunk["subsection_title"] = subsection_title
        if error is not None:
            chunk["error"] = error
        return chunk

    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()

    # Strip page markers before measuring / prompting.
    content: str = clean_page_markers(section_content)

    # Short sections (< 80% of target) are not worth splitting.
    word_count: int = len(content.split())
    if word_count < target_chunk_size * 0.8:
        return [_single_chunk(content)]

    chapter_info: str = f"Chapitre: {chapter_title}\n" if chapter_title else ""

    # NOTE(review): the prompt requests French type labels ("définition",
    # "exemple") while ChunkUnitType declares English values — the raw LLM
    # string is stored as-is; confirm downstream expectations before
    # normalizing either side.
    prompt = f"""Tu es un expert en analyse de textes académiques.

TÂCHE: Découper ce texte en unités sémantiques cohérentes.

{chapter_info}Section: {section_title}

RÈGLES DE DÉCOUPAGE:
1. Chaque chunk doit avoir un SENS COMPLET (une idée, un argument)
2. Taille idéale: {target_chunk_size - 100} à {target_chunk_size + 100} mots
3. NE PAS couper au milieu d'une phrase ou d'un paragraphe
4. NE PAS couper au milieu d'une citation
5. Regrouper les paragraphes qui développent la même idée
6. Un chunk peut être plus long si nécessaire pour préserver le sens

POUR CHAQUE CHUNK, INDIQUE:
- text: le texte exact (copié, pas reformulé)
- summary: résumé en 1 phrase courte
- concepts: 3-5 concepts clés (mots ou expressions)
- type: argument | définition | exemple | citation | exposition | transition

TEXTE À DÉCOUPER:
{content}

RÉPONDS avec un JSON entre <JSON></JSON>:

<JSON>
{{
  "chunks": [
    {{
      "text": "Premier paragraphe ou groupe de paragraphes...",
      "summary": "Présentation de l'idée principale",
      "concepts": ["concept1", "concept2", "concept3"],
      "type": "exposition"
    }},
    {{
      "text": "Deuxième partie du texte...",
      "summary": "Développement de l'argument",
      "concepts": ["concept4", "concept5"],
      "type": "argument"
    }}
  ]
}}
</JSON>
"""

    # Lazy %-style args: formatting is skipped when INFO is disabled.
    logger.info(
        "Chunking sémantique de '%s' (%d mots) via %s",
        section_title, word_count, provider.upper(),
    )

    try:
        response: str = call_llm(
            prompt, model=model, provider=provider, temperature=temperature, timeout=300
        )
        result: Dict[str, Any] = _extract_json_from_response(response)
        chunks: List[Dict[str, Any]] = result.get("chunks", [])

        # Validate chunks and extract paragraph numbers.
        valid_chunks: List[SemanticChunk] = []
        for raw_chunk in chunks:
            text: str = raw_chunk.get("text", "")
            if not is_chunk_valid(text):
                continue

            chunk_data: SemanticChunk = {
                "text": text,
                "summary": raw_chunk.get("summary", ""),
                "concepts": raw_chunk.get("concepts", []),
                "type": raw_chunk.get("type", "main_content"),
                "section_level": section_level,
            }

            # Attach the paragraph number if one is detected.
            para_num: Optional[int] = extract_paragraph_number(text)
            if para_num is not None:
                chunk_data["paragraph_number"] = para_num

            # Attach the full hierarchy.
            if subsection_title and subsection_title != section_title:
                chunk_data["subsection_title"] = subsection_title

            valid_chunks.append(chunk_data)

        # No usable chunk: fall back to the whole section as one chunk.
        if not valid_chunks:
            logger.warning(
                "Aucun chunk valide pour '%s', retour contenu complet", section_title
            )
            return [_single_chunk(content)]

        logger.info(
            "Section '%s' découpée en %d chunks", section_title, len(valid_chunks)
        )
        return valid_chunks

    except Exception as e:
        # Broad catch at the LLM boundary: any failure degrades to one chunk
        # carrying the error message instead of crashing the pipeline.
        logger.error("Erreur chunking LLM: %s", e)
        return [_single_chunk(content, error=str(e))]
|
|
|
|
|
|
def simple_chunk_by_paragraphs(
    content: str,
    max_words: int = 500,
    min_words: int = 100,
) -> List[str]:
    """Split text into chunks by paragraph boundaries (no LLM required).

    This is a fast fallback chunking method that respects paragraph and
    sentence boundaries. Use when LLM processing is not desired.

    The algorithm:
    1. Split by double newlines (paragraph boundaries)
    2. Merge small paragraphs until max_words is reached
    3. Split over-long paragraphs at sentence boundaries; sentence runs
       taken from one paragraph are re-joined with single spaces (fix:
       they previously got '\\n\\n' joins, fabricating paragraph breaks
       inside a single source paragraph)
    4. Filter chunks below min_words threshold

    Args:
        content: Text content to split into chunks.
        max_words: Maximum words per chunk. Defaults to 500.
        min_words: Minimum words per chunk. Defaults to 100.

    Returns:
        List of text chunks as strings.

    Example:
        >>> chunks = simple_chunk_by_paragraphs(text, max_words=400)
        >>> len(chunks)
        3
    """
    content = clean_page_markers(content)

    # Split on paragraph boundaries (double newlines).
    paragraphs: List[str] = re.split(r'\n\n+', content)

    chunks: List[str] = []
    current_chunk: List[str] = []   # accumulated units (paragraphs or sentence runs)
    current_words: int = 0

    def _flush() -> None:
        """Emit the accumulated units as one chunk and reset the buffer."""
        nonlocal current_chunk, current_words
        if current_chunk:
            chunks.append('\n\n'.join(current_chunk))
            current_chunk = []
            current_words = 0

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue

        para_words: int = len(para.split())

        if para_words > max_words:
            # Paragraph alone exceeds the limit: flush, then split it by
            # sentences, keeping each run joined with spaces (one paragraph).
            _flush()
            sentences: List[str] = re.split(r'(?<=[.!?])\s+', para)
            run: List[str] = []
            run_words: int = 0
            for sentence in sentences:
                sentence_words: int = len(sentence.split())
                if run_words + sentence_words > max_words and run:
                    chunks.append(' '.join(run))
                    run = [sentence]
                    run_words = sentence_words
                else:
                    run.append(sentence)
                    run_words += sentence_words
            if run:
                # Carry the remainder forward so following short paragraphs
                # can still merge with it (matches previous accumulation).
                current_chunk = [' '.join(run)]
                current_words = run_words

        elif current_words + para_words > max_words:
            # Adding this paragraph would overflow: start a new chunk.
            _flush()
            current_chunk = [para]
            current_words = para_words

        else:
            current_chunk.append(para)
            current_words += para_words

    # Last chunk.
    _flush()

    # Drop too-short chunks, unless there is only one chunk at all.
    return [c for c in chunks if len(c.split()) >= min_words or len(chunks) == 1]
|
|
|
|
|
|
def extract_concepts_from_chunk(
    chunk_text: str,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
) -> List[str]:
    """Extract key concepts from a text chunk using an LLM.

    Useful for enriching chunks created without LLM processing or for
    extracting additional concepts from existing chunks.

    Args:
        chunk_text: The text content to analyze for concepts.
        model: LLM model name. If None, uses provider default.
        provider: LLM provider ("ollama" or "mistral").

    Returns:
        List of 3-5 key concepts (words or short phrases). Non-string or
        empty entries returned by the LLM are discarded. Returns an empty
        list if extraction fails or text is too short (< 100 chars).

    Example:
        >>> concepts = extract_concepts_from_chunk("L'etre-pour-la-mort...")
        >>> concepts
        ['etre-pour-la-mort', 'structure existentiale', 'Dasein']
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()

    # Very short fragments carry too little signal to name concepts.
    if len(chunk_text) < 100:
        return []

    prompt: str = f"""Extrait les 3-5 concepts clés de ce texte.
Un concept = un mot ou une expression courte (2-3 mots max).

Texte:
{chunk_text[:1500]}

Réponds avec une liste JSON simple:
["concept1", "concept2", "concept3"]
"""

    try:
        response: str = call_llm(prompt, model=model, provider=provider, temperature=0.1, timeout=60)

        # Grab the first JSON array in the response.
        match: Optional[re.Match[str]] = re.search(r'\[.*?\]', response, re.DOTALL)
        if match is None:
            return []

        parsed = json.loads(match.group())
        # Guard against malformed payloads: the annotation promises
        # List[str], so keep only non-empty string entries.
        concepts: List[str] = [c.strip() for c in parsed if isinstance(c, str) and c.strip()]
        return concepts[:5]  # Max 5 concepts

    except Exception as e:
        # Best-effort enrichment: any failure (LLM call, bad JSON) yields [].
        logger.warning("Erreur extraction concepts: %s", e)
        return []
|
|
|