- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
496 lines
16 KiB
Python
496 lines
16 KiB
Python
"""Semantic chunking of documents via LLM.
|
|
|
|
This module provides intelligent semantic chunking capabilities for academic and
|
|
philosophical texts, using Large Language Models (LLM) to identify coherent units
|
|
of meaning (argumentative units, definitions, examples, citations, etc.).
|
|
|
|
Overview:
|
|
The module offers two chunking strategies:
|
|
|
|
1. **LLM-based semantic chunking** (chunk_section_with_llm):
|
|
Uses an LLM to identify semantic boundaries and create chunks that preserve
|
|
argumentative coherence. Each chunk is annotated with summary, concepts, type.
|
|
|
|
2. **Simple paragraph-based chunking** (simple_chunk_by_paragraphs):
|
|
A fast fallback that splits text by paragraph boundaries.
|
|
|
|
Semantic Unit Types:
|
|
- argument: A logical argument or reasoning sequence
|
|
- definition: A definition or conceptual clarification
|
|
- example: An illustrative example or case study
|
|
- citation: A quoted passage from another source
|
|
- exposition: Expository content presenting ideas
|
|
- transition: Transitional text between sections
|
|
|
|
Chunk Size Guidelines:
|
|
- Target size: 300-500 words per chunk (configurable)
|
|
- Chunks are never split mid-sentence or mid-paragraph
|
|
- Short sections (< 80% of target) are kept as single chunks
|
|
|
|
LLM Provider Support:
|
|
- ollama: Local LLM (free, slower, default)
|
|
- mistral: Mistral API (faster, requires API key)
|
|
|
|
See Also:
|
|
utils.llm_cleaner: Chunk cleaning and validation
|
|
utils.llm_classifier: Section type classification
|
|
utils.pdf_pipeline: Main pipeline orchestration
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List, Literal, Optional, TypedDict
|
|
|
|
from .llm_structurer import (
|
|
_clean_json_string,
|
|
_get_default_mistral_model,
|
|
_get_default_model,
|
|
call_llm,
|
|
)
|
|
from .llm_cleaner import clean_page_markers, is_chunk_valid
|
|
from .types import LLMProvider, SemanticChunk
|
|
|
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
|
|
|
|
# =============================================================================
|
|
# Type Definitions for LLM Chunker
|
|
# =============================================================================
|
|
|
|
#: Unit type for semantic chunking (specific to this module's LLM output).
#: "main_content" is the fallback label used when a section is returned as a
#: single chunk without LLM processing, or when the LLM omits the type field.
ChunkUnitType = Literal[
    "argument",
    "definition",
    "example",
    "citation",
    "exposition",
    "transition",
    "main_content",
]
|
|
|
|
|
|
class LLMChunkResponse(TypedDict, total=False):
    """Individual chunk structure as returned by LLM.

    Declared with ``total=False`` because the LLM may omit any field;
    consumers read these keys with ``.get()`` and a default.

    Attributes:
        text: Chunk text content (exact copy from source)
        summary: Brief one-sentence summary
        concepts: Key concepts extracted (3-5 items)
        type: Semantic unit type
    """

    # Exact text copied from the source section (the prompt forbids rephrasing).
    text: str
    # One-sentence summary of the chunk.
    summary: str
    # 3-5 key concepts (words or short expressions).
    concepts: List[str]
    # Semantic unit label; kept as plain str because the LLM output is unvalidated.
    type: str
|
|
|
|
|
|
class LLMChunksResult(TypedDict):
    """Complete response structure from LLM chunking.

    This is the top-level shape expected from ``_extract_json_from_response``.

    Attributes:
        chunks: List of chunk objects
    """

    # All chunks produced for one section, in source order.
    chunks: List[LLMChunkResponse]
|
|
|
|
|
|
# Note: SemanticChunk is imported from utils.types
|
|
|
|
|
|
def extract_paragraph_number(text: str) -> Optional[int]:
    """Detect a leading paragraph number at the start of *text*.

    Many philosophical texts number their paragraphs; several common
    numbering formats are recognized (bare number before a capital,
    number glued to a capital, "§ N", "[N]", "N.", "N)").

    Args:
        text: Text content that may start with a paragraph number.

    Returns:
        The paragraph number if detected, None otherwise.

    Example:
        >>> extract_paragraph_number("9 On presente...")
        9
        >>> extract_paragraph_number("Normal text")
        None
    """
    stripped = text.strip()

    # Recognized numbering formats, tried in order.
    numbering_formats: List[str] = [
        r'^(\d+)\s+[A-ZÀ-Ü]',  # "9 On présente..."
        r'^(\d+)[A-ZÀ-Ü]',     # "10Dans la classification..."
        r'^§\s*(\d+)',         # "§ 15 ..."
        r'^\[(\d+)\]',         # "[9] ..."
        r'^(\d+)\.',           # "9. ..."
        r'^(\d+)\)',           # "9) ..."
    ]

    for fmt in numbering_formats:
        hit: Optional[re.Match[str]] = re.match(fmt, stripped)
        if hit is None:
            continue
        try:
            return int(hit.group(1))
        except ValueError:
            # Captured digits did not form a valid int; try the next format.
            pass

    return None
|
|
|
|
|
|
def _extract_json_from_response(text: str) -> Dict[str, Any]:
    """Recover the JSON payload embedded in an LLM response.

    Two strategies are tried in order: first the content wrapped in
    ``<JSON></JSON>`` tags, then the outermost ``{...}`` span of the raw
    text. Both candidates go through ``_clean_json_string`` before parsing.

    Args:
        text: Response text from LLM containing JSON.

    Returns:
        Parsed JSON as dictionary with 'chunks' key. Returns
        {"chunks": []} if parsing fails.
    """
    tagged: Optional[re.Match[str]] = re.search(
        r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL
    )
    if tagged:
        candidate: str = _clean_json_string(tagged.group(1))
        try:
            parsed: Dict[str, Any] = json.loads(candidate)
            return parsed
        except json.JSONDecodeError:
            # Tagged payload was malformed; fall back to brace extraction.
            pass

    opening: int = text.find("{")
    closing: int = text.rfind("}")
    if opening != -1 and closing > opening:
        candidate = _clean_json_string(text[opening:closing + 1])
        try:
            parsed = json.loads(candidate)
            return parsed
        except json.JSONDecodeError as e:
            logger.warning(f"JSON invalide: {e}")

    return {"chunks": []}
|
|
|
|
|
|
def chunk_section_with_llm(
    section_content: str,
    section_title: str,
    chapter_title: Optional[str] = None,
    subsection_title: Optional[str] = None,
    section_level: int = 1,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.2,
    target_chunk_size: int = 400,
) -> List[SemanticChunk]:
    """Split a section into semantically coherent chunks using an LLM.

    This is the main semantic chunking function. It uses an LLM to identify
    natural semantic boundaries in academic/philosophical texts, preserving
    argumentative coherence and annotating each chunk with metadata.

    Args:
        section_content: The text content of the section to chunk.
        section_title: Title of the current section being chunked.
        chapter_title: Title of the parent chapter (level 1) for context.
        subsection_title: Title of parent subsection (level 2) if applicable.
        section_level: Hierarchy level (1=chapter, 2=section, etc.).
        model: LLM model name. If None, uses provider default.
        provider: LLM provider ("ollama" for local, "mistral" for API).
        temperature: LLM temperature (lower = more deterministic).
        target_chunk_size: Target number of words per chunk.

    Returns:
        List of SemanticChunk dictionaries containing text, summary,
        concepts, type, section_level, and optionally paragraph_number.

    Note:
        If section is shorter than 80% of target_chunk_size, it is returned
        as a single chunk. If LLM fails, returns section with error field.
    """
    def _single_chunk(text: str, error: Optional[str] = None) -> SemanticChunk:
        """Build a one-chunk fallback covering the whole section.

        Previously this dict was built in three places with diverging
        fields (the short-section path attached subsection_title, the two
        fallback paths did not); centralizing it makes them consistent.
        """
        chunk: SemanticChunk = {
            "text": text,
            "summary": section_title,
            "concepts": [],
            "type": "main_content",
            "section_level": section_level,
        }
        para_num: Optional[int] = extract_paragraph_number(text)
        if para_num is not None:
            chunk["paragraph_number"] = para_num
        if subsection_title and subsection_title != section_title:
            chunk["subsection_title"] = subsection_title
        if error is not None:
            chunk["error"] = error
        return chunk

    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()

    # Strip page markers before measuring / prompting.
    content: str = clean_page_markers(section_content)

    # Short sections (< 80% of target) are not worth splitting.
    word_count: int = len(content.split())
    if word_count < target_chunk_size * 0.8:
        return [_single_chunk(content)]

    chapter_info: str = f"Chapitre: {chapter_title}\n" if chapter_title else ""

    # NOTE(review): the prompt requests French type labels ("définition",
    # "exemple") while ChunkUnitType declares English values — the raw LLM
    # string is stored as-is; confirm downstream expectations before
    # normalizing either side.
    prompt = f"""Tu es un expert en analyse de textes académiques.

TÂCHE: Découper ce texte en unités sémantiques cohérentes.

{chapter_info}Section: {section_title}

RÈGLES DE DÉCOUPAGE:
1. Chaque chunk doit avoir un SENS COMPLET (une idée, un argument)
2. Taille idéale: {target_chunk_size - 100} à {target_chunk_size + 100} mots
3. NE PAS couper au milieu d'une phrase ou d'un paragraphe
4. NE PAS couper au milieu d'une citation
5. Regrouper les paragraphes qui développent la même idée
6. Un chunk peut être plus long si nécessaire pour préserver le sens

POUR CHAQUE CHUNK, INDIQUE:
- text: le texte exact (copié, pas reformulé)
- summary: résumé en 1 phrase courte
- concepts: 3-5 concepts clés (mots ou expressions)
- type: argument | définition | exemple | citation | exposition | transition

TEXTE À DÉCOUPER:
{content}

RÉPONDS avec un JSON entre <JSON></JSON>:

<JSON>
{{
  "chunks": [
    {{
      "text": "Premier paragraphe ou groupe de paragraphes...",
      "summary": "Présentation de l'idée principale",
      "concepts": ["concept1", "concept2", "concept3"],
      "type": "exposition"
    }},
    {{
      "text": "Deuxième partie du texte...",
      "summary": "Développement de l'argument",
      "concepts": ["concept4", "concept5"],
      "type": "argument"
    }}
  ]
}}
</JSON>
"""

    # Lazy %-style args: formatting is skipped when INFO is disabled.
    logger.info(
        "Chunking sémantique de '%s' (%d mots) via %s",
        section_title, word_count, provider.upper(),
    )

    try:
        response: str = call_llm(
            prompt, model=model, provider=provider, temperature=temperature, timeout=300
        )
        result: Dict[str, Any] = _extract_json_from_response(response)
        chunks: List[Dict[str, Any]] = result.get("chunks", [])

        # Validate chunks and extract paragraph numbers.
        valid_chunks: List[SemanticChunk] = []
        for raw_chunk in chunks:
            text: str = raw_chunk.get("text", "")
            if not is_chunk_valid(text):
                continue

            chunk_data: SemanticChunk = {
                "text": text,
                "summary": raw_chunk.get("summary", ""),
                "concepts": raw_chunk.get("concepts", []),
                "type": raw_chunk.get("type", "main_content"),
                "section_level": section_level,
            }

            # Attach the paragraph number if one is detected.
            para_num: Optional[int] = extract_paragraph_number(text)
            if para_num is not None:
                chunk_data["paragraph_number"] = para_num

            # Attach the full hierarchy.
            if subsection_title and subsection_title != section_title:
                chunk_data["subsection_title"] = subsection_title

            valid_chunks.append(chunk_data)

        # No usable chunk: fall back to the whole section as one chunk.
        if not valid_chunks:
            logger.warning(
                "Aucun chunk valide pour '%s', retour contenu complet", section_title
            )
            return [_single_chunk(content)]

        logger.info(
            "Section '%s' découpée en %d chunks", section_title, len(valid_chunks)
        )
        return valid_chunks

    except Exception as e:
        # Broad catch at the LLM boundary: any failure degrades to one chunk
        # carrying the error message instead of crashing the pipeline.
        logger.error("Erreur chunking LLM: %s", e)
        return [_single_chunk(content, error=str(e))]
|
|
|
|
|
|
def simple_chunk_by_paragraphs(
    content: str,
    max_words: int = 500,
    min_words: int = 100,
) -> List[str]:
    """Split text into chunks by paragraph boundaries (no LLM required).

    This is a fast fallback chunking method that respects paragraph and
    sentence boundaries. Use when LLM processing is not desired.

    The algorithm:
    1. Split by double newlines (paragraph boundaries)
    2. Merge small paragraphs until max_words is reached
    3. Split over-long paragraphs at sentence boundaries; sentence runs
       taken from one paragraph are re-joined with single spaces (fix:
       they previously got '\\n\\n' joins, fabricating paragraph breaks
       inside a single source paragraph)
    4. Filter chunks below min_words threshold

    Args:
        content: Text content to split into chunks.
        max_words: Maximum words per chunk. Defaults to 500.
        min_words: Minimum words per chunk. Defaults to 100.

    Returns:
        List of text chunks as strings.

    Example:
        >>> chunks = simple_chunk_by_paragraphs(text, max_words=400)
        >>> len(chunks)
        3
    """
    content = clean_page_markers(content)

    # Split on paragraph boundaries (double newlines).
    paragraphs: List[str] = re.split(r'\n\n+', content)

    chunks: List[str] = []
    current_chunk: List[str] = []   # accumulated units (paragraphs or sentence runs)
    current_words: int = 0

    def _flush() -> None:
        """Emit the accumulated units as one chunk and reset the buffer."""
        nonlocal current_chunk, current_words
        if current_chunk:
            chunks.append('\n\n'.join(current_chunk))
            current_chunk = []
            current_words = 0

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue

        para_words: int = len(para.split())

        if para_words > max_words:
            # Paragraph alone exceeds the limit: flush, then split it by
            # sentences, keeping each run joined with spaces (one paragraph).
            _flush()
            sentences: List[str] = re.split(r'(?<=[.!?])\s+', para)
            run: List[str] = []
            run_words: int = 0
            for sentence in sentences:
                sentence_words: int = len(sentence.split())
                if run_words + sentence_words > max_words and run:
                    chunks.append(' '.join(run))
                    run = [sentence]
                    run_words = sentence_words
                else:
                    run.append(sentence)
                    run_words += sentence_words
            if run:
                # Carry the remainder forward so following short paragraphs
                # can still merge with it (matches previous accumulation).
                current_chunk = [' '.join(run)]
                current_words = run_words

        elif current_words + para_words > max_words:
            # Adding this paragraph would overflow: start a new chunk.
            _flush()
            current_chunk = [para]
            current_words = para_words

        else:
            current_chunk.append(para)
            current_words += para_words

    # Last chunk.
    _flush()

    # Drop too-short chunks, unless there is only one chunk at all.
    return [c for c in chunks if len(c.split()) >= min_words or len(chunks) == 1]
|
|
|
|
|
|
def extract_concepts_from_chunk(
    chunk_text: str,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
) -> List[str]:
    """Extract key concepts from a text chunk using an LLM.

    Useful for enriching chunks created without LLM processing or for
    extracting additional concepts from existing chunks.

    Args:
        chunk_text: The text content to analyze for concepts.
        model: LLM model name. If None, uses provider default.
        provider: LLM provider ("ollama" or "mistral").

    Returns:
        List of 3-5 key concepts (words or short phrases). Non-string or
        empty entries returned by the LLM are discarded. Returns an empty
        list if extraction fails or text is too short (< 100 chars).

    Example:
        >>> concepts = extract_concepts_from_chunk("L'etre-pour-la-mort...")
        >>> concepts
        ['etre-pour-la-mort', 'structure existentiale', 'Dasein']
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()

    # Very short fragments carry too little signal to name concepts.
    if len(chunk_text) < 100:
        return []

    prompt: str = f"""Extrait les 3-5 concepts clés de ce texte.
Un concept = un mot ou une expression courte (2-3 mots max).

Texte:
{chunk_text[:1500]}

Réponds avec une liste JSON simple:
["concept1", "concept2", "concept3"]
"""

    try:
        response: str = call_llm(prompt, model=model, provider=provider, temperature=0.1, timeout=60)

        # Grab the first JSON array in the response.
        match: Optional[re.Match[str]] = re.search(r'\[.*?\]', response, re.DOTALL)
        if match is None:
            return []

        parsed = json.loads(match.group())
        # Guard against malformed payloads: the annotation promises
        # List[str], so keep only non-empty string entries.
        concepts: List[str] = [c.strip() for c in parsed if isinstance(c, str) and c.strip()]
        return concepts[:5]  # Max 5 concepts

    except Exception as e:
        # Best-effort enrichment: any failure (LLM call, bad JSON) yields [].
        logger.warning("Erreur extraction concepts: %s", e)
        return []
|
|
|