Add Library RAG project and cleanup root directory
- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
74
generations/library_rag/utils/__init__.py
Normal file
74
generations/library_rag/utils/__init__.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""Utils - Pipeline de parsing PDF avec OCR Mistral et structuration LLM.
|
||||
|
||||
Version 2.0 : Pipeline intelligent avec extraction LLM des métadonnées,
|
||||
TOC, classification des sections, chunking sémantique et validation.
|
||||
"""
|
||||
|
||||
from .mistral_client import create_client, get_api_key, estimate_ocr_cost
|
||||
from .pdf_uploader import upload_pdf
|
||||
from .ocr_processor import run_ocr, serialize_ocr_response
|
||||
from .markdown_builder import build_markdown
|
||||
from .image_extractor import extract_images, create_image_writer
|
||||
from .hierarchy_parser import build_hierarchy
|
||||
from .llm_structurer import structure_with_llm, LLMStructureError
|
||||
|
||||
# Nouveaux modules LLM v2
|
||||
from .llm_metadata import extract_metadata
|
||||
from .llm_toc import extract_toc
|
||||
from .llm_classifier import classify_sections, filter_indexable_sections
|
||||
from .llm_cleaner import clean_chunk, clean_page_markers, is_chunk_valid
|
||||
from .llm_chunker import chunk_section_with_llm, simple_chunk_by_paragraphs, extract_concepts_from_chunk, extract_paragraph_number
|
||||
from .llm_validator import validate_document, apply_corrections, enrich_chunks_with_concepts
|
||||
|
||||
# Pipeline
|
||||
from .pdf_pipeline import process_pdf, process_pdf_v2, process_pdf_bytes
|
||||
from .weaviate_ingest import ingest_document, delete_document_chunks
|
||||
|
||||
__all__ = [
|
||||
# Client Mistral
|
||||
"create_client",
|
||||
"get_api_key",
|
||||
"estimate_ocr_cost",
|
||||
# Upload
|
||||
"upload_pdf",
|
||||
# OCR
|
||||
"run_ocr",
|
||||
"serialize_ocr_response",
|
||||
# Markdown
|
||||
"build_markdown",
|
||||
# Images
|
||||
"extract_images",
|
||||
"create_image_writer",
|
||||
# Hiérarchie
|
||||
"build_hierarchy",
|
||||
# LLM Legacy
|
||||
"structure_with_llm",
|
||||
"LLMStructureError",
|
||||
# LLM v2 - Métadonnées
|
||||
"extract_metadata",
|
||||
# LLM v2 - TOC
|
||||
"extract_toc",
|
||||
# LLM v2 - Classification
|
||||
"classify_sections",
|
||||
"filter_indexable_sections",
|
||||
# LLM v2 - Nettoyage
|
||||
"clean_chunk",
|
||||
"clean_page_markers",
|
||||
"is_chunk_valid",
|
||||
# LLM v2 - Chunking
|
||||
"chunk_section_with_llm",
|
||||
"simple_chunk_by_paragraphs",
|
||||
"extract_concepts_from_chunk",
|
||||
"extract_paragraph_number",
|
||||
# LLM v2 - Validation
|
||||
"validate_document",
|
||||
"apply_corrections",
|
||||
"enrich_chunks_with_concepts",
|
||||
# Pipeline
|
||||
"process_pdf",
|
||||
"process_pdf_v2",
|
||||
"process_pdf_bytes",
|
||||
# Weaviate
|
||||
"ingest_document",
|
||||
"delete_document_chunks",
|
||||
]
|
||||
267
generations/library_rag/utils/hierarchy_parser.py
Normal file
267
generations/library_rag/utils/hierarchy_parser.py
Normal file
@@ -0,0 +1,267 @@
|
||||
"""Hierarchical Markdown document parser for semantic chunking.
|
||||
|
||||
This module provides utilities for parsing Markdown documents into
|
||||
hierarchical structures based on heading levels (# to ######). It is
|
||||
a key component of the RAG pipeline, enabling:
|
||||
|
||||
1. **Structure Extraction**: Parse Markdown into a tree of sections
|
||||
2. **Context Preservation**: Maintain hierarchical context (part > chapter > section)
|
||||
3. **Semantic Chunking**: Flatten hierarchy into chunks with full path context
|
||||
|
||||
The parser uses a stack-based algorithm to build nested section trees,
|
||||
preserving the document's logical structure for downstream processing.
|
||||
|
||||
Architecture:
|
||||
Input: Raw Markdown text with headings
|
||||
↓
|
||||
build_hierarchy() → DocumentHierarchy (tree structure)
|
||||
↓
|
||||
flatten_hierarchy() → List[FlatChunk] (with hierarchical context)
|
||||
|
||||
TypedDict Definitions:
|
||||
- HierarchyPath: Hierarchical path (part/chapter/section/subsection)
|
||||
- HierarchyNode: Tree node with title, level, content, children
|
||||
- DocumentHierarchy: Complete document structure
|
||||
- FlatChunk: Flattened chunk with context for RAG ingestion
|
||||
|
||||
Algorithm:
|
||||
The build_hierarchy() function uses a stack-based approach:
|
||||
1. Initialize a virtual root node at level 0
|
||||
2. For each line in the document:
|
||||
- If heading: pop stack until parent level found, then push new node
|
||||
- If content: append to current node's content
|
||||
3. Finalize nodes by joining content lines
|
||||
|
||||
Example:
|
||||
>>> markdown = '''
|
||||
... # Introduction
|
||||
... This is the intro.
|
||||
...
|
||||
... ## Background
|
||||
... Some background text.
|
||||
...
|
||||
... ## Methodology
|
||||
... Methods used here.
|
||||
... '''
|
||||
>>> hierarchy = build_hierarchy(markdown)
|
||||
>>> print(hierarchy["sections"][0]["title"])
|
||||
'Introduction'
|
||||
>>> chunks = flatten_hierarchy(hierarchy)
|
||||
>>> for chunk in chunks:
|
||||
... print(f"{chunk['chunk_id']}: {chunk['title']}")
|
||||
chunk_00001: Introduction
|
||||
chunk_00002: Background
|
||||
chunk_00003: Methodology
|
||||
|
||||
See Also:
|
||||
- utils.llm_chunker: Semantic chunking using LLM
|
||||
- utils.markdown_builder: Markdown generation from OCR
|
||||
- utils.weaviate_ingest: Ingestion of chunks into Weaviate
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import List, Optional, Pattern, TypedDict
|
||||
|
||||
# Import type definitions from central types module
|
||||
from utils.types import (
|
||||
DocumentHierarchy,
|
||||
FlatChunk,
|
||||
HierarchyNode,
|
||||
HierarchyPath,
|
||||
)
|
||||
|
||||
|
||||
class _BuildNode(TypedDict):
    """Internal mutable node used while building the hierarchy.

    Content lines are accumulated as a list and joined into a single
    string only when the whole document has been scanned.
    """

    # Heading text; None only for the virtual root node.
    title: Optional[str]
    # Heading depth: 0 for the virtual root, 1-6 for Markdown headings.
    level: int
    # Raw content lines collected so far (joined at finalization).
    content: List[str]
    # Nested child sections, in document order.
    children: List[_BuildNode]
|
||||
|
||||
|
||||
def build_hierarchy(markdown_text: str) -> DocumentHierarchy:
    """Build a hierarchical structure from Markdown headings.

    Scans heading lines (# through ######) and assembles a tree of
    sections, each carrying its own textual content.

    Args:
        markdown_text: Markdown text to analyse.

    Returns:
        Dictionary with:
        - preamble: text appearing before the first heading
        - sections: list of nested sections

    Each section holds:
        - title: section title
        - level: heading level (1-6)
        - content: textual content
        - children: nested subsections
    """
    # Markdown heading pattern: 1-6 '#' characters, a space, then the title.
    heading_pattern: Pattern[str] = re.compile(r"^(#{1,6})\s+(.*)$")

    # Virtual root at level 0 anchors the stack; it is never popped
    # because real headings always have level >= 1.
    root: _BuildNode = {"title": None, "level": 0, "content": [], "children": []}
    open_nodes: List[_BuildNode] = [root]

    for raw_line in markdown_text.splitlines():
        text: str = raw_line.rstrip()
        heading: Optional[re.Match[str]] = heading_pattern.match(text)

        if heading is None:
            # Plain content line: attach to the innermost open section.
            open_nodes[-1]["content"].append(text)
            continue

        depth: int = len(heading.group(1))
        heading_title: str = heading.group(2).strip()

        # Unwind to the nearest ancestor strictly shallower than this heading.
        while open_nodes and open_nodes[-1]["level"] >= depth:
            open_nodes.pop()

        fresh: _BuildNode = {
            "title": heading_title,
            "level": depth,
            "content": [],
            "children": [],
        }
        open_nodes[-1]["children"].append(fresh)
        open_nodes.append(fresh)

    def finalize(node: _BuildNode) -> HierarchyNode:
        """Convert a build node into a final node (content lines joined)."""
        return HierarchyNode(
            title=node["title"],
            level=node["level"],
            content="\n".join(node["content"]).strip(),
            children=[finalize(child) for child in node["children"]],
        )

    return DocumentHierarchy(
        preamble="\n".join(root["content"]).strip(),
        sections=[finalize(child) for child in root["children"]],
    )
|
||||
|
||||
|
||||
def flatten_hierarchy(hierarchy: DocumentHierarchy) -> List[FlatChunk]:
    """Flatten the hierarchy into a linear list of chunks.

    Args:
        hierarchy: Hierarchical structure (output of build_hierarchy).

    Returns:
        List of chunks, each annotated with its hierarchical context.
    """
    chunks: List[FlatChunk] = []

    # The preamble, when non-empty, becomes the very first chunk.
    if hierarchy.get("preamble"):
        chunks.append(
            {
                "chunk_id": "chunk_00000",
                "text": hierarchy["preamble"],
                "hierarchy": HierarchyPath(
                    part=None,
                    chapter=None,
                    section=None,
                    subsection=None,
                ),
                "type": "preamble",
                "level": 0,
                "title": None,
            }
        )

    def walk(node: HierarchyNode, path: HierarchyPath, next_id: int) -> int:
        """Recursively emit chunks for a section and its descendants.

        Args:
            node: Section node to process.
            path: Hierarchical path inherited from the parent.
            next_id: Index of the next chunk to allocate.

        Returns:
            Updated next chunk index after this subtree.
        """
        depth: int = node["level"]
        heading: Optional[str] = node["title"]

        # Derive this node's path from the parent's; entering a level
        # resets every deeper slot.
        branch: HierarchyPath = path.copy()
        if depth == 1:
            branch = HierarchyPath(
                part=heading,
                chapter=None,
                section=None,
                subsection=None,
            )
        elif depth == 2:
            branch["chapter"] = heading
            branch["section"] = None
            branch["subsection"] = None
        elif depth == 3:
            branch["section"] = heading
            branch["subsection"] = None
        elif depth >= 4:
            branch["subsection"] = heading

        # Only sections with actual text produce a chunk of their own.
        if node["content"]:
            chunks.append(
                {
                    "chunk_id": f"chunk_{next_id:05d}",
                    "text": node["content"],
                    "hierarchy": branch.copy(),
                    "type": "main_content",
                    "level": depth,
                    "title": heading,
                }
            )
            next_id += 1

        for child in node["children"]:
            next_id = walk(child, branch, next_id)

        return next_id

    # Numbering starts at 1; chunk_00000 is reserved for the preamble.
    cursor: int = 1
    start_path: HierarchyPath = HierarchyPath(
        part=None,
        chapter=None,
        section=None,
        subsection=None,
    )
    for top_section in hierarchy.get("sections", []):
        cursor = walk(top_section, start_path, cursor)

    return chunks
|
||||
192
generations/library_rag/utils/image_extractor.py
Normal file
192
generations/library_rag/utils/image_extractor.py
Normal file
@@ -0,0 +1,192 @@
|
||||
"""Image extraction and storage from OCR API responses.
|
||||
|
||||
This module provides utilities for extracting and saving images from
|
||||
Mistral OCR API responses. It is a companion module to markdown_builder,
|
||||
handling the image-specific aspects of document processing.
|
||||
|
||||
Features:
|
||||
- **Image Writer Factory**: Creates reusable callbacks for image saving
|
||||
- **Batch Extraction**: Processes all images from an OCR response
|
||||
- **Protocol-based Design**: Flexible interface for custom implementations
|
||||
|
||||
Pipeline Position:
|
||||
OCR Response → **Image Extractor** → Saved images + paths for Markdown
|
||||
|
||||
Components:
|
||||
1. ImageWriterProtocol: Interface definition for image saving
|
||||
2. create_image_writer(): Factory for standard file-based writers
|
||||
3. extract_images(): Batch extraction from OCR responses
|
||||
|
||||
Integration:
|
||||
The image writer is designed to integrate with markdown_builder:
|
||||
|
||||
>>> from utils.image_extractor import create_image_writer
|
||||
>>> from utils.markdown_builder import build_markdown
|
||||
>>>
|
||||
>>> writer = create_image_writer(Path("output/doc/images"))
|
||||
>>> markdown = build_markdown(ocr_response, image_writer=writer)
|
||||
|
||||
Standalone Usage:
|
||||
>>> from pathlib import Path
|
||||
>>> from utils.image_extractor import extract_images
|
||||
>>>
|
||||
>>> # Extract all images from OCR response
|
||||
>>> paths = extract_images(ocr_response, Path("output/my_doc"))
|
||||
>>> print(f"Extracted {len(paths)} images")
|
||||
|
||||
File Naming Convention:
|
||||
Images are named: page{N}_img{M}.png
|
||||
- N: Page number (1-based)
|
||||
- M: Image index within page (1-based)
|
||||
- Format: Always PNG (base64 from Mistral is PNG)
|
||||
|
||||
Note:
|
||||
- All indices are 1-based for consistency with page numbering
|
||||
- The images subdirectory is created automatically if needed
|
||||
- Base64 data without proper encoding is silently skipped
|
||||
- Large documents may produce many images; monitor disk space
|
||||
|
||||
See Also:
|
||||
- utils.markdown_builder: Uses ImageWriter for markdown generation
|
||||
- utils.mistral_client: Source of OCR responses with image data
|
||||
"""
|
||||
|
||||
import base64
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, List, Optional, Protocol
|
||||
|
||||
|
||||
class ImageWriterProtocol(Protocol):
    """Protocol for image writing callbacks.

    This protocol defines the structural interface for callables that save
    images extracted from OCR responses and return a relative
    path for markdown references.

    The protocol expects:
        - page_idx: 1-based page number
        - img_idx: 1-based image index within the page
        - image_b64: Base64-encoded image data

    Returns:
        Relative path to the saved image for markdown inclusion.

    Example:
        >>> def my_writer(page_idx: int, img_idx: int, image_b64: str) -> str:
        ...     # Custom saving logic
        ...     return f"images/page{page_idx}_img{img_idx}.png"
    """

    def __call__(self, page_idx: int, img_idx: int, image_b64: str) -> str:
        """Save the image and return its relative path for markdown reference.

        Implementations receive a 1-based page number, a 1-based image
        index within that page, and the base64-encoded image payload.
        """
        ...
|
||||
|
||||
|
||||
# Type alias for image writer callables: (page_idx, img_idx, image_b64) -> relative path
ImageWriter = Callable[[int, int, str], str]


def create_image_writer(images_dir: Path) -> ImageWriter:
    """Create a function for saving images to disk.

    This factory creates a closure that saves base64-encoded images to
    the specified directory and returns relative paths suitable for
    markdown image references.

    Args:
        images_dir: Directory path where images will be saved.
            The directory will be created if it doesn't exist.

    Returns:
        A callable that accepts (page_idx, img_idx, image_b64) and
        returns the relative path to the saved image.

    Example:
        >>> from pathlib import Path
        >>> writer = create_image_writer(Path("output/images"))
        >>> writer(1, 1, "iVBORw0KGgoAAAANS...")
        'images/page1_img1.png'
    """
    # Create directory if it doesn't exist
    images_dir.mkdir(parents=True, exist_ok=True)

    def writer(page_idx: int, img_idx: int, image_b64: str) -> str:
        """Save an image and return its relative path.

        Args:
            page_idx: Page number (1-based).
            img_idx: Image index within the page (1-based).
            image_b64: Base64-encoded image data.

        Returns:
            Relative path to the saved image file.
        """
        filename: str = f"page{page_idx}_img{img_idx}.png"
        filepath: Path = images_dir / filename

        # Decode and save
        image_data: bytes = base64.b64decode(image_b64)
        filepath.write_bytes(image_data)

        # Return relative path for markdown.
        # BUG FIX: this previously returned the literal string
        # "images/(unknown)" for every image, so markdown references
        # never pointed at the saved file.
        return f"images/{filename}"

    return writer
|
||||
|
||||
|
||||
def extract_images(ocr_response: Any, output_dir: Path) -> List[str]:
    """Extract every image embedded in an OCR response.

    Walks all pages of the response, decodes any base64 image payloads
    and writes them under ``output_dir / "images"``.

    Args:
        ocr_response: OCR response object from the Mistral API. Expected
            to expose a ``pages`` attribute, where each page may carry an
            ``images`` list of objects with ``image_base64`` attributes.
        output_dir: Base output directory. Images are saved in a
            subdirectory named "images" (created automatically).

    Returns:
        List of absolute file paths to the extracted images.

    Example:
        >>> from pathlib import Path
        >>> paths = extract_images(ocr_response, Path("output/my_doc"))
        >>> for path in paths:
        ...     print(path)
        'C:/output/my_doc/images/page1_img1.png'
        'C:/output/my_doc/images/page2_img1.png'

    Note:
        - Pages and images are 1-indexed in filenames
        - Images without base64 data are silently skipped
        - The images subdirectory is created automatically
    """
    target_dir: Path = output_dir / "images"
    target_dir.mkdir(parents=True, exist_ok=True)

    saved_paths: List[str] = []

    for page_no, page in enumerate(ocr_response.pages, start=1):
        page_images = getattr(page, "images", None)
        if not page_images:
            continue

        for image_no, image in enumerate(page_images, start=1):
            payload: Optional[str] = getattr(image, "image_base64", None)
            if not payload:
                # No usable data for this image entry; skip silently.
                continue

            # Decode the base64 payload and persist it to disk.
            destination: Path = target_dir / f"page{page_no}_img{image_no}.png"
            destination.write_bytes(base64.b64decode(payload))

            saved_paths.append(str(destination))

    return saved_paths
|
||||
319
generations/library_rag/utils/llm_chat.py
Normal file
319
generations/library_rag/utils/llm_chat.py
Normal file
@@ -0,0 +1,319 @@
|
||||
"""Multi-LLM Integration Module for Chat Conversation.
|
||||
|
||||
Provides a unified interface for calling different LLM providers with streaming support:
|
||||
- Ollama (local, free)
|
||||
- Mistral API
|
||||
- Anthropic API (Claude)
|
||||
- OpenAI API
|
||||
|
||||
Example:
|
||||
>>> for token in call_llm("Hello world", "ollama", "qwen2.5:7b"):
|
||||
... print(token, end="", flush=True)
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import logging
|
||||
from typing import Iterator, Optional
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LLMError(Exception):
    """Base exception raised for all LLM provider failures."""
|
||||
|
||||
|
||||
def call_llm(
    prompt: str,
    provider: str,
    model: str,
    stream: bool = True,
    temperature: float = 0.7,
    max_tokens: int = 16384,
) -> Iterator[str]:
    """Call an LLM provider through a unified streaming interface.

    Args:
        prompt: The prompt to send to the LLM.
        provider: Provider name ("ollama", "mistral", "anthropic", "openai").
        model: Model name (e.g., "qwen2.5:7b", "mistral-small-latest", "claude-sonnet-4-5").
        stream: Whether to stream tokens (default: True).
        temperature: Temperature for generation (0-1).
        max_tokens: Maximum tokens to generate (default 16384 for philosophical discussions).

    Yields:
        Tokens as strings (when streaming).

    Raises:
        LLMError: If provider is invalid or API call fails.

    Example:
        >>> for token in call_llm("Test", "ollama", "qwen2.5:7b"):
        ...     print(token, end="")
    """
    provider = provider.lower()

    logger.info(f"[LLM Call] Provider: {provider}, Model: {model}, Stream: {stream}")
    start_time = time.time()

    # Lazily-evaluated dispatch table: each entry builds the backend
    # generator only when that provider is actually selected.
    backends = {
        "ollama": lambda: _call_ollama(prompt, model, temperature, stream),
        "mistral": lambda: _call_mistral(prompt, model, temperature, max_tokens, stream),
        "anthropic": lambda: _call_anthropic(prompt, model, temperature, max_tokens, stream),
        "openai": lambda: _call_openai(prompt, model, temperature, max_tokens, stream),
    }

    try:
        backend = backends.get(provider)
        if backend is None:
            raise LLMError(f"Provider '{provider}' non supporté. Utilisez: ollama, mistral, anthropic, openai")
        yield from backend()

    except Exception as e:
        elapsed = time.time() - start_time
        logger.error(f"[LLM Call] Error after {elapsed:.2f}s: {e}")
        raise

    # Only reached on success: the except branch above re-raises.
    elapsed = time.time() - start_time
    logger.info(f"[LLM Call] Completed in {elapsed:.2f}s")
|
||||
|
||||
|
||||
def _call_ollama(prompt: str, model: str, temperature: float, stream: bool) -> Iterator[str]:
    """Call the Ollama HTTP API, optionally streaming tokens.

    Args:
        prompt: The prompt text.
        model: Ollama model name.
        temperature: Temperature (0-1).
        stream: Whether to stream.

    Yields:
        Tokens from the model.

    Raises:
        LLMError: On any HTTP/transport failure.
    """
    import requests

    base_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
    url = f"{base_url}/api/generate"

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": stream,
        "options": {
            "temperature": temperature,
        }
    }

    try:
        response = requests.post(url, json=payload, stream=stream, timeout=120)
        response.raise_for_status()

        if not stream:
            # Non-stream mode: a single JSON document holds the completion.
            yield response.json().get("response", "")
            return

        # Stream mode: one JSON object per line, token under "response".
        for raw_line in response.iter_lines():
            if not raw_line:
                continue
            try:
                event = json.loads(raw_line)
            except json.JSONDecodeError:
                # Skip malformed lines rather than aborting the stream.
                continue
            piece = event.get("response", "")
            if piece:
                yield piece
            # The final event carries done=True.
            if event.get("done", False):
                break

    except requests.exceptions.RequestException as e:
        raise LLMError(f"Ollama API error: {e}")
|
||||
|
||||
|
||||
def _call_mistral(prompt: str, model: str, temperature: float, max_tokens: int, stream: bool) -> Iterator[str]:
    """Call the Mistral chat API, optionally streaming tokens.

    Args:
        prompt: The prompt text.
        model: Mistral model name.
        temperature: Temperature (0-1).
        max_tokens: Max tokens to generate.
        stream: Whether to stream.

    Yields:
        Tokens from the model.

    Raises:
        LLMError: If the API key or SDK is missing, or the call fails.
    """
    api_key = os.getenv("MISTRAL_API_KEY")
    if not api_key:
        raise LLMError("MISTRAL_API_KEY not set in environment")

    try:
        from mistralai import Mistral
    except ImportError:
        raise LLMError("mistralai package not installed. Run: pip install mistralai")

    client = Mistral(api_key=api_key)
    messages = [{"role": "user", "content": prompt}]

    try:
        if not stream:
            # Non-streaming mode: a single complete() call.
            response = client.chat.complete(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            if response.choices:
                yield response.choices[0].message.content or ""
            return

        # Streaming mode: each event may carry a content delta.
        for chunk in client.chat.stream(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
        ):
            if not chunk.data.choices:
                continue
            delta = chunk.data.choices[0].delta
            content = getattr(delta, 'content', None)
            if content:
                yield content

    except Exception as e:
        raise LLMError(f"Mistral API error: {e}")
|
||||
|
||||
|
||||
def _call_anthropic(prompt: str, model: str, temperature: float, max_tokens: int, stream: bool) -> Iterator[str]:
    """Call the Anthropic API (Claude), optionally streaming tokens.

    Args:
        prompt: The prompt text.
        model: Claude model name.
        temperature: Temperature (0-1).
        max_tokens: Max tokens to generate.
        stream: Whether to stream.

    Yields:
        Tokens from the model.

    Raises:
        LLMError: If the API key or SDK is missing, or the call fails.
    """
    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        raise LLMError("ANTHROPIC_API_KEY not set in environment")

    try:
        from anthropic import Anthropic
    except ImportError:
        raise LLMError("anthropic package not installed. Run: pip install anthropic")

    client = Anthropic(api_key=api_key)

    # Identical request parameters for both modes.
    request = {
        "model": model,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "messages": [{"role": "user", "content": prompt}],
    }

    try:
        if stream:
            # Streaming mode: the SDK context manager exposes a text iterator.
            with client.messages.stream(**request) as token_stream:
                yield from token_stream.text_stream
        else:
            # Non-streaming mode: single create() call.
            response = client.messages.create(**request)
            if response.content:
                yield response.content[0].text

    except Exception as e:
        raise LLMError(f"Anthropic API error: {e}")
|
||||
|
||||
|
||||
def _call_openai(prompt: str, model: str, temperature: float, max_tokens: int, stream: bool) -> Iterator[str]:
    """Call the OpenAI chat completions API, optionally streaming tokens.

    Args:
        prompt: The prompt text.
        model: OpenAI model name.
        temperature: Temperature (0-1).
        max_tokens: Max tokens to generate.
        stream: Whether to stream.

    Yields:
        Tokens from the model.

    Raises:
        LLMError: If the API key or SDK is missing, or the call fails.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise LLMError("OPENAI_API_KEY not set in environment")

    try:
        from openai import OpenAI
    except ImportError:
        raise LLMError("openai package not installed. Run: pip install openai")

    client = OpenAI(api_key=api_key)

    # Reasoning-style models (o1, gpt-5.x) take max_completion_tokens and
    # no explicit temperature; classic models take temperature + max_tokens.
    request = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": stream,
    }
    if model.startswith(("o1", "gpt-5")):
        request["max_completion_tokens"] = max_tokens
    else:
        request["temperature"] = temperature
        request["max_tokens"] = max_tokens

    try:
        completion = client.chat.completions.create(**request)

        if stream:
            # Streaming mode: each chunk may carry a content delta.
            for chunk in completion:
                if not chunk.choices:
                    continue
                delta = chunk.choices[0].delta
                content = getattr(delta, 'content', None)
                if content:
                    yield content
        else:
            # Non-streaming mode: whole answer in the first choice.
            if completion.choices:
                yield completion.choices[0].message.content or ""

    except Exception as e:
        raise LLMError(f"OpenAI API error: {e}")
|
||||
495
generations/library_rag/utils/llm_chunker.py
Normal file
495
generations/library_rag/utils/llm_chunker.py
Normal file
@@ -0,0 +1,495 @@
|
||||
"""Semantic chunking of documents via LLM.
|
||||
|
||||
This module provides intelligent semantic chunking capabilities for academic and
|
||||
philosophical texts, using Large Language Models (LLM) to identify coherent units
|
||||
of meaning (argumentative units, definitions, examples, citations, etc.).
|
||||
|
||||
Overview:
|
||||
The module offers two chunking strategies:
|
||||
|
||||
1. **LLM-based semantic chunking** (chunk_section_with_llm):
|
||||
Uses an LLM to identify semantic boundaries and create chunks that preserve
|
||||
argumentative coherence. Each chunk is annotated with summary, concepts, type.
|
||||
|
||||
2. **Simple paragraph-based chunking** (simple_chunk_by_paragraphs):
|
||||
A fast fallback that splits text by paragraph boundaries.
|
||||
|
||||
Semantic Unit Types:
|
||||
- argument: A logical argument or reasoning sequence
|
||||
- definition: A definition or conceptual clarification
|
||||
- example: An illustrative example or case study
|
||||
- citation: A quoted passage from another source
|
||||
- exposition: Expository content presenting ideas
|
||||
- transition: Transitional text between sections
|
||||
|
||||
Chunk Size Guidelines:
|
||||
- Target size: 300-500 words per chunk (configurable)
|
||||
- Chunks are never split mid-sentence or mid-paragraph
|
||||
- Short sections (< 80% of target) are kept as single chunks
|
||||
|
||||
LLM Provider Support:
|
||||
- ollama: Local LLM (free, slower, default)
|
||||
- mistral: Mistral API (faster, requires API key)
|
||||
|
||||
See Also:
|
||||
utils.llm_cleaner: Chunk cleaning and validation
|
||||
utils.llm_classifier: Section type classification
|
||||
utils.pdf_pipeline: Main pipeline orchestration
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Literal, Optional, TypedDict
|
||||
|
||||
from .llm_structurer import (
|
||||
_clean_json_string,
|
||||
_get_default_mistral_model,
|
||||
_get_default_model,
|
||||
call_llm,
|
||||
)
|
||||
from .llm_cleaner import clean_page_markers, is_chunk_valid
|
||||
from .types import LLMProvider, SemanticChunk
|
||||
|
||||
logger: logging.Logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Type Definitions for LLM Chunker
|
||||
# =============================================================================
|
||||
|
||||
#: Unit type for semantic chunking (specific to this module's LLM output).
#: Covers the six types listed in the module docstring plus
#: "main_content" (presumably the generic/default label — confirm usage).
ChunkUnitType = Literal[
    "argument",
    "definition",
    "example",
    "citation",
    "exposition",
    "transition",
    "main_content",
]
|
||||
|
||||
|
||||
class LLMChunkResponse(TypedDict, total=False):
    """Individual chunk structure as returned by the LLM.

    Declared with ``total=False`` because the model may omit any key;
    consumers read fields with ``.get()`` and supply defaults.

    Attributes:
        text: Chunk text content (exact copy from the source section).
        summary: Brief one-sentence summary of the chunk.
        concepts: Key concepts extracted (3-5 items).
        type: Semantic unit type. Kept as plain ``str`` because the prompt
            requests French labels that need not match ``ChunkUnitType``.
    """

    text: str
    summary: str
    concepts: List[str]
    type: str
|
||||
|
||||
|
||||
class LLMChunksResult(TypedDict):
    """Top-level response structure expected from the LLM chunking call.

    This is the shape `_extract_json_from_response` falls back to
    (``{"chunks": []}``) when the reply cannot be parsed.

    Attributes:
        chunks: List of chunk objects parsed from the model's JSON reply.
    """

    chunks: List[LLMChunkResponse]
|
||||
|
||||
|
||||
# Note: SemanticChunk is imported from utils.types
|
||||
|
||||
|
||||
def extract_paragraph_number(text: str) -> Optional[int]:
    """Detect a leading paragraph number at the start of *text*.

    Many philosophical texts number their paragraphs; several common
    numbering formats are recognized (bare number before a capital,
    section sign, brackets, dot or parenthesis suffix).

    Args:
        text: Text content that may start with a paragraph number.

    Returns:
        The paragraph number if one is detected, otherwise None.

    Example:
        >>> extract_paragraph_number("9 On presente...")
        9
        >>> extract_paragraph_number("Normal text")
        None
    """
    stripped = text.strip()

    # Recognized paragraph-numbering formats, tried in order.
    numbering_formats = (
        r'^(\d+)\s+[A-ZÀ-Ü]',   # "9 On présente..."
        r'^(\d+)[A-ZÀ-Ü]',      # "10Dans la classification..."
        r'^§\s*(\d+)',          # "§ 15 ..."
        r'^\[(\d+)\]',          # "[9] ..."
        r'^(\d+)\.',            # "9. ..."
        r'^(\d+)\)',            # "9) ..."
    )

    for fmt in numbering_formats:
        hit = re.match(fmt, stripped)
        if hit is None:
            continue
        try:
            return int(hit.group(1))
        except ValueError:
            # Captured digits somehow not an int: try the next format.
            pass

    return None
|
||||
|
||||
|
||||
def _extract_json_from_response(text: str) -> Dict[str, Any]:
|
||||
"""Extract JSON from LLM response text.
|
||||
|
||||
Handles both wrapped JSON (in <JSON></JSON> tags) and raw JSON responses.
|
||||
Falls back to empty chunks list if parsing fails.
|
||||
|
||||
Args:
|
||||
text: Response text from LLM containing JSON.
|
||||
|
||||
Returns:
|
||||
Parsed JSON as dictionary with 'chunks' key. Returns
|
||||
{"chunks": []} if parsing fails.
|
||||
"""
|
||||
json_match: Optional[re.Match[str]] = re.search(
|
||||
r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL
|
||||
)
|
||||
if json_match:
|
||||
json_str: str = _clean_json_string(json_match.group(1))
|
||||
try:
|
||||
result: Dict[str, Any] = json.loads(json_str)
|
||||
return result
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
start: int = text.find("{")
|
||||
end: int = text.rfind("}")
|
||||
if start != -1 and end > start:
|
||||
json_str = _clean_json_string(text[start:end + 1])
|
||||
try:
|
||||
result = json.loads(json_str)
|
||||
return result
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"JSON invalide: {e}")
|
||||
|
||||
return {"chunks": []}
|
||||
|
||||
|
||||
def _whole_section_chunk(
    content: str,
    section_title: str,
    section_level: int,
    *,
    subsection_title: Optional[str] = None,
    error: Optional[str] = None,
) -> SemanticChunk:
    """Build a single chunk covering an entire section.

    Shared by the short-section fast path and the two failure fallbacks of
    chunk_section_with_llm, which previously duplicated this construction.

    Args:
        content: Full (already cleaned) section text.
        section_title: Used as the chunk summary.
        section_level: Hierarchy level recorded on the chunk.
        subsection_title: Recorded only when set and different from
            section_title. The fallback paths pass None, matching the
            original behavior (they never recorded a subsection title).
        error: Error message recorded on the chunk for the failure path.

    Returns:
        A SemanticChunk spanning the whole section.
    """
    chunk: SemanticChunk = {
        "text": content,
        "summary": section_title,
        "concepts": [],
        "type": "main_content",
        "section_level": section_level,
    }
    if error is not None:
        chunk["error"] = error
    para_num: Optional[int] = extract_paragraph_number(content)
    if para_num is not None:
        chunk["paragraph_number"] = para_num
    if subsection_title and subsection_title != section_title:
        chunk["subsection_title"] = subsection_title
    return chunk


def chunk_section_with_llm(
    section_content: str,
    section_title: str,
    chapter_title: Optional[str] = None,
    subsection_title: Optional[str] = None,
    section_level: int = 1,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.2,
    target_chunk_size: int = 400,
) -> List[SemanticChunk]:
    """Split a section into semantically coherent chunks using an LLM.

    This is the main semantic chunking function. It uses an LLM to identify
    natural semantic boundaries in academic/philosophical texts, preserving
    argumentative coherence and annotating each chunk with metadata.

    Args:
        section_content: The text content of the section to chunk.
        section_title: Title of the current section being chunked.
        chapter_title: Title of the parent chapter (level 1) for context.
        subsection_title: Title of parent subsection (level 2) if applicable.
        section_level: Hierarchy level (1=chapter, 2=section, etc.).
        model: LLM model name. If None, uses provider default.
        provider: LLM provider ("ollama" for local, "mistral" for API).
        temperature: LLM temperature (lower = more deterministic).
        target_chunk_size: Target number of words per chunk.

    Returns:
        List of SemanticChunk dictionaries containing text, summary,
        concepts, type, section_level, and optionally paragraph_number.

    Note:
        If section is shorter than 80% of target_chunk_size, it is returned
        as a single chunk. If the LLM fails or returns no valid chunk, the
        whole section is returned as one chunk (with an "error" field on
        the exception path).
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()

    # Strip page markers before measuring and chunking.
    content: str = clean_page_markers(section_content)

    # Short content is not worth splitting: return it as a single chunk.
    word_count: int = len(content.split())
    if word_count < target_chunk_size * 0.8:
        return [
            _whole_section_chunk(
                content, section_title, section_level,
                subsection_title=subsection_title,
            )
        ]

    chapter_info: str = f"Chapitre: {chapter_title}\n" if chapter_title else ""

    # French prompt: kept verbatim, it is runtime model input.
    prompt = f"""Tu es un expert en analyse de textes académiques.

TÂCHE: Découper ce texte en unités sémantiques cohérentes.

{chapter_info}Section: {section_title}

RÈGLES DE DÉCOUPAGE:
1. Chaque chunk doit avoir un SENS COMPLET (une idée, un argument)
2. Taille idéale: {target_chunk_size - 100} à {target_chunk_size + 100} mots
3. NE PAS couper au milieu d'une phrase ou d'un paragraphe
4. NE PAS couper au milieu d'une citation
5. Regrouper les paragraphes qui développent la même idée
6. Un chunk peut être plus long si nécessaire pour préserver le sens

POUR CHAQUE CHUNK, INDIQUE:
- text: le texte exact (copié, pas reformulé)
- summary: résumé en 1 phrase courte
- concepts: 3-5 concepts clés (mots ou expressions)
- type: argument | définition | exemple | citation | exposition | transition

TEXTE À DÉCOUPER:
{content}

RÉPONDS avec un JSON entre <JSON></JSON>:

<JSON>
{{
  "chunks": [
    {{
      "text": "Premier paragraphe ou groupe de paragraphes...",
      "summary": "Présentation de l'idée principale",
      "concepts": ["concept1", "concept2", "concept3"],
      "type": "exposition"
    }},
    {{
      "text": "Deuxième partie du texte...",
      "summary": "Développement de l'argument",
      "concepts": ["concept4", "concept5"],
      "type": "argument"
    }}
  ]
}}
</JSON>
"""

    logger.info(f"Chunking sémantique de '{section_title}' ({word_count} mots) via {provider.upper()}")

    try:
        response: str = call_llm(
            prompt, model=model, provider=provider, temperature=temperature, timeout=300
        )
        result: Dict[str, Any] = _extract_json_from_response(response)
        chunks: List[Dict[str, Any]] = result.get("chunks", [])

        # Validate each chunk and enrich it with paragraph/hierarchy metadata.
        valid_chunks: List[SemanticChunk] = []
        for raw_chunk in chunks:
            text: str = raw_chunk.get("text", "")
            if not is_chunk_valid(text):
                continue

            chunk_data: SemanticChunk = {
                "text": text,
                "summary": raw_chunk.get("summary", ""),
                "concepts": raw_chunk.get("concepts", []),
                "type": raw_chunk.get("type", "main_content"),
                "section_level": section_level,
            }

            # Record a leading paragraph number when one is detected.
            para_num = extract_paragraph_number(text)
            if para_num is not None:
                chunk_data["paragraph_number"] = para_num

            # Record the full hierarchy when it adds information.
            if subsection_title and subsection_title != section_title:
                chunk_data["subsection_title"] = subsection_title

            valid_chunks.append(chunk_data)

        # No valid chunk at all: fall back to the whole section.
        if not valid_chunks:
            logger.warning(f"Aucun chunk valide pour '{section_title}', retour contenu complet")
            return [_whole_section_chunk(content, section_title, section_level)]

        logger.info(f"Section '{section_title}' découpée en {len(valid_chunks)} chunks")
        return valid_chunks

    except Exception as e:
        logger.error(f"Erreur chunking LLM: {e}")
        # Fallback: return the whole section, tagged with the error.
        return [_whole_section_chunk(content, section_title, section_level, error=str(e))]
|
||||
|
||||
|
||||
def simple_chunk_by_paragraphs(
    content: str,
    max_words: int = 500,
    min_words: int = 100,
) -> List[str]:
    """Chunk text on paragraph boundaries without any LLM call.

    Fast fallback splitter: paragraphs (blank-line separated) are merged
    until *max_words* is reached; oversized paragraphs are themselves
    split at sentence boundaries; undersized chunks are dropped at the
    end unless the result is a single chunk.

    Args:
        content: Raw text to split into chunks.
        max_words: Upper bound on words per chunk. Defaults to 500.
        min_words: Lower bound below which a chunk is discarded.
            Defaults to 100.

    Returns:
        The resulting text chunks, in document order.

    Example:
        >>> chunks = simple_chunk_by_paragraphs(text, max_words=400)
        >>> len(chunks)
        3
    """
    cleaned = clean_page_markers(content)

    completed: List[str] = []
    buffer: List[str] = []
    buffered_words = 0

    # Paragraphs are delimited by one or more blank lines.
    for raw_para in re.split(r'\n\n+', cleaned):
        paragraph = raw_para.strip()
        if not paragraph:
            continue

        paragraph_words = len(paragraph.split())

        if paragraph_words > max_words:
            # Oversized paragraph: flush the buffer, then pack sentences.
            if buffer:
                completed.append('\n\n'.join(buffer))
                buffer = []
                buffered_words = 0

            for sentence in re.split(r'(?<=[.!?])\s+', paragraph):
                n_words = len(sentence.split())
                if buffer and buffered_words + n_words > max_words:
                    completed.append('\n\n'.join(buffer))
                    buffer = [sentence]
                    buffered_words = n_words
                else:
                    buffer.append(sentence)
                    buffered_words += n_words

        elif buffered_words + paragraph_words > max_words:
            # Adding this paragraph would overflow: start a new chunk.
            if buffer:
                completed.append('\n\n'.join(buffer))
            buffer = [paragraph]
            buffered_words = paragraph_words

        else:
            buffer.append(paragraph)
            buffered_words += paragraph_words

    # Flush whatever is left in the buffer.
    if buffer:
        completed.append('\n\n'.join(buffer))

    # Drop undersized chunks, except when there is only one chunk overall.
    return [c for c in completed if len(c.split()) >= min_words or len(completed) == 1]
|
||||
|
||||
|
||||
def extract_concepts_from_chunk(
|
||||
chunk_text: str,
|
||||
model: Optional[str] = None,
|
||||
provider: LLMProvider = "ollama",
|
||||
) -> List[str]:
|
||||
"""Extract key concepts from a text chunk using an LLM.
|
||||
|
||||
Useful for enriching chunks created without LLM processing or for
|
||||
extracting additional concepts from existing chunks.
|
||||
|
||||
Args:
|
||||
chunk_text: The text content to analyze for concepts.
|
||||
model: LLM model name. If None, uses provider default.
|
||||
provider: LLM provider ("ollama" or "mistral").
|
||||
|
||||
Returns:
|
||||
List of 3-5 key concepts (words or short phrases). Returns
|
||||
empty list if extraction fails or text is too short (< 100 chars).
|
||||
|
||||
Example:
|
||||
>>> concepts = extract_concepts_from_chunk("L'etre-pour-la-mort...")
|
||||
>>> concepts
|
||||
['etre-pour-la-mort', 'structure existentiale', 'Dasein']
|
||||
"""
|
||||
if model is None:
|
||||
model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
|
||||
|
||||
if len(chunk_text) < 100:
|
||||
return []
|
||||
|
||||
prompt: str = f"""Extrait les 3-5 concepts clés de ce texte.
|
||||
Un concept = un mot ou une expression courte (2-3 mots max).
|
||||
|
||||
Texte:
|
||||
{chunk_text[:1500]}
|
||||
|
||||
Réponds avec une liste JSON simple:
|
||||
["concept1", "concept2", "concept3"]
|
||||
"""
|
||||
|
||||
try:
|
||||
response: str = call_llm(prompt, model=model, provider=provider, temperature=0.1, timeout=60)
|
||||
|
||||
# Chercher la liste JSON
|
||||
match: Optional[re.Match[str]] = re.search(r'\[.*?\]', response, re.DOTALL)
|
||||
if match:
|
||||
concepts: List[str] = json.loads(match.group())
|
||||
return concepts[:5] # Max 5 concepts
|
||||
|
||||
return []
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Erreur extraction concepts: {e}")
|
||||
return []
|
||||
|
||||
582
generations/library_rag/utils/llm_classifier.py
Normal file
582
generations/library_rag/utils/llm_classifier.py
Normal file
@@ -0,0 +1,582 @@
|
||||
"""LLM-based section classification module for document structure analysis.
|
||||
|
||||
This module provides functionality to classify document sections by type
|
||||
(front_matter, chapter, appendix, etc.) using Large Language Models and
|
||||
determine which sections should be indexed for semantic search.
|
||||
|
||||
Key Features:
|
||||
- Section classification via LLM (classify_sections)
|
||||
- Automatic TOC/metadata section exclusion (is_excluded_section)
|
||||
- Post-classification validation (validate_classified_sections)
|
||||
- Filtering for indexable content (filter_indexable_sections)
|
||||
|
||||
Section Types:
|
||||
The following section types are recognized:
|
||||
|
||||
**Indexable Content (should_index=True):**
|
||||
- chapter: Main document content, essays, articles, book reviews
|
||||
- introduction: Document introductions
|
||||
- conclusion: Document conclusions
|
||||
- preface: Prefaces, forewords, warnings (intellectual content)
|
||||
- abstract: Summaries, abstracts
|
||||
|
||||
**Non-Indexable Content (should_index=False):**
|
||||
- front_matter: Title pages, copyright, credits, colophon
|
||||
- toc_display: Table of contents display (not content)
|
||||
- appendix: Document appendices
|
||||
- bibliography: References, bibliography
|
||||
- index: Document index
|
||||
- notes: End notes
|
||||
- ignore: Ads, empty pages, technical metadata
|
||||
|
||||
Classification Strategy:
|
||||
1. LLM analyzes section titles and content previews
|
||||
2. Automatic exclusion rules catch common TOC/metadata patterns
|
||||
3. Post-classification validation detects false positives
|
||||
4. Filtering extracts only indexable content
|
||||
|
||||
Typical Usage:
|
||||
>>> from utils.llm_classifier import classify_sections, filter_indexable_sections
|
||||
>>> sections = [
|
||||
... {"title": "Table of Contents", "content": "...", "level": 1},
|
||||
... {"title": "Introduction", "content": "...", "level": 1},
|
||||
... {"title": "Chapter 1", "content": "...", "level": 1}
|
||||
... ]
|
||||
>>> classified = classify_sections(sections, provider="ollama")
|
||||
>>> indexable = filter_indexable_sections(classified)
|
||||
>>> print([s["title"] for s in indexable])
|
||||
['Introduction', 'Chapter 1']
|
||||
|
||||
LLM Provider Options:
|
||||
- "ollama": Local processing, free but slower
|
||||
- "mistral": Cloud API, faster but incurs costs
|
||||
|
||||
Note:
|
||||
The classifier is designed to handle edge cases like:
|
||||
- Book reviews with analytical content (classified as chapter)
|
||||
- Editor's notes without analysis (classified as front_matter)
|
||||
- TOC fragments embedded in content (detected and excluded)
|
||||
|
||||
See Also:
|
||||
- llm_toc: Table of contents extraction
|
||||
- llm_chunker: Semantic chunking of classified sections
|
||||
- llm_metadata: Document metadata extraction
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import cast, Any, Dict, Final
|
||||
|
||||
from .llm_structurer import (
|
||||
_clean_json_string,
|
||||
_get_default_mistral_model,
|
||||
_get_default_model,
|
||||
call_llm,
|
||||
)
|
||||
from .types import LLMProvider
|
||||
|
||||
logger: logging.Logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# All section types the classifier may assign. Keys are the machine labels
# stored on sections; values are the French descriptions injected into the
# classification prompt (runtime text — do not translate or edit casually).
SECTION_TYPES: Final[dict[str, str]] = {
    "front_matter": "Métadonnées, page de titre, copyright, crédits, NOTE DE L'ÉDITEUR, colophon",
    "toc_display": "Table des matières affichée (pas le contenu)",
    "preface": "Préface, avant-propos, avertissement (contenu intellectuel à indexer)",
    "abstract": "Résumé, abstract",
    "introduction": "Introduction de l'œuvre",
    "chapter": "Chapitre principal du document",
    "conclusion": "Conclusion de l'œuvre",
    "appendix": "Annexes",
    "bibliography": "Bibliographie, références",
    "index": "Index",
    "notes": "Notes de fin",
    "ignore": "À ignorer (publicités, pages vides, métadonnées techniques)",
}
|
||||
|
||||
|
||||
def _extract_json_from_response(text: str) -> dict[str, Any]:
|
||||
"""Extract JSON from LLM response text.
|
||||
|
||||
Handles two formats:
|
||||
1. JSON wrapped in <JSON></JSON> tags
|
||||
2. Raw JSON object in the response
|
||||
|
||||
Args:
|
||||
text: Raw LLM response text.
|
||||
|
||||
Returns:
|
||||
Parsed JSON as dictionary. Returns {"classifications": []} on failure.
|
||||
"""
|
||||
json_match: re.Match[str] | None = re.search(
|
||||
r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL
|
||||
)
|
||||
if json_match:
|
||||
json_str: str = _clean_json_string(json_match.group(1))
|
||||
try:
|
||||
result: Dict[str, Any] = json.loads(json_str)
|
||||
return result
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
start: int = text.find("{")
|
||||
end: int = text.rfind("}")
|
||||
if start != -1 and end > start:
|
||||
json_str = _clean_json_string(text[start:end + 1])
|
||||
try:
|
||||
result = json.loads(json_str)
|
||||
return result
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"JSON invalide: {e}")
|
||||
|
||||
return {"classifications": []}
|
||||
|
||||
|
||||
def classify_sections(
    sections: list[dict[str, Any]],
    document_title: str | None = None,
    model: str | None = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.1,
) -> list[dict[str, Any]]:
    """Classify document sections by type using LLM.

    Uses an LLM to analyze section titles and content previews to determine
    the type of each section (chapter, front_matter, toc_display, etc.) and
    whether it should be indexed for semantic search. The input list is
    mutated in place (classification fields are written onto each dict) and
    also returned for convenience.

    Args:
        sections: List of section dictionaries with keys:
            - title: Section title
            - content: Section content (only a 200-char preview is sent)
            - level: Hierarchy level (1=chapter, 2=section, etc.)
        document_title: Optional document title for context.
        model: LLM model name. If None, uses provider default.
        provider: LLM provider ("ollama" or "mistral").
        temperature: Model temperature (0.0-1.0). Lower = more deterministic.

    Returns:
        Same sections list with added classification fields:
            - type: Section type (SectionType literal)
            - should_index: Whether to include in vector index
            - chapter_number: Chapter number if applicable
            - classification_reason: Explanation for the classification
              (only set for sections the LLM actually classified)

    Note:
        Only the first 50 sections are sent to the LLM; any section beyond
        that (or missing from the reply) falls back to type="chapter",
        should_index=True. On any LLM/parsing error, every section is
        marked indexable so content is never silently dropped.

    Example:
        >>> sections = [{"title": "Introduction", "content": "...", "level": 1}]
        >>> classified = classify_sections(sections, provider="ollama")
        >>> classified[0]["type"]
        'introduction'
        >>> classified[0]["should_index"]
        True
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()

    # Build the compact section summaries sent to the model
    # (index + title + 200-char preview + level).
    sections_for_prompt: list[dict[str, Any]] = []
    for i, section in enumerate(sections[:50]):  # cap the prompt at 50 sections
        sections_for_prompt.append({
            "index": i,
            "title": section.get("title", ""),
            "preview": section.get("content", "")[:200] if section.get("content") else "",
            "level": section.get("level", 1),
        })

    types_description: str = "\n".join([f"- {k}: {v}" for k, v in SECTION_TYPES.items()])
    title_context: str = f"Titre du document: {document_title}\n" if document_title else ""

    # French prompt: runtime model input, kept verbatim.
    prompt: str = f"""Tu es un expert en analyse de structure documentaire.

TÂCHE: Classifier chaque section selon son type.

{title_context}
TYPES DISPONIBLES:
{types_description}

RÈGLES:
1. "front_matter": UNIQUEMENT pages de titre SANS contenu, copyright, colophon (métadonnées pures)
2. "toc_display": la TABLE DES MATIÈRES elle-même (pas son contenu)
3. "preface": préface, avant-propos, avertissement (À INDEXER car contenu intellectuel)
4. "chapter": TOUT contenu principal - chapitres, sections, articles, revues de livre, essais
5. "ignore": publicités, pages vides, métadonnées techniques sans valeur

IMPORTANT - REVUES DE LIVRE ET ARTICLES:
- Une REVUE DE LIVRE ("Book Review") avec analyse critique → chapter, should_index = true
- Un ARTICLE académique avec contenu substantiel → chapter, should_index = true
- Les métadonnées éditoriales (auteur, affiliation, journal) au début d'un article NE sont PAS un motif pour classer comme "front_matter"
- Si le document contient un TEXTE ANALYTIQUE développé → chapter

CAS PARTICULIERS:
- "NOTE DE L'ÉDITEUR" (infos édition, réimpression, SANS analyse) → front_matter, should_index = false
- "PRÉFACE" ou "AVANT-PROPOS" (texte intellectuel) → preface, should_index = true
- "Book Review" ou "Article" avec paragraphes d'analyse → chapter, should_index = true

INDEXATION:
- should_index = true pour: preface, introduction, chapter, conclusion, abstract
- should_index = false pour: front_matter, toc_display, ignore

⚠️ ATTENTION AUX FAUX POSITIFS - LISTE DE TITRES VS CONTENU RÉEL:

LISTE DE TITRES (toc_display, should_index=false):
- Suite de titres courts sans texte explicatif
- Lignes commençant par "Comment...", "Où...", "Les dispositions à..."
- Énumération de sections sans phrase complète
- Exemple: "Comment fixer la croyance?\\nOù la croyance s'oppose au savoir\\nL'idéal de rationalité"

CONTENU RÉEL (chapter, should_index=true):
- Texte avec phrases complètes et verbes conjugués
- Paragraphes développés avec arguments
- Explications, définitions, raisonnements
- Exemple: "Comment fixer la croyance? Cette question se pose dès lors que..."

SECTIONS À CLASSIFIER:
{json.dumps(sections_for_prompt, ensure_ascii=False, indent=2)}

RÉPONDS avec un JSON entre <JSON></JSON>:

<JSON>
{{
  "classifications": [
    {{
      "index": 0,
      "type": "front_matter",
      "should_index": false,
      "chapter_number": null,
      "reason": "Page de titre avec métadonnées éditeur"
    }},
    {{
      "index": 1,
      "type": "chapter",
      "should_index": true,
      "chapter_number": 1,
      "reason": "Premier chapitre du document"
    }}
  ]
}}
</JSON>
"""

    logger.info(f"Classification de {len(sections_for_prompt)} sections via {provider.upper()} ({model})")

    try:
        response: str = call_llm(prompt, model=model, provider=provider, temperature=temperature, timeout=300)
        result: dict[str, Any] = _extract_json_from_response(response)
        classifications: list[dict[str, Any]] = result.get("classifications", [])

        # Map section index -> classification entry (entries without an
        # "index" key are silently dropped).
        class_map: dict[int, dict[str, Any]] = {
            c["index"]: c for c in classifications if "index" in c
        }

        # Write the classification fields onto each section in place.
        for i, section in enumerate(sections):
            if i in class_map:
                c: dict[str, Any] = class_map[i]
                section["type"] = c.get("type", "chapter")
                section["should_index"] = c.get("should_index", True)
                section["chapter_number"] = c.get("chapter_number")
                section["classification_reason"] = c.get("reason", "")
            else:
                # Not classified by the LLM (beyond the 50-section cap or
                # missing from the reply): default to indexable content.
                section["type"] = "chapter"
                section["should_index"] = True
                section["chapter_number"] = None

        # Log a type histogram for observability.
        types_count: dict[str, int] = {}
        for s in sections:
            t: str = s.get("type", "unknown")
            types_count[t] = types_count.get(t, 0) + 1

        logger.info(f"Classification terminée: {types_count}")

        return sections

    except Exception as e:
        logger.error(f"Erreur classification sections: {e}")
        # On failure, mark everything indexable rather than losing content.
        for section in sections:
            section["type"] = "chapter"
            section["should_index"] = True
        return sections
|
||||
|
||||
|
||||
# Section titles that are always excluded from indexing. Matched
# case-insensitively as substrings of the (lowercased) section or parent
# chapter title by is_excluded_section / filter_indexable_sections.
# Entries cover French and English TOC, index and publishing-metadata
# headings; entries are runtime match strings — keep them lowercase.
EXCLUDED_SECTION_TITLES: Final[list[str]] = [
    "table des matières",
    "table des matieres",
    "sommaire",
    "table of contents",
    "contents",
    "toc",
    "index",
    "liste des figures",
    "liste des tableaux",
    "list of figures",
    "list of tables",
    "note de l'éditeur",
    "note de l'editeur",
    "note de la rédaction",
    "copyright",
    "mentions légales",
    "crédits",
    "colophon",
    "achevé d'imprimer",
]
|
||||
|
||||
|
||||
def is_excluded_section(section: dict[str, Any]) -> bool:
    """Decide whether a section must be skipped during indexing.

    Two mechanisms are combined:
    1. Title match (section title or parent chapter title) against the
       known TOC/metadata titles in EXCLUDED_SECTION_TITLES.
    2. Content heuristics that flag TOC-like bodies: several short lines
       that look like headings and/or contain no conjugated verbs.

    Args:
        section: Section dictionary; the optional keys "title",
            "chapterTitle" and "content" are inspected.

    Returns:
        True if the section should be excluded from indexing.

    Example:
        >>> is_excluded_section({"title": "Table des matières"})
        True
        >>> is_excluded_section({"title": "Introduction", "content": "..."})
        False
    """
    own_title: str = section.get("title", "").lower().strip()
    parent_title: str = section.get("chapterTitle", "").lower().strip()

    # 1) Direct title match. Substring containment also covers exact
    #    equality, so a single check per title suffices.
    if any(banned in own_title or banned in parent_title
           for banned in EXCLUDED_SECTION_TITLES):
        return True

    # 2) Content-based TOC detection.
    body: str = section.get("content", "")
    if not body:
        return False

    lines: list[str] = [ln.strip() for ln in body.split("\n") if ln.strip()]

    # Too little material for a reliable heuristic.
    if len(lines) < 3:
        return False

    # Criterion 1: short lines on average (< 50 chars).
    avg_line_len: float = sum(len(ln) for ln in lines) / len(lines)

    # Criterion 2: every inspected line is short (< 100 chars).
    all_short: bool = all(len(ln) < 100 for ln in lines[:10])

    # Criterion 3: lines shaped like typical French section headings.
    heading_patterns: list[str] = [
        r'^Comment\s+.+\?',          # "Comment fixer la croyance?"
        r'^Où\s+.+',                 # "Où la croyance s'oppose"
        r'^Les?\s+\w+\s+à\s+',       # "Les dispositions à penser"
        r'^Que\s+.+\?',              # "Que peut-on savoir?"
        r'^L[ae]\s+\w+\s+(de|du)\s+',  # "La critique de l'intuition"
        r'^Entre\s+.+\s+et\s+',      # "Entre nature et norme"
    ]
    title_like_count: int = sum(
        1 for ln in lines[:10]
        if any(re.match(p, ln, re.IGNORECASE) for p in heading_patterns)
    )

    # Criterion 4: conjugated verbs typical of running (narrative) prose.
    narrative_verbs: list[str] = [
        r'\best\b', r'\bsont\b', r'\bétait\b', r'\bsera\b',
        r'\ba\b', r'\bont\b', r'\bavait\b', r'\bavaient\b',
        r'\bfait\b', r'\bdit\b', r'\bpense\b', r'\bexplique\b',
    ]
    has_narrative: bool = any(
        re.search(v, ln, re.IGNORECASE)
        for ln in lines[:5]
        for v in narrative_verbs
    )

    # Verdict: a TOC is many short lines that either mostly look like
    # headings, or contain no narrative verbs at all.
    if len(lines) >= 5 and avg_line_len < 50 and all_short:
        if title_like_count >= len(lines) * 0.4 or not has_narrative:
            logger.debug(f"Section '{own_title}' exclue: ressemble à une TOC (lignes courtes, {title_like_count}/{len(lines)} titres)")
            return True

    return False
|
||||
|
||||
|
||||
def filter_indexable_sections(sections: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Keep only the sections that should end up in the vector index.

    Exclusion criteria, applied in order:
    1. Automatic exclusion by title/content pattern (is_excluded_section).
    2. Parent chapter whose title matches a TOC/metadata pattern.
    3. The LLM classification flag (a missing should_index defaults to True).

    Args:
        sections: List of classified section dictionaries.

    Returns:
        The indexable subset, order preserved.

    Example:
        >>> sections = [
        ...     {"title": "TOC", "should_index": False},
        ...     {"title": "Chapter 1", "should_index": True}
        ... ]
        >>> filtered = filter_indexable_sections(sections)
        >>> len(filtered)
        1
    """
    kept: list[dict[str, Any]] = []
    dropped: int = 0

    for sec in sections:
        # Rule 1: automatic exclusion (TOC titles, TOC-like content).
        if is_excluded_section(sec):
            logger.info(f"Section exclue automatiquement: '{sec.get('title', 'Sans titre')}'")
            dropped += 1
            continue

        # Rule 2: parent chapter is itself a TOC/metadata section.
        parent_title: str = sec.get("chapterTitle", "").lower().strip()
        if any(banned in parent_title for banned in EXCLUDED_SECTION_TITLES):
            logger.info(f"Section exclue (chapitre TOC): '{sec.get('title', 'Sans titre')}' dans '{parent_title}'")
            dropped += 1
            continue

        # Rule 3: LLM classification verdict.
        if sec.get("should_index", True):
            kept.append(sec)
        else:
            dropped += 1

    if dropped > 0:
        logger.info(f"Sections exclues: {dropped}, indexables: {len(kept)}")

    return kept
|
||||
|
||||
|
||||
def validate_classified_sections(sections: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Post-classification validation to detect false positives.

    Performs additional checks on sections marked ``should_index=True`` to
    catch TOC fragments that escaped the initial classification:
      1. Parent chapter title matches a TOC pattern -> exclude.
      2. Content is mostly short title-like lines -> reclassify as toc_display.

    Sections are mutated in place; corrections are logged and recorded in the
    section's ``validation_correction`` field.

    Args:
        sections: List of already-classified section dictionaries.

    Returns:
        Validated sections (same objects, possibly corrected).

    Example:
        >>> sections = [{"title": "Part 1", "should_index": True, "content": "..."}]
        >>> validated = validate_classified_sections(sections)
        >>> # May reclassify sections with TOC-like content
    """
    validated: list[dict[str, Any]] = []
    fixed_count: int = 0

    for section in sections:
        # Check 1: exclude outright when the parent chapter is itself a TOC.
        # NOTE(review): reads the snake_case key "chapter_title" whereas
        # filter_indexable_sections reads "chapterTitle" — confirm the key.
        chapter_title: str = section.get("chapter_title", "").lower().strip()

        is_toc_chapter: bool = False
        for excluded in EXCLUDED_SECTION_TITLES:
            if excluded in chapter_title:
                logger.warning(f"Section '{section.get('title', 'Sans titre')}' exclue: chapitre parent est '{chapter_title}'")
                section["should_index"] = False
                section["type"] = "toc_display"
                section["validation_correction"] = f"Exclue car chapitre parent = {chapter_title}"
                fixed_count += 1
                is_toc_chapter = True
                break

        if is_toc_chapter:
            validated.append(section)
            continue

        # Already marked non-indexable: keep as-is, nothing more to verify.
        if not section.get("should_index", True):
            validated.append(section)
            continue

        content: str = section.get("content", "")

        # Check 2: content that is mostly short, title-like lines is almost
        # certainly a TOC fragment, regardless of the LLM's verdict.
        if content:
            lines: list[str] = [l.strip() for l in content.split("\n") if l.strip()]

            # Very short content is unlikely to be a TOC dump; accept it.
            if len(lines) < 3:
                validated.append(section)
                continue

            # Ratio of lines that look like (French) question/entry titles.
            title_question_pattern: str = r'^(Comment|Où|Que|Quelle|Quel|Les?\s+\w+\s+(de|du|à)|Entre\s+.+\s+et)\s+'
            title_like: int = sum(1 for l in lines if re.match(title_question_pattern, l, re.IGNORECASE))

            avg_len: float = sum(len(l) for l in lines) / len(lines)

            # >= 50% title-like lines AND short average line length => a list
            # of titles copied from the TOC, not real prose.
            if len(lines) >= 4 and title_like >= len(lines) * 0.5 and avg_len < 55:
                logger.warning(f"Section '{section.get('title', 'Sans titre')}' reclassée: détectée comme liste de titres TOC")
                section["should_index"] = False
                section["type"] = "toc_display"
                section["validation_correction"] = "Reclassée comme toc_display (liste de titres)"
                fixed_count += 1
                validated.append(section)
                continue

        validated.append(section)

    if fixed_count > 0:
        logger.info(f"Validation post-classification: {fixed_count} section(s) reclassée(s)")

    return validated
|
||||
|
||||
|
||||
def get_chapter_sections(sections: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Return only the sections whose type carries main document content.

    Content-bearing types are: chapter, introduction, conclusion, abstract
    and preface. Everything else (TOC, index, bibliography, ...) is dropped.

    Args:
        sections: Classified section dictionaries.

    Returns:
        New list with only chapter-type sections, in original order.

    Example:
        >>> sections = [
        ...     {"title": "TOC", "type": "toc_display"},
        ...     {"title": "Chapter 1", "type": "chapter"}
        ... ]
        >>> len(get_chapter_sections(sections))
        1
    """
    content_types: set[str] = {"chapter", "introduction", "conclusion", "abstract", "preface"}
    chapters: list[dict[str, Any]] = []
    for section in sections:
        if section.get("type") in content_types:
            chapters.append(section)
    return chapters
|
||||
389
generations/library_rag/utils/llm_cleaner.py
Normal file
389
generations/library_rag/utils/llm_cleaner.py
Normal file
@@ -0,0 +1,389 @@
|
||||
"""Text cleaning and validation for OCR-extracted content.
|
||||
|
||||
This module provides utilities for cleaning OCR artifacts from extracted text,
|
||||
validating chunk content, and optionally using LLM for intelligent corrections.
|
||||
It handles common OCR issues like page markers, isolated page numbers,
|
||||
repeated headers/footers, and character recognition errors.
|
||||
|
||||
Overview:
|
||||
The module offers three levels of cleaning:
|
||||
|
||||
1. **Basic cleaning** (clean_page_markers, clean_ocr_artifacts):
|
||||
Fast regex-based cleaning for common issues. Always applied.
|
||||
|
||||
2. **LLM-enhanced cleaning** (clean_content_with_llm):
|
||||
Uses an LLM to correct subtle OCR errors while preserving meaning.
|
||||
Only applied when explicitly requested and for medium-length texts.
|
||||
|
||||
3. **Validation** (is_chunk_valid):
|
||||
Checks if a text chunk contains meaningful content.
|
||||
|
||||
Cleaning Operations:
|
||||
- Remove page markers (<!-- Page X -->)
|
||||
- Remove isolated page numbers
|
||||
- Remove short/repetitive header/footer lines
|
||||
- Normalize multiple spaces and blank lines
|
||||
- Correct obvious OCR character errors (LLM mode)
|
||||
- Preserve citations, technical vocabulary, paragraph structure
|
||||
|
||||
Validation Criteria:
|
||||
- Minimum character count (default: 20)
|
||||
- Minimum word count (default: 5)
|
||||
- Not pure metadata (URLs, ISBNs, DOIs, copyright notices)
|
||||
|
||||
LLM Provider Support:
|
||||
- ollama: Local LLM (free, slower, default)
|
||||
- mistral: Mistral API (faster, requires API key)
|
||||
|
||||
Example:
|
||||
>>> from utils.llm_cleaner import clean_chunk, is_chunk_valid
|
||||
>>>
|
||||
>>> # Clean a chunk with basic cleaning only
|
||||
>>> text = "<!-- Page 42 --> Some philosophical content..."
|
||||
>>> cleaned = clean_chunk(text)
|
||||
>>> print(cleaned)
|
||||
'Some philosophical content...'
|
||||
>>>
|
||||
>>> # Validate chunk before processing
|
||||
>>> if is_chunk_valid(cleaned):
|
||||
... process_chunk(cleaned)
|
||||
|
||||
See Also:
|
||||
utils.llm_chunker: Semantic chunking of sections
|
||||
utils.llm_validator: Document validation and concept extraction
|
||||
utils.pdf_pipeline: Main pipeline orchestration
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import List, Optional, Pattern
|
||||
|
||||
from .llm_structurer import call_llm, _get_default_model, _get_default_mistral_model
|
||||
from .types import LLMProvider
|
||||
|
||||
logger: logging.Logger = logging.getLogger(__name__)
|
||||
|
||||
# Type alias for compiled regex patterns
|
||||
RegexPattern = Pattern[str]
|
||||
|
||||
|
||||
def clean_page_markers(text: str) -> str:
    r"""Strip OCR page markers and collapse excess blank lines.

    Page markers are HTML comments of the form ``<!-- Page 42 -->`` inserted
    during OCR to track page boundaries. They are removed, runs of three or
    more newlines are collapsed to a single blank line, and the result is
    stripped of leading/trailing whitespace.

    Args:
        text: Text possibly containing page markers and extra blank lines.

    Returns:
        The cleaned, stripped text.

    Example:
        >>> text = "<!-- Page 1 -->\nContent here\n\n\n\n<!-- Page 2 -->"
        >>> clean_page_markers(text)
        'Content here'
    """
    without_markers: str = re.sub(r'<!--\s*Page\s*\d+\s*-->', '', text)
    normalized: str = re.sub(r'\n{3,}', '\n\n', without_markers)
    return normalized.strip()
|
||||
|
||||
|
||||
def clean_ocr_artifacts(text: str) -> str:
    r"""Remove common OCR artifacts using fast rule-based cleaning (no LLM).

    Cleaning steps, in order:
      1. Blank out page numbers standing alone on a line (1-4 digits).
      2. Drop very short isolated lines (<= 3 chars) likely to be repeated
         headers/footers — markdown headings ('#'-prefixed) are kept.
      3. Collapse runs of spaces to a single space.
      4. Collapse runs of 3+ newlines to a single blank line.

    Single blank lines are preserved; the result is stripped.

    Args:
        text: Raw OCR-extracted text.

    Returns:
        Cleaned text with artifacts removed and spacing normalized.

    Example:
        >>> text = "42\n\nActual content here\n\n\n\n\nMore text"
        >>> clean_ocr_artifacts(text)
        'Actual content here\n\nMore text'

    Note:
        Always called as part of clean_chunk(); provides baseline cleaning
        even when LLM cleaning is disabled.
    """
    # Step 1: isolated page numbers become empty lines.
    text = re.sub(r'^\d{1,4}\s*$', '', text, flags=re.MULTILINE)

    # Step 2: filter out short header/footer-like lines.
    kept: List[str] = []
    for raw_line in text.split('\n'):
        content: str = raw_line.strip()
        if not content:
            kept.append('')  # preserve single blank lines
        elif len(content) > 3 or content.startswith('#'):
            kept.append(raw_line)
    text = '\n'.join(kept)

    # Steps 3-4: normalize horizontal and vertical whitespace.
    text = re.sub(r' {2,}', ' ', text)
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()
|
||||
|
||||
|
||||
def clean_content_with_llm(
    text: str,
    context: Optional[str] = None,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.1,
) -> str:
    """Clean text content using an LLM for intelligent OCR error correction.

    Uses a language model to correct subtle OCR errors that rule-based
    cleaning cannot handle, such as misrecognized characters in context.
    The LLM is instructed to preserve the intellectual content exactly
    while fixing obvious technical errors.

    The function includes safeguards:
    - Texts < 50 chars: Only basic cleaning (LLM skipped)
    - Texts > 3000 chars: Only basic cleaning (timeout risk)
    - If LLM changes text by >50%: Fallback to basic cleaning

    Args:
        text: Text content to clean. Should be between 50-3000 characters
            for LLM processing.
        context: Optional context about the document (title, subject) to
            help the LLM make better corrections. Example: "Heidegger's
            Being and Time, Chapter 2".
        model: LLM model name. If None, uses provider default
            (qwen2.5:7b for ollama, mistral-small-latest for mistral).
        provider: LLM provider to use. Options: "ollama" (local, free)
            or "mistral" (API, faster).
        temperature: LLM temperature for response generation. Lower values
            (0.1) produce more deterministic corrections. Defaults to 0.1.

    Returns:
        Cleaned text with OCR errors corrected. If LLM fails or produces
        suspicious output (too short/long), returns basic-cleaned text.

    Raises:
        No exceptions raised - all errors caught and handled with fallback.

    Example:
        >>> text = "Heidegger's concept of Dase1n is central..."  # '1' should be 'i'
        >>> clean_content_with_llm(text, context="Being and Time")
        "Heidegger's concept of Dasein is central..."

    Note:
        The LLM is explicitly instructed NOT to:
        - Modify meaning or intellectual content
        - Rephrase or summarize
        - Add any new content
        - Alter citations or technical vocabulary
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()

    # Skip the LLM entirely for very short texts (not worth the call).
    # NOTE(review): this path applies only page-marker removal, not
    # clean_ocr_artifacts(), unlike the long-text path below — confirm
    # whether the asymmetry is intentional.
    if len(text.strip()) < 50:
        return clean_page_markers(text)

    # Cap input size to avoid LLM timeouts.
    max_chars: int = 3000
    if len(text) > max_chars:
        # Long texts: rule-based cleaning only, no LLM call.
        return clean_page_markers(clean_ocr_artifacts(text))

    context_info: str = f"Contexte: {context}\n" if context else ""

    prompt: str = f"""Tu es un expert en correction de textes OCRisés.

TÂCHE: Nettoyer ce texte extrait par OCR.

{context_info}
ACTIONS À EFFECTUER:
1. Supprimer les marqueurs de page (<!-- Page X -->)
2. Corriger les erreurs OCR ÉVIDENTES (caractères mal reconnus)
3. Supprimer les artefacts (numéros de page isolés, en-têtes répétés)
4. Normaliser la ponctuation et les espaces

RÈGLES STRICTES:
- NE PAS modifier le sens ou le contenu intellectuel
- NE PAS reformuler ou résumer
- NE PAS ajouter de contenu
- Préserver les citations et le vocabulaire technique
- Garder la structure des paragraphes

TEXTE À NETTOYER:
{text}

RÉPONDS UNIQUEMENT avec le texte nettoyé, sans commentaires ni balises."""

    try:
        response: str = call_llm(
            prompt, model=model, provider=provider, temperature=temperature, timeout=120
        )

        # Sanity-check the LLM output.
        cleaned: str = response.strip()

        # If the length changed by more than 50% the LLM likely rewrote the
        # text; fall back to rule-based cleaning of the original.
        if len(cleaned) < len(text) * 0.5 or len(cleaned) > len(text) * 1.5:
            logger.warning("LLM a trop modifié le texte, utilisation du nettoyage basique")
            return clean_page_markers(clean_ocr_artifacts(text))

        return cleaned

    except Exception as e:
        # Any LLM/transport failure degrades gracefully to basic cleaning.
        logger.warning(f"Erreur nettoyage LLM: {e}, utilisation du nettoyage basique")
        return clean_page_markers(clean_ocr_artifacts(text))
|
||||
|
||||
|
||||
def clean_chunk(
    chunk_text: str,
    use_llm: bool = False,
    context: Optional[str] = None,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
) -> str:
    r"""Clean a text chunk, optionally refining the result with an LLM.

    Main entry point for chunk cleaning. Basic cleaning (page markers, OCR
    artifacts) is always applied; an LLM pass can be layered on top for
    higher-quality correction of subtle OCR errors.

    Pipeline:
      1. Remove page markers (always).
      2. Remove OCR artifacts (always).
      3. LLM correction (only if ``use_llm`` is True and the cleaned text
         has at least 50 characters).

    Args:
        chunk_text: Raw text content of the chunk.
        use_llm: Enable the LLM pass (slower, higher quality). Default False.
        context: Optional document context (title, chapter) forwarded to the
            LLM. Ignored when use_llm is False.
        model: LLM model name; provider default when None. Ignored when
            use_llm is False.
        provider: LLM provider ("ollama" or "mistral"). Ignored when
            use_llm is False.

    Returns:
        Cleaned chunk text, ready for indexing or further processing.

    Example:
        >>> clean_chunk("<!-- Page 5 -->\n42\n\nThe concept of being...")
        'The concept of being...'

    See Also:
        is_chunk_valid: Validate cleaned chunks before processing
        clean_page_markers: Basic page marker removal
        clean_ocr_artifacts: Basic artifact removal
    """
    # Baseline cleaning is unconditional: markers first, then artifacts.
    cleaned: str = clean_ocr_artifacts(clean_page_markers(chunk_text))

    # Optional LLM pass, only for texts long enough to be worth the call.
    if use_llm and len(cleaned) >= 50:
        return clean_content_with_llm(cleaned, context=context, model=model, provider=provider)

    return cleaned
|
||||
|
||||
|
||||
def is_chunk_valid(chunk_text: str, min_chars: int = 20, min_words: int = 5) -> bool:
    """Check whether a text chunk contains meaningful, indexable content.

    Page markers are stripped first, then three gates are applied:
      1. Character count >= ``min_chars``.
      2. Word count >= ``min_words``.
      3. The text does not start like pure metadata (URL, ISO date, ISBN,
         DOI, copyright notice).

    Args:
        chunk_text: Chunk text to validate.
        min_chars: Minimum character count (default 20).
        min_words: Minimum word count (default 5).

    Returns:
        True if the chunk passes every gate, False otherwise.

    Example:
        >>> is_chunk_valid("The concept of Dasein is central to Heidegger.")
        True
        >>> is_chunk_valid("42")  # Too short
        False
        >>> is_chunk_valid("ISBN 978-0-123456-78-9")  # Metadata
        False
        >>> is_chunk_valid("https://example.com/page")  # URL
        False
    """
    candidate: str = clean_page_markers(chunk_text).strip()

    # Gate 1: minimum length in characters.
    if len(candidate) < min_chars:
        return False

    # Gate 2: minimum length in words.
    if len(candidate.split()) < min_words:
        return False

    # Gate 3: reject pure metadata/boilerplate chunks.
    metadata_patterns: List[str] = [
        r'^https?://',
        r'^\d{4}-\d{2}-\d{2}$',
        r'^ISBN',
        r'^DOI',
        r'^©',
    ]
    return not any(re.match(pattern, candidate, re.IGNORECASE) for pattern in metadata_patterns)
|
||||
|
||||
294
generations/library_rag/utils/llm_metadata.py
Normal file
294
generations/library_rag/utils/llm_metadata.py
Normal file
@@ -0,0 +1,294 @@
|
||||
r"""LLM-based bibliographic metadata extraction from documents.
|
||||
|
||||
This module extracts bibliographic metadata (title, author, publisher, year, etc.)
|
||||
from document text using Large Language Models. It supports both local (Ollama)
|
||||
and cloud-based (Mistral API) LLM providers.
|
||||
|
||||
The extraction process:
|
||||
1. Takes the first N characters of the document markdown (typically first pages)
|
||||
2. Sends a structured prompt to the LLM requesting JSON-formatted metadata
|
||||
3. Parses the LLM response to extract the JSON data
|
||||
4. Applies default values and cleanup for missing/invalid fields
|
||||
|
||||
Supported metadata fields:
|
||||
- title: Document title (including subtitle if present)
|
||||
- author: Primary author name
|
||||
- collection: Series or collection name
|
||||
- publisher: Publisher name
|
||||
- year: Publication year
|
||||
- doi: Digital Object Identifier
|
||||
- isbn: ISBN number
|
||||
- language: ISO 639-1 language code (default: "fr")
|
||||
- confidence: Dict of confidence scores per field (0.0-1.0)
|
||||
|
||||
LLM Provider Differences:
|
||||
- **Ollama** (local): Free, slower, requires local installation.
|
||||
Uses models like "mistral", "llama2", "mixtral".
|
||||
- **Mistral API** (cloud): Fast, paid (~0.002€/call for small prompts).
|
||||
Uses models like "mistral-small-latest", "mistral-medium-latest".
|
||||
|
||||
Cost Implications:
|
||||
- Ollama: No API cost, only local compute resources
|
||||
- Mistral API: ~0.002€ per metadata extraction call (small prompt)
|
||||
|
||||
Example:
|
||||
>>> from utils.llm_metadata import extract_metadata
|
||||
>>>
|
||||
>>> markdown = '''
|
||||
... # La technique et le temps
|
||||
... ## Tome 1 : La faute d'Épiméthée
|
||||
...
|
||||
... Bernard Stiegler
|
||||
...
|
||||
... Éditions Galilée, 1994
|
||||
... '''
|
||||
>>>
|
||||
>>> metadata = extract_metadata(markdown, provider="ollama")
|
||||
>>> print(metadata)
|
||||
{
|
||||
'title': 'La technique et le temps. Tome 1 : La faute d\'Épiméthée',
|
||||
'author': 'Bernard Stiegler',
|
||||
'publisher': 'Éditions Galilée',
|
||||
'year': 1994,
|
||||
'language': 'fr',
|
||||
'confidence': {'title': 0.95, 'author': 0.98}
|
||||
}
|
||||
|
||||
See Also:
|
||||
- llm_toc: Table of contents extraction via LLM
|
||||
- llm_structurer: Core LLM call infrastructure
|
||||
- pdf_pipeline: Orchestration using this module (Step 4)
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from .llm_structurer import (
|
||||
_clean_json_string,
|
||||
_get_default_mistral_model,
|
||||
_get_default_model,
|
||||
call_llm,
|
||||
)
|
||||
from .types import LLMProvider
|
||||
|
||||
logger: logging.Logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _extract_json_from_response(text: str) -> Dict[str, Any]:
    """Extract a JSON object from an LLM response string.

    Two strategies are tried in order:
      1. JSON wrapped in <JSON></JSON> tags (the preferred output format).
      2. Fallback: the widest {...} span found in the response.

    In both cases the candidate string is run through _clean_json_string()
    to fix common LLM quirks before parsing.

    Args:
        text: Raw LLM response possibly containing JSON.

    Returns:
        Parsed JSON as a dict, or an empty dict if nothing parseable
        was found.

    Example:
        >>> _extract_json_from_response('<JSON>{"title": "Test"}</JSON>')
        {'title': 'Test'}
        >>> _extract_json_from_response('Here is the metadata: {"title": "Test"}')
        {'title': 'Test'}
    """
    # Strategy 1: tagged JSON block.
    tagged: Optional[re.Match[str]] = re.search(r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL)
    if tagged:
        candidate: str = _clean_json_string(tagged.group(1))
        try:
            parsed: Dict[str, Any] = json.loads(candidate)
            return parsed
        except json.JSONDecodeError:
            pass  # fall through to the brace-scan fallback

    # Strategy 2: widest brace-delimited span.
    first_brace: int = text.find("{")
    last_brace: int = text.rfind("}")
    if first_brace != -1 and last_brace > first_brace:
        candidate = _clean_json_string(text[first_brace:last_brace + 1])
        try:
            parsed = json.loads(candidate)
            return parsed
        except json.JSONDecodeError as e:
            logger.warning(f"JSON invalide: {e}")

    return {}
|
||||
|
||||
|
||||
def extract_metadata(
    markdown: str,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.1,
    max_chars: int = 6000,
) -> Dict[str, Any]:
    """Extract bibliographic metadata from a document using an LLM.

    Analyzes the beginning of a document (typically first few pages) to extract
    bibliographic metadata including title, author, publisher, year, and more.
    Uses a structured prompt that guides the LLM to distinguish between
    document title vs. collection name vs. publisher name.

    The LLM is instructed to return confidence scores for extracted fields,
    allowing downstream processing to handle uncertain extractions appropriately.

    Args:
        markdown: Document text in Markdown format. For best results, provide
            at least the first 2-3 pages containing title page and colophon.
        model: LLM model name to use. If None, uses the default model for the
            selected provider.
        provider: LLM provider to use. Options are:
            - "ollama": Local LLM (free, slower, requires Ollama installation)
            - "mistral": Mistral API (fast, paid, requires API key)
        temperature: Model temperature for generation. Lower values (0.0-0.3)
            produce more consistent, deterministic results. Default 0.1.
        max_chars: Maximum number of characters to send to the LLM. Longer
            documents are truncated. Default 6000 (~2 pages).

    Returns:
        Dictionary containing extracted metadata with the following keys:
        - title (str | None): Document title with subtitle if present
        - author (str | None): Primary author name
        - collection (str | None): Series or collection name
        - publisher (str | None): Publisher name
        - year (int | None): Publication year
        - doi (str | None): Digital Object Identifier
        - isbn (str | None): ISBN number
        - language (str): ISO 639-1 language code (default "fr")
        - confidence (dict): Confidence scores per field (0.0-1.0)
        - error (str): Error message if extraction failed (only on error)

    Raises:
        No exceptions are raised; errors are captured in the return dict.

    Note:
        - Cost for Mistral API: ~0.002€ per call (6000 chars input)
        - Ollama is free but requires local GPU/CPU resources
        - The prompt is in French as most processed documents are French texts
        - Low temperature (0.1) is used for consistent metadata extraction

    Example:
        >>> markdown = Path("output/stiegler/stiegler.md").read_text()[:6000]
        >>> metadata = extract_metadata(markdown, provider="ollama")
        >>> print(f"Title: {metadata['title']}")
        Title: La technique et le temps
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()

    # Use the first pages only — bibliographic metadata is usually up front.
    content: str = markdown[:max_chars]
    if len(markdown) > max_chars:
        content += "\n\n[... document tronqué ...]"

    prompt: str = f"""Tu es un expert en bibliographie et édition scientifique.

TÂCHE: Extraire les métadonnées bibliographiques de ce document.

ATTENTION - PIÈGES COURANTS:
- Le titre n'est PAS forcément le premier titre H1 (peut être le nom de la collection)
- Le sous-titre fait partie du titre
- L'auteur peut apparaître sous le titre, dans les métadonnées éditeur, ou ailleurs
- Distingue bien: titre de l'œuvre ≠ nom de la collection/série ≠ nom de l'éditeur

INDICES POUR TROUVER LE VRAI TITRE:
- Souvent en plus grand / plus visible
- Accompagné du nom de l'auteur juste après
- Répété sur la page de garde et la page de titre
- Peut contenir un sous-titre après ":"

IMPORTANT - FORMAT DES DONNÉES:
- N'ajoute JAMAIS d'annotations comme "(correct)", "(à confirmer)", "(possiblement)", etc.
- Retourne uniquement les noms propres et titres sans commentaires
- NE METS PAS de phrases comme "À confirmer avec...", "Vérifier si...", "Possiblement..."
- Le champ "confidence" sert à exprimer ton niveau de certitude
- Si tu n'es pas sûr du titre, mets le titre le plus probable ET un confidence faible
- EXEMPLE CORRECT: "title": "La pensée-signe" avec "confidence": {{"title": 0.6}}
- EXEMPLE INCORRECT: "title": "À confirmer avec le titre exact"

RÉPONDS UNIQUEMENT avec un JSON entre balises <JSON></JSON>:

<JSON>
{{
  "title": "Le vrai titre de l'œuvre (avec sous-titre si présent)",
  "author": "Prénom Nom de l'auteur principal",
  "collection": "Nom de la collection ou série (null si absent)",
  "publisher": "Nom de l'éditeur",
  "year": 2023,
  "doi": "10.xxxx/xxxxx (null si absent)",
  "isbn": "978-x-xxxx-xxxx-x (null si absent)",
  "language": "fr",
  "confidence": {{
    "title": 0.95,
    "author": 0.90
  }}
}}
</JSON>

DOCUMENT À ANALYSER:
{content}

Réponds UNIQUEMENT avec le JSON."""

    logger.info(f"Extraction métadonnées via {provider.upper()} ({model})")

    try:
        response: str = call_llm(prompt, model=model, provider=provider, temperature=temperature)
        metadata: Dict[str, Any] = _extract_json_from_response(response)

        # Fill in defaults for missing or empty fields.
        defaults: Dict[str, Optional[str]] = {
            "title": None,
            "author": None,
            "collection": None,
            "publisher": None,
            "year": None,
            "doi": None,
            "isbn": None,
            "language": "fr",
        }

        for key, default in defaults.items():
            if key not in metadata or metadata[key] == "":
                metadata[key] = default

        # Normalize the literal strings "null"/"None" (LLM quirk) to None.
        for key in metadata:
            if metadata[key] == "null" or metadata[key] == "None":
                metadata[key] = None

        logger.info(f"Métadonnées extraites: titre='{metadata.get('title')}', auteur='{metadata.get('author')}'")
        return metadata

    except Exception as e:
        # Never raise: return an all-None record carrying the error message.
        logger.error(f"Erreur extraction métadonnées: {e}")
        return {
            "title": None,
            "author": None,
            "collection": None,
            "publisher": None,
            "year": None,
            "doi": None,
            "isbn": None,
            "language": "fr",
            "error": str(e),
        }
|
||||
|
||||
583
generations/library_rag/utils/llm_structurer.py
Normal file
583
generations/library_rag/utils/llm_structurer.py
Normal file
@@ -0,0 +1,583 @@
|
||||
"""Structuration de documents via LLM (Ollama ou Mistral API)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from typing import Any, Dict, List, Optional, TypedDict, Union, cast
|
||||
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
import threading
|
||||
|
||||
# Import type definitions from central types module
|
||||
from utils.types import LLMCostStats
|
||||
|
||||
# Load environment variables from a .env file (e.g. MISTRAL_API_KEY, OLLAMA_BASE_URL).
load_dotenv()

# Module-level logger; a basic root configuration is installed only when the
# host application has not configured logging itself.
logger: logging.Logger = logging.getLogger(__name__)
if not logging.getLogger().hasHandlers():
    logging.basicConfig(
        level=logging.INFO,
        format="[%(asctime)s] %(levelname)s %(message)s"
    )
||||
|
||||
|
||||
class LLMStructureError(RuntimeError):
    """Raised when LLM-based document structuring fails.

    Covers missing API keys, timeouts, transport errors and
    unparsable / invalid JSON responses.
    """
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# TypedDict Definitions
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
class MistralPricingEntry(TypedDict):
    """Mistral API price per million tokens, split by direction."""

    # EUR per 1M prompt tokens
    input: float
    # EUR per 1M completion tokens
    output: float
||||
|
||||
|
||||
class LLMHierarchyPath(TypedDict, total=False):
    """Optional hierarchy coordinates of a chunk (part > chapter > section > subsection)."""

    part: Optional[str]         # top-level part title, if any
    chapter: Optional[str]      # chapter title, if any
    section: Optional[str]      # section title, if any
    subsection: Optional[str]   # subsection title, if any
||||
|
||||
|
||||
class LLMChunkOutput(TypedDict, total=False):
    """One text chunk as emitted by the LLM structuring step."""

    chunk_id: str                 # stable identifier, e.g. "chunk_00001"
    text: str                     # chunk body
    hierarchy: LLMHierarchyPath   # position of the chunk in the document outline
    type: str                     # section kind, e.g. "main_content"
    is_toc: bool                  # True when the chunk is table-of-contents material
||||
|
||||
|
||||
class LLMDocumentSection(TypedDict, total=False):
    """One outline entry of the structured document, with its page span."""

    path: LLMHierarchyPath  # hierarchical location of the section
    type: str               # section kind, e.g. "main_content"
    page_start: int         # first page of the section (inclusive)
    page_end: int           # last page of the section (inclusive)
||||
|
||||
|
||||
class LLMStructuredResult(TypedDict, total=False):
    """Top-level payload produced by structure_with_llm()."""

    document_structure: List[LLMDocumentSection]  # hierarchical outline of the document
    chunks: List[LLMChunkOutput]                  # flat ordered list of text chunks
|
||||
|
||||
|
||||
class OllamaResultContainer(TypedDict):
    """Mutable mailbox shared with the Ollama worker thread (internal use)."""

    response: Optional[str]     # LLM text on success, None otherwise
    error: Optional[Exception]  # exception captured in the worker, None otherwise
    done: bool                  # True once the worker finished either way
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# Configuration
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def _get_ollama_url() -> str:
|
||||
"""Retourne l'URL de base d'Ollama."""
|
||||
return os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
|
||||
|
||||
|
||||
def _get_default_model() -> str:
|
||||
"""Retourne le modèle LLM par défaut."""
|
||||
return os.getenv("STRUCTURE_LLM_MODEL", "qwen2.5:7b")
|
||||
|
||||
|
||||
def _get_mistral_api_key() -> Optional[str]:
|
||||
"""Retourne la clé API Mistral."""
|
||||
return os.getenv("MISTRAL_API_KEY")
|
||||
|
||||
|
||||
def _get_default_mistral_model() -> str:
|
||||
"""Retourne le modèle Mistral par défaut pour les tâches LLM."""
|
||||
return os.getenv("MISTRAL_LLM_MODEL", "mistral-small-latest")
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# Appel Mistral API (rapide, cloud) avec tracking des coûts
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Mistral API pricing in EUR per million tokens, keyed by model name.
MISTRAL_PRICING: Dict[str, MistralPricingEntry] = {
    "mistral-small-latest": {"input": 0.2, "output": 0.6},
    "mistral-medium-latest": {"input": 0.8, "output": 2.4},
    "mistral-large-latest": {"input": 2.0, "output": 6.0},
    # Fallback pricing for any model not listed above
    "default": {"input": 0.5, "output": 1.5},
}

# Global cost accumulator (thread-local so concurrent pipelines do not mix totals)
_cost_tracker: threading.local = threading.local()
|
||||
|
||||
|
||||
def reset_llm_cost() -> None:
    """Zero out the accumulated LLM cost counters for the current thread."""
    for attr, zero in (
        ("total_cost", 0.0),
        ("total_input_tokens", 0),
        ("total_output_tokens", 0),
        ("calls_count", 0),
    ):
        setattr(_cost_tracker, attr, zero)
|
||||
|
||||
|
||||
def get_llm_cost() -> LLMCostStats:
    """Return a snapshot of the per-thread accumulated LLM cost statistics.

    Counters that were never initialised (no call made yet) read as zero.
    """
    zeros: Dict[str, Any] = {
        "total_cost": 0.0,
        "total_input_tokens": 0,
        "total_output_tokens": 0,
        "calls_count": 0,
    }
    snapshot: Dict[str, Any] = {
        name: getattr(_cost_tracker, name, fallback) for name, fallback in zeros.items()
    }
    return cast(LLMCostStats, snapshot)
|
||||
|
||||
|
||||
def _calculate_mistral_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Cost in EUR of one Mistral API call, with per-model rates and a fallback."""
    rates: MistralPricingEntry = MISTRAL_PRICING.get(model, MISTRAL_PRICING["default"])
    input_cost: float = input_tokens * rates["input"]
    output_cost: float = output_tokens * rates["output"]
    # Rates are expressed per million tokens
    return (input_cost + output_cost) / 1_000_000
|
||||
|
||||
|
||||
def _call_mistral_api(
    prompt: str,
    model: str = "mistral-small-latest",
    temperature: float = 0.2,
    max_tokens: int = 4096,
    timeout: int = 120,
) -> str:
    """Call the Mistral chat-completions API and return the generated text.

    Available models (fastest to most capable):
    - mistral-small-latest : fast, cheap (~0.2€/M input tokens)
    - mistral-medium-latest : balanced (~0.8€/M input tokens)
    - mistral-large-latest : powerful (~2€/M input tokens)

    Side effect: accumulates token usage and cost into the thread-local
    ``_cost_tracker`` (lazily initialised on first use).

    Args:
        prompt: Prompt to send (as a single user message)
        model: Mistral model name
        temperature: Sampling temperature (0-1)
        max_tokens: Maximum number of completion tokens
        timeout: HTTP timeout in seconds

    Returns:
        The LLM's textual response (empty string if the API returned no content)

    Raises:
        LLMStructureError: On missing API key, timeout, HTTP error or any other failure
    """
    api_key: Optional[str] = _get_mistral_api_key()
    if not api_key:
        raise LLMStructureError("MISTRAL_API_KEY non définie dans .env")

    logger.info(f"Appel Mistral API - modèle: {model}")

    url: str = "https://api.mistral.ai/v1/chat/completions"
    headers: Dict[str, str] = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    payload: Dict[str, Any] = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    try:
        start: float = time.time()
        response: requests.Response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        elapsed: float = time.time() - start

        response.raise_for_status()
        data: Dict[str, Any] = response.json()

        # Defensive extraction: missing keys yield an empty string instead of KeyError
        content: str = data.get("choices", [{}])[0].get("message", {}).get("content", "")
        usage: Dict[str, Any] = data.get("usage", {})

        input_tokens: int = usage.get("prompt_tokens", 0)
        output_tokens: int = usage.get("completion_tokens", 0)

        # Compute and accumulate the cost of this call
        call_cost: float = _calculate_mistral_cost(model, input_tokens, output_tokens)

        # Lazily initialise the thread-local tracker on first use
        if not hasattr(_cost_tracker, "total_cost"):
            reset_llm_cost()

        _cost_tracker.total_cost += call_cost
        _cost_tracker.total_input_tokens += input_tokens
        _cost_tracker.total_output_tokens += output_tokens
        _cost_tracker.calls_count += 1

        logger.info(f"Mistral API terminé en {elapsed:.1f}s - {input_tokens}+{output_tokens} tokens = {call_cost:.6f}€")

        return content

    except requests.exceptions.Timeout:
        raise LLMStructureError(f"Timeout Mistral API ({timeout}s)")
    except requests.exceptions.HTTPError as e:
        raise LLMStructureError(f"Erreur HTTP Mistral: {e}")
    except Exception as e:
        # NOTE(review): broad catch converts any parsing/transport error into
        # LLMStructureError without `from e`; consider chaining to keep the traceback.
        raise LLMStructureError(f"Erreur Mistral API: {e}")
|
||||
|
||||
|
||||
def _prepare_prompt(
    markdown: str,
    hierarchy: Dict[str, Any],
    max_chars: int = 8000,
) -> str:
    """Build the structuring prompt sent to the LLM.

    Args:
        markdown: Markdown text of the document (embedded truncated)
        hierarchy: Initial hierarchical structure (from build_hierarchy)
        max_chars: Maximum number of Markdown characters to include

    Returns:
        The formatted prompt string
    """
    # Truncate the Markdown if needed, flagging the truncation inside the prompt
    truncated: str = markdown[:max_chars]
    if len(markdown) > max_chars:
        truncated += f"\n\n... [tronqué à {max_chars} caractères]"

    # Serialise the initial hierarchy as pretty-printed JSON
    outline_json: str = json.dumps(hierarchy, ensure_ascii=False, indent=2)

    prompt: str = f"""Tu es un expert en édition scientifique chargé d'analyser la structure logique d'un document.

IMPORTANT: Réponds UNIQUEMENT avec un objet JSON valide. Pas de texte avant ou après.

À partir du Markdown OCRisé et d'un premier découpage hiérarchique, tu dois :
1. Identifier les parties liminaires (préface, introduction...), le corps du document (parties, chapitres, sections) et les parties finales (conclusion, annexes, bibliographie...).
2. Reconstruire l'organisation réelle du texte.
3. Produire un JSON avec :
- "document_structure": vue hiérarchique du document
- "chunks": liste des chunks avec chunk_id, text, hierarchy, type

FORMAT DE RÉPONSE (entre balises <JSON></JSON>):
<JSON>
{{
"document_structure": [
{{
"path": {{"part": "Titre"}},
"type": "main_content",
"page_start": 1,
"page_end": 10
}}
],
"chunks": [
{{
"chunk_id": "chunk_00001",
"text": "Contenu...",
"hierarchy": {{
"part": "Titre partie",
"chapter": "Titre chapitre",
"section": null,
"subsection": null
}},
"type": "main_content",
"is_toc": false
}}
]
}}
</JSON>

### Hiérarchie initiale
{outline_json}

### Markdown OCR
{truncated}

Réponds UNIQUEMENT avec le JSON entre <JSON> et </JSON>."""

    return prompt.strip()
|
||||
|
||||
|
||||
def _call_ollama(
    prompt: str,
    model: str,
    base_url: Optional[str] = None,
    temperature: float = 0.2,
    timeout: int = 300,
) -> str:
    """Call a local Ollama model and return its textual response.

    Tries the ``ollama`` Python SDK first (wrapped in a worker thread to
    enforce the timeout, since the SDK takes no timeout argument), then
    falls back to the raw HTTP API with a small retry/backoff loop.

    Args:
        prompt: Prompt to send
        model: Ollama model name
        base_url: Ollama base URL (environment default when None)
        temperature: Sampling temperature
        timeout: Timeout in seconds (applies to both SDK and HTTP paths)

    Returns:
        The LLM's textual response

    Raises:
        LLMStructureError: On timeout, transport failure or unexpected payload
    """
    # First attempt: the ollama SDK, if installed
    try:
        import ollama

        logger.info(f"Appel Ollama SDK - modèle: {model}, timeout: {timeout}s")

        # The SDK has no timeout support, so the call runs in a daemon thread
        # and we stop waiting after `timeout` seconds.
        result_container: OllamaResultContainer = {"response": None, "error": None, "done": False}

        def _run_ollama_call() -> None:
            # Worker body: write either the response or the exception into the
            # shared container, and mark completion in both cases.
            try:
                resp: Any
                if hasattr(ollama, "generate"):
                    resp = ollama.generate(
                        model=model,
                        prompt=prompt,
                        stream=False,
                        options={"temperature": temperature}
                    )
                    # Normalise the SDK's return shape (dict, object, or other)
                    if isinstance(resp, dict):
                        result_container["response"] = resp.get("response", json.dumps(resp))
                    elif hasattr(resp, "response"):
                        result_container["response"] = resp.response
                    else:
                        result_container["response"] = str(resp)
                else:
                    # Fallback: older SDKs only expose chat()
                    resp = ollama.chat(
                        model=model,
                        messages=[{"role": "user", "content": prompt}],
                        options={"temperature": temperature}
                    )
                    if isinstance(resp, dict):
                        result_container["response"] = resp.get("message", {}).get("content", str(resp))
                    else:
                        result_container["response"] = str(resp)
                result_container["done"] = True
            except Exception as e:
                result_container["error"] = e
                result_container["done"] = True

        thread: threading.Thread = threading.Thread(target=_run_ollama_call, daemon=True)
        thread.start()
        thread.join(timeout=timeout)

        if not result_container["done"]:
            # NOTE(review): the daemon worker keeps running after this timeout fires.
            raise LLMStructureError(f"Timeout Ollama SDK après {timeout}s (modèle: {model})")

        if result_container["error"]:
            raise result_container["error"]

        if result_container["response"]:
            return result_container["response"]

        raise LLMStructureError("Aucune réponse du SDK Ollama")

    except ImportError:
        logger.info("SDK ollama non disponible, utilisation de l'API HTTP")
    except Exception as e:
        # Any SDK failure (including the errors re-raised just above)
        # falls through to the HTTP path below.
        logger.warning(f"Erreur SDK ollama: {e}, fallback HTTP")

    # Fallback: raw HTTP API
    base: str = base_url or _get_ollama_url()
    url: str = f"{base.rstrip('/')}/api/generate"

    payload: Dict[str, Any] = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": temperature},
    }

    # Retry on transport errors with exponential backoff (1s, 2s)
    max_retries: int = 2
    backoff: float = 1.0

    for attempt in range(max_retries + 1):
        try:
            logger.info(f"Appel HTTP Ollama (tentative {attempt + 1})")
            response: requests.Response = requests.post(url, json=payload, timeout=timeout)

            if response.status_code != 200:
                raise LLMStructureError(
                    f"Erreur Ollama ({response.status_code}): {response.text}"
                )

            data: Dict[str, Any] = response.json()
            if "response" not in data:
                raise LLMStructureError(f"Réponse Ollama inattendue: {data}")

            return cast(str, data["response"])

        except requests.RequestException as e:
            if attempt < max_retries:
                time.sleep(backoff)
                backoff *= 2
                continue
            raise LLMStructureError(f"Impossible de contacter Ollama: {e}") from e

    raise LLMStructureError("Échec après plusieurs tentatives")
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# Fonction générique d'appel LLM
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def call_llm(
    prompt: str,
    model: Optional[str] = None,
    provider: str = "ollama",  # "ollama" or "mistral"
    temperature: float = 0.2,
    timeout: int = 300,
) -> str:
    """Dispatch a prompt to the configured LLM backend.

    Args:
        prompt: Prompt to send.
        model: Model name; resolved from the environment when None.
        provider: "ollama" (local, slow, free) or "mistral" (cloud API, fast, paid).
        temperature: Sampling temperature.
        timeout: Timeout in seconds.

    Returns:
        The LLM's textual response.
    """
    use_mistral: bool = provider == "mistral"
    # Pick the backend callable and its matching default-model resolver.
    backend = _call_mistral_api if use_mistral else _call_ollama
    default_model = _get_default_mistral_model if use_mistral else _get_default_model
    return backend(
        prompt,
        model=model or default_model(),
        temperature=temperature,
        timeout=timeout,
    )
|
||||
|
||||
|
||||
def _clean_json_string(json_str: str) -> str:
|
||||
"""Nettoie une chaîne JSON des caractères de contrôle invalides.
|
||||
|
||||
Stratégie robuste : Remplace TOUS les caractères de contrôle (x00-x1f)
|
||||
par des espaces, puis réduit les espaces multiples. Cela évite les erreurs
|
||||
"Invalid control character" de json.loads().
|
||||
"""
|
||||
# Remplacer tous les caractères de contrôle par des espaces
|
||||
cleaned: str = re.sub(r'[\x00-\x1f]', ' ', json_str)
|
||||
# Réduire les espaces multiples
|
||||
cleaned = re.sub(r'\s+', ' ', cleaned)
|
||||
return cleaned
|
||||
|
||||
|
||||
def _extract_json(text: str) -> LLMStructuredResult:
    """Parse the LLM response into a structured result.

    Looks first for JSON wrapped in <JSON></JSON> tags; when that span is
    absent or unparsable, falls back to the widest brace-delimited span.
    The parsed object must contain a "chunks" key.

    Args:
        text: Raw textual LLM response.

    Returns:
        The parsed structure.

    Raises:
        LLMStructureError: When no parsable JSON is found, or the parsed
            object lacks the mandatory "chunks" key.
    """

    def _validated(parsed: Dict[str, Any]) -> LLMStructuredResult:
        # Reject any payload missing the mandatory "chunks" list.
        if "chunks" not in parsed:
            raise LLMStructureError(
                f"JSON sans clé 'chunks'. Clés: {list(parsed.keys())}"
            )
        return cast(LLMStructuredResult, parsed)

    # Preferred path: content between <JSON> and </JSON> tags
    tag_open: int = text.find("<JSON>")
    tag_close: int = text.find("</JSON>")

    if tag_open != -1 and tag_close != -1 and tag_close > tag_open:
        tagged: str = _clean_json_string(text[tag_open + 6:tag_close].strip())
        try:
            return _validated(json.loads(tagged))
        except json.JSONDecodeError:
            pass  # fall through to the brace-based search

    # Fallback: widest span between the first '{' and the last '}'
    brace_open: int = text.find("{")
    brace_close: int = text.rfind("}")

    if brace_open == -1 or brace_close == -1 or brace_close <= brace_open:
        raise LLMStructureError(
            f"Pas de JSON trouvé dans la réponse.\nDébut: {text[:500]}"
        )

    candidate: str = _clean_json_string(text[brace_open:brace_close + 1])

    try:
        return _validated(json.loads(candidate))
    except json.JSONDecodeError as e:
        raise LLMStructureError(f"JSON invalide: {e}\nContenu: {candidate[:500]}") from e
|
||||
|
||||
|
||||
def structure_with_llm(
    markdown: str,
    hierarchy: Dict[str, Any],
    model: Optional[str] = None,
    base_url: Optional[str] = None,
    temperature: float = 0.2,
    max_chars: int = 8000,
    timeout: int = 300,
) -> LLMStructuredResult:
    """Refine a document's structure with a local Ollama LLM.

    Args:
        markdown: Markdown text of the document.
        hierarchy: Initial hierarchy (from build_hierarchy).
        model: Ollama model to use (environment default when None).
        base_url: Ollama base URL.
        temperature: Sampling temperature.
        max_chars: Maximum Markdown characters embedded in the prompt.
        timeout: Timeout in seconds.

    Returns:
        Improved structure with "document_structure" and "chunks".

    Raises:
        LLMStructureError: On LLM or JSON-parsing failure.
    """
    chosen_model: str = model or _get_default_model()
    logger.info(f"Structuration LLM - modèle: {chosen_model}")

    # Build the prompt, query the local LLM, then parse its JSON answer.
    raw_answer: str = _call_ollama(
        _prepare_prompt(markdown, hierarchy, max_chars),
        model=chosen_model,
        base_url=base_url,
        temperature=temperature,
        timeout=timeout,
    )
    return _extract_json(raw_answer)
|
||||
|
||||
420
generations/library_rag/utils/llm_toc.py
Normal file
420
generations/library_rag/utils/llm_toc.py
Normal file
@@ -0,0 +1,420 @@
|
||||
"""LLM-based Table of Contents (TOC) extraction module.
|
||||
|
||||
This module provides functionality to extract hierarchical table of contents
|
||||
from markdown documents using Large Language Models. It intelligently parses
|
||||
document structure and creates both hierarchical and flat representations
|
||||
of the TOC.
|
||||
|
||||
Key Features:
|
||||
- Hierarchical TOC extraction with chapters, sections, and subsections
|
||||
- Flat TOC generation with full paths for navigation
|
||||
- Content-to-TOC matching for associating sections with TOC entries
|
||||
- Support for multiple LLM providers (Ollama local, Mistral API)
|
||||
|
||||
TOC Structure Levels:
|
||||
- Level 1: Introduction, main chapters, Conclusion, Bibliography
|
||||
- Level 2: Sections listed under a chapter (same visual level)
|
||||
- Level 3: Only if explicit indentation or subsection visible
|
||||
|
||||
Typical Usage:
|
||||
>>> from utils.llm_toc import extract_toc
|
||||
>>> result = extract_toc(
|
||||
... markdown=document_text,
|
||||
... document_title="The Republic",
|
||||
... provider="ollama"
|
||||
... )
|
||||
>>> print(result["toc"]) # Hierarchical structure
|
||||
[
|
||||
{
|
||||
"title": "Introduction",
|
||||
"level": 1,
|
||||
"children": []
|
||||
},
|
||||
{
|
||||
"title": "Book I: Justice",
|
||||
"level": 1,
|
||||
"chapter_number": 1,
|
||||
"children": [
|
||||
{"title": "The Nature of Justice", "level": 2, "children": []}
|
||||
]
|
||||
}
|
||||
]
|
||||
>>> print(result["flat_toc"]) # Flat list with paths
|
||||
[
|
||||
{"title": "Introduction", "level": 1, "path": "Introduction"},
|
||||
{"title": "Book I: Justice", "level": 1, "path": "Book I: Justice"},
|
||||
{
|
||||
"title": "The Nature of Justice",
|
||||
"level": 2,
|
||||
"path": "Book I: Justice > The Nature of Justice"
|
||||
}
|
||||
]
|
||||
|
||||
LLM Provider Options:
|
||||
- "ollama": Local processing, free but slower
|
||||
- "mistral": Cloud API, faster but incurs costs
|
||||
|
||||
Note:
|
||||
For documents without a clear TOC (short articles, book reviews),
|
||||
the module returns an empty TOC list rather than inventing structure.
|
||||
|
||||
See Also:
|
||||
- llm_metadata: Document metadata extraction
|
||||
- llm_classifier: Section classification
|
||||
- toc_extractor: Non-LLM TOC extraction alternatives
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import cast, Any, Dict, List, Optional
|
||||
|
||||
from .llm_structurer import (
|
||||
_clean_json_string,
|
||||
_get_default_mistral_model,
|
||||
_get_default_model,
|
||||
call_llm,
|
||||
)
|
||||
from .types import FlatTOCEntry, LLMProvider, TOCEntry, TOCResult
|
||||
|
||||
logger: logging.Logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _extract_json_from_response(text: str) -> Dict[str, Any]:
    """Pull a JSON object out of an LLM answer.

    Tries, in order:
    1. Content between <JSON> and </JSON> tags.
    2. The widest substring between the first '{' and the last '}'.

    Args:
        text: Raw LLM response that may contain JSON.

    Returns:
        The parsed JSON dict, or {"toc": []} when nothing valid was found.
    """
    # Strategy 1: explicit <JSON>...</JSON> tags
    open_idx: int = text.find("<JSON>")
    if open_idx != -1:
        close_idx: int = text.find("</JSON>", open_idx + 6)
        if close_idx != -1:
            tagged: str = _clean_json_string(text[open_idx + 6:close_idx].strip())
            try:
                return json.loads(tagged)
            except json.JSONDecodeError:
                pass  # fall back to the brace scan below

    # Strategy 2: widest brace-delimited span
    first: int = text.find("{")
    last: int = text.rfind("}")
    if first != -1 and last > first:
        candidate: str = _clean_json_string(text[first:last + 1])
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as e:
            logger.warning(f"JSON invalide: {e}")

    # Nothing parsable: empty TOC sentinel
    return {"toc": []}
|
||||
|
||||
|
||||
def extract_toc(
    markdown: str,
    document_title: Optional[str] = None,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.1,
) -> Dict[str, Any]:
    """Extract a structured table of contents from a document using an LLM.

    Analyzes markdown content to identify the document's hierarchical
    structure and returns both a nested TOC (entries with "children")
    and a flat TOC (entries with full navigation "path" strings).

    Args:
        markdown: Complete markdown text of the document (truncated to
            12,000 characters before being embedded in the prompt).
        document_title: Optional document title, passed as context to
            help the LLM understand the structure.
        model: LLM model name; when None, the provider's default model
            from the environment is used.
        provider: "ollama" (local) or "mistral" (cloud API).
        temperature: Sampling temperature; low values (0.1) give more
            consistent results.

    Returns:
        A dict with:
        - "toc": hierarchical list of entries (title, level,
          optional chapter_number, children)
        - "flat_toc": flat list of all entries with "path" strings
        - "error": error message string (only present when extraction failed)

    Note:
        No exception escapes this function; any failure is logged and
        reported via the "error" key with empty TOC lists. Short
        documents without a clear TOC yield empty lists — the prompt
        instructs the LLM never to invent structure.
    """
    # Resolve the provider-appropriate default model
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()

    # Truncate overly long documents, flagging the truncation for the LLM
    max_chars: int = 12000
    content: str = markdown[:max_chars]
    if len(markdown) > max_chars:
        content += "\n\n[... suite du document ...]"

    title_context: str = f"Titre du document: {document_title}\n" if document_title else ""

    prompt: str = f"""Tu es un expert en structuration de documents académiques.

TÂCHE: Extraire la table des matières FIDÈLE au document fourni.

{title_context}
⚠️ RÈGLES CRITIQUES:

1. **ANALYSER LE DOCUMENT RÉEL** - Ne JAMAIS copier les exemples ci-dessous!
2. **DOCUMENTS SANS TOC** - Si le document est un article court, une revue de livre, ou n'a pas de table des matières explicite, retourner {{"toc": []}}
3. **RESPECTER LA STRUCTURE PLATE** - Ne pas inventer de hiérarchie entre des lignes au même niveau
4. **IGNORER** - Métadonnées éditoriales (DOI, ISBN, éditeur, copyright, numéros de page)

NIVEAUX DE STRUCTURE:
- level 1: Introduction, Chapitres principaux, Conclusion, Bibliographie
- level 2: Sections listées sous un chapitre (même niveau visuel)
- level 3: UNIQUEMENT si indentation ou sous-titre explicite visible

FORMAT DE RÉPONSE (JSON entre balises <JSON></JSON>):

Pour un livre avec TOC:
<JSON>
{{
"toc": [
{{
"title": "Titre Chapitre 1",
"level": 1,
"chapter_number": 1,
"children": [
{{"title": "Section 1.1", "level": 2, "children": []}},
{{"title": "Section 1.2", "level": 2, "children": []}}
]
}}
]
}}
</JSON>

Pour un article SANS TOC (revue de livre, article court, etc.):
<JSON>
{{
"toc": []
}}
</JSON>

⚠️ NE PAS COPIER CES EXEMPLES ! Analyser uniquement le DOCUMENT RÉEL ci-dessous.

DOCUMENT À ANALYSER:
{content}

Réponds UNIQUEMENT avec le JSON correspondant à CE document (pas aux exemples)."""

    logger.info(f"Extraction TOC via {provider.upper()} ({model})")

    try:
        response: str = call_llm(prompt, model=model, provider=provider, temperature=temperature, timeout=360)
        result: Dict[str, Any] = _extract_json_from_response(response)

        toc: List[Dict[str, Any]] = result.get("toc", [])

        # Build the flat navigation view of the hierarchical TOC
        flat_toc: List[Dict[str, Any]] = _flatten_toc(toc)

        logger.info(f"TOC extraite: {len(toc)} entrées niveau 1, {len(flat_toc)} entrées totales")

        return {
            "toc": toc,
            "flat_toc": flat_toc,
        }

    except Exception as e:
        # Never propagate: callers get an empty TOC plus the error message
        logger.error(f"Erreur extraction TOC: {e}")
        return {
            "toc": [],
            "flat_toc": [],
            "error": str(e),
        }
|
||||
|
||||
|
||||
def _flatten_toc(
|
||||
toc: List[Dict[str, Any]],
|
||||
parent_path: str = "",
|
||||
result: Optional[List[Dict[str, Any]]] = None
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Flatten a hierarchical TOC into a list with navigation paths.
|
||||
|
||||
Recursively traverses a nested TOC structure and produces a flat
|
||||
list where each entry includes its full path from the root.
|
||||
|
||||
Args:
|
||||
toc: Hierarchical TOC list with nested children.
|
||||
parent_path: Path accumulated from parent entries. Used
|
||||
internally during recursion.
|
||||
result: Accumulator list for results. Used internally
|
||||
during recursion.
|
||||
|
||||
Returns:
|
||||
A flat list of TOC entries, each containing:
|
||||
- title: The section title
|
||||
- level: Hierarchy level (1, 2, or 3)
|
||||
- path: Full navigation path (e.g., "Chapter > Section")
|
||||
- chapter_number: Optional chapter number if present
|
||||
|
||||
Example:
|
||||
>>> hierarchical_toc = [
|
||||
... {
|
||||
... "title": "Chapter 1",
|
||||
... "level": 1,
|
||||
... "children": [
|
||||
... {"title": "Section 1.1", "level": 2, "children": []}
|
||||
... ]
|
||||
... }
|
||||
... ]
|
||||
>>> flat = _flatten_toc(hierarchical_toc)
|
||||
>>> flat[0]["path"]
|
||||
'Chapter 1'
|
||||
>>> flat[1]["path"]
|
||||
'Chapter 1 > Section 1.1'
|
||||
"""
|
||||
if result is None:
|
||||
result = []
|
||||
|
||||
for item in toc:
|
||||
title: str = item.get("title", "")
|
||||
level: int = item.get("level", 1)
|
||||
|
||||
# Construire le chemin
|
||||
path: str
|
||||
if parent_path:
|
||||
path = f"{parent_path} > {title}"
|
||||
else:
|
||||
path = title
|
||||
|
||||
result.append({
|
||||
"title": title,
|
||||
"level": level,
|
||||
"path": path,
|
||||
"chapter_number": item.get("chapter_number"),
|
||||
})
|
||||
|
||||
# Récursion sur les enfants
|
||||
children: List[Dict[str, Any]] = item.get("children", [])
|
||||
if children:
|
||||
_flatten_toc(children, path, result)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def match_content_to_toc(
    content_sections: List[Dict[str, Any]],
    flat_toc: List[Dict[str, Any]],
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
) -> List[Dict[str, Any]]:
    """Associate extracted content sections with TOC entries via an LLM.

    Sends the section titles and TOC titles to the LLM and asks for an
    index mapping; each content section then receives a ``toc_match`` key
    holding either the matched TOC entry dict or ``None``.

    Args:
        content_sections: Sections extracted from the document; each should
            carry a "title" key.
        flat_toc: Flat TOC list (see extract_toc()["flat_toc"]); each entry
            should carry a "title" key.
        model: LLM model name; defaults to the provider's default model.
        provider: "ollama" (local) or "mistral" (cloud API).

    Returns:
        The same ``content_sections`` list, mutated in place so that every
        section has a ``toc_match`` key (entry dict or ``None``).

    Note:
        Only the first 30 section titles are sent to the LLM to bound cost;
        on any LLM/parsing error the sections are returned as-is (possibly
        partially annotated).
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()

    # Titles fed to the prompt; sections are capped at 30 to limit cost.
    toc_titles: List[str] = [entry["title"] for entry in flat_toc]
    section_titles: List[str] = [sec.get("title", "") for sec in content_sections[:30]]

    prompt: str = f"""Tu dois associer les sections de contenu aux entrées de la table des matières.

TABLE DES MATIÈRES:
{json.dumps(toc_titles, ensure_ascii=False, indent=2)}

SECTIONS DE CONTENU:
{json.dumps(section_titles, ensure_ascii=False, indent=2)}

Pour chaque section de contenu, indique l'index (0-based) de l'entrée TOC correspondante.
Si pas de correspondance, indique -1.

RÉPONDS avec un JSON:
<JSON>
{{
  "matches": [0, 1, 2, -1, 3, ...]
}}
</JSON>
"""

    try:
        raw_answer: str = call_llm(prompt, model=model, provider=provider, temperature=0.1)
        parsed: Dict[str, Any] = _extract_json_from_response(raw_answer)
        match_indices: List[int] = parsed.get("matches", [])

        # Annotate every section; out-of-range or missing indices map to None.
        for position, section in enumerate(content_sections):
            candidate: Optional[int] = (
                match_indices[position] if position < len(match_indices) else None
            )
            if candidate is not None and 0 <= candidate < len(flat_toc):
                section["toc_match"] = flat_toc[candidate]
            else:
                section["toc_match"] = None

    except Exception as exc:
        logger.warning(f"Erreur correspondance TOC: {exc}")

    return content_sections
|
||||
513
generations/library_rag/utils/llm_validator.py
Normal file
513
generations/library_rag/utils/llm_validator.py
Normal file
@@ -0,0 +1,513 @@
|
||||
"""Document validation and enrichment using Large Language Models.
|
||||
|
||||
This module provides comprehensive validation, correction, and enrichment
|
||||
functionality for parsed documents. It uses LLMs to verify document coherence,
|
||||
detect inconsistencies, suggest corrections, and extract key concepts from
|
||||
text chunks.
|
||||
|
||||
Overview:
|
||||
The module performs three main functions:
|
||||
|
||||
1. **Document Validation** (validate_document):
|
||||
Verifies the coherence of parsed documents by checking metadata,
|
||||
table of contents, and chunk content quality. Returns detailed
|
||||
validation results with issues, corrections, and confidence scores.
|
||||
|
||||
2. **Content Enrichment** (enrich_chunks_with_concepts, generate_section_summary):
|
||||
Enhances document content by extracting key philosophical concepts
|
||||
from chunks and generating concise summaries for sections.
|
||||
|
||||
3. **Correction Application** (apply_corrections, clean_validation_annotations):
|
||||
Applies suggested corrections from validation results and cleans
|
||||
LLM-generated annotation artifacts from text.
|
||||
|
||||
Validation Criteria:
|
||||
The validator checks several aspects of document quality:
|
||||
|
||||
- **Metadata Quality**: Verifies title and author are correctly identified
|
||||
(not collection names, not "Unknown" when visible in text)
|
||||
- **TOC Coherence**: Checks for duplicates, proper ordering, completeness
|
||||
- **Chunk Content**: Ensures chunks contain substantive content, not just
|
||||
metadata fragments or headers
|
||||
|
||||
Validation Result Structure:
|
||||
The ValidationResult TypedDict contains:
|
||||
|
||||
- valid (bool): Overall validation pass/fail
|
||||
- errors (List[str]): Critical issues requiring attention
|
||||
- warnings (List[str]): Non-critical suggestions
|
||||
- corrections (Dict[str, str]): Suggested field corrections
|
||||
- concepts (List[str]): Extracted key concepts
|
||||
- score (float): Confidence score (0.0 to 1.0)
|
||||
|
||||
LLM Provider Support:
|
||||
- ollama: Local LLM (free, slower, privacy-preserving)
|
||||
    - mistral: Mistral API (faster, requires API key, ~0.001 EUR per validation)
|
||||
|
||||
Example:
|
||||
>>> from utils.llm_validator import validate_document, apply_corrections
|
||||
>>>
|
||||
>>> # Validate a parsed document
|
||||
>>> parsed_doc = {
|
||||
... "metadata": {"title": "Phenomenologie", "author": "Hegel"},
|
||||
... "toc": [{"title": "Preface", "level": 1, "page": 1}],
|
||||
... "chunks": [{"text": "La conscience...", "section_path": "Preface"}]
|
||||
... }
|
||||
>>> result = validate_document(parsed_doc, provider="ollama")
|
||||
>>> print(f"Valid: {result['valid']}, Score: {result['score']}")
|
||||
Valid: True, Score: 0.85
|
||||
|
||||
See Also:
|
||||
utils.llm_cleaner: Text cleaning and validation
|
||||
utils.llm_chunker: Semantic chunking of sections
|
||||
utils.pdf_pipeline: Main pipeline orchestration
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional, Match
|
||||
|
||||
from .llm_structurer import call_llm, _get_default_model, _get_default_mistral_model, _clean_json_string
|
||||
from .types import LLMProvider, ValidationResult, ParsedDocument, ChunkData
|
||||
|
||||
logger: logging.Logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _extract_json_from_response(text: str) -> Dict[str, Any]:
    """Parse the JSON payload embedded in an LLM response.

    Two extraction strategies are tried in order:

    1. Content wrapped in ``<JSON>...</JSON>`` tags.
    2. The substring spanning the first ``{`` to the last ``}``.

    Args:
        text: Raw LLM response, possibly containing markdown, explanatory
            prose, or XML-style tags around the JSON.

    Returns:
        The parsed dictionary, or an empty dict when no valid JSON is found.

    Example:
        >>> _extract_json_from_response('<JSON>{"valid": true, "score": 0.9}</JSON>')
        {'valid': True, 'score': 0.9}
    """
    # Strategy 1: explicit <JSON> tags.
    tagged: Optional[Match[str]] = re.search(r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL)
    if tagged is not None:
        candidate: str = _clean_json_string(tagged.group(1))
        try:
            parsed: Dict[str, Any] = json.loads(candidate)
            return parsed
        except json.JSONDecodeError:
            pass  # fall through to the brace-scan strategy

    # Strategy 2: widest brace-delimited span.
    opening: int = text.find("{")
    closing: int = text.rfind("}")
    if opening != -1 and closing > opening:
        candidate = _clean_json_string(text[opening:closing + 1])
        try:
            parsed = json.loads(candidate)
            return parsed
        except json.JSONDecodeError as exc:
            logger.warning(f"JSON invalide: {exc}")

    return {}
|
||||
|
||||
|
||||
def validate_document(
    parsed_doc: Dict[str, Any],
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.1,
) -> ValidationResult:
    """Check a parsed document's coherence with an LLM and suggest fixes.

    A compact summary of the document (metadata plus previews of the TOC
    and first chunks) is sent to the LLM, which reports issues, suggested
    corrections, and a confidence score.

    Args:
        parsed_doc: Parsed document with "metadata", "toc" and "chunks" keys.
        model: LLM model name; defaults to the provider's default model.
        provider: "ollama" (local) or "mistral" (cloud API).
        temperature: Sampling temperature (0.0-1.0); lower is more
            deterministic.

    Returns:
        A ValidationResult TypedDict with keys: valid, errors, warnings,
        corrections, concepts (always empty here), and score.

    Note:
        This function never raises: on LLM failure it returns a result with
        ``valid=True`` and ``score=0.0`` so the pipeline can proceed —
        callers should inspect ``score`` to detect failures.
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()

    # Pull out the pieces the validation cares about.
    metadata: Dict[str, Any] = parsed_doc.get("metadata", {})
    toc: List[Dict[str, Any]] = parsed_doc.get("toc", [])
    chunks: List[Dict[str, Any]] = parsed_doc.get("chunks", [])

    # Compact summary: previews only, to keep the prompt small.
    doc_summary: Dict[str, Any] = {
        "title": metadata.get("title"),
        "author": metadata.get("author"),
        "toc_count": len(toc),
        "toc_preview": [t.get("title") for t in toc[:10]] if toc else [],
        "chunks_count": len(chunks),
        "first_chunks_preview": [
            c.get("text", "")[:100] for c in chunks[:5]
        ] if chunks else [],
    }

    prompt: str = f"""Tu es un expert en validation de documents structurés.

TÂCHE: Vérifier la cohérence de ce document parsé et détecter les erreurs.

DOCUMENT PARSÉ:
{json.dumps(doc_summary, ensure_ascii=False, indent=2)}

VÉRIFICATIONS À EFFECTUER:
1. Le titre correspond-il au contenu? (pas le nom d'une collection)
2. L'auteur est-il correctement identifié? (pas "Inconnu" si visible)
3. La TOC est-elle cohérente? (pas de doublons, bon ordre)
4. Les chunks contiennent-ils du vrai contenu? (pas que des métadonnées)

RÉPONDS avec un JSON entre <JSON></JSON>:

<JSON>
{{
  "is_valid": true,
  "confidence": 0.85,
  "issues": [
    {{
      "field": "title",
      "severity": "warning",
      "message": "Le titre semble être le nom de la collection",
      "suggestion": "Vrai titre suggéré"
    }}
  ],
  "corrections": {{
    "title": "Titre corrigé si nécessaire",
    "author": "Auteur corrigé si nécessaire"
  }},
  "quality_score": {{
    "metadata": 0.8,
    "toc": 0.9,
    "chunks": 0.7
  }}
}}
</JSON>
"""

    logger.info(f"Validation du document parsé via {provider.upper()}")

    try:
        raw: str = call_llm(
            prompt, model=model, provider=provider, temperature=temperature, timeout=180
        )
        payload: Dict[str, Any] = _extract_json_from_response(raw)

        # Defaults keep the pipeline moving when fields are missing.
        is_valid: bool = payload.get("is_valid", True)
        issues: List[str] = payload.get("issues", [])
        corrections: Dict[str, str] = payload.get("corrections", {})
        confidence: float = payload.get("confidence", 0.5)

        logger.info(f"Validation terminée: valid={is_valid}, issues={len(issues)}")

        outcome: ValidationResult = {
            "valid": is_valid,
            # Issues may be dicts; stringify them for the errors list.
            "errors": [str(issue) for issue in issues] if issues else [],
            "warnings": [],
            "corrections": corrections,
            "concepts": [],
            "score": confidence,
        }
        return outcome

    except Exception as exc:
        logger.error(f"Erreur validation document: {exc}")
        # Deliberately valid=True with score=0.0: validation failure must not
        # abort the pipeline; callers detect it through the score.
        fallback: ValidationResult = {
            "valid": True,
            "errors": [str(exc)],
            "warnings": [],
            "corrections": {},
            "concepts": [],
            "score": 0.0,
        }
        return fallback
|
||||
|
||||
|
||||
def generate_section_summary(
    section_content: str,
    section_title: str,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    max_words: int = 50,
) -> str:
    """Produce a one-sentence summary of a section via an LLM.

    Args:
        section_content: Full text of the section to summarize. Only the
            first 2000 characters are sent to the LLM.
        section_title: Section title, returned as fallback when the content
            is too short (< 100 chars) or the LLM call fails.
        model: LLM model name; defaults to the provider's default model.
        provider: "ollama" (local) or "mistral" (cloud API).
        max_words: Word cap for the summary (longer outputs are truncated
            with an ellipsis).

    Returns:
        The generated summary, or ``section_title`` on short input, empty
        output, or error.
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()

    # Not enough text to justify an LLM call.
    if len(section_content) < 100:
        return section_title

    prompt: str = f"""Résume cette section en maximum {max_words} mots.
Le résumé doit capturer l'idée principale.

Titre: {section_title}
Contenu:
{section_content[:2000]}

Résumé (en une phrase):"""

    try:
        raw: str = call_llm(
            prompt, model=model, provider=provider, temperature=0.2, timeout=60
        )
        summary: str = raw.strip()

        # Enforce the word cap on the LLM output.
        tokens: List[str] = summary.split()
        if len(tokens) > max_words:
            summary = ' '.join(tokens[:max_words]) + '...'

        return summary or section_title

    except Exception as exc:
        logger.warning(f"Erreur génération résumé: {exc}")
        return section_title
|
||||
|
||||
|
||||
def enrich_chunks_with_concepts(
    chunks: List[Dict[str, Any]],
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
) -> List[Dict[str, Any]]:
    """Add a "concepts" field (3-5 key concepts) to each chunk via an LLM.

    Chunks that already carry concepts are left untouched; chunks shorter
    than 100 characters get an empty list without an LLM call.

    Args:
        chunks: Chunk dicts, each with at least a "text" key; an existing
            truthy "concepts" field causes the chunk to be skipped.
        model: LLM model name; defaults to the provider's default model.
        provider: "ollama" (local) or "mistral" (cloud API).

    Returns:
        The same list, mutated in place: every chunk ends up with a
        "concepts" list of 0-5 strings (empty on error or short text).

    Note:
        Only the first 1000 characters of each chunk are analyzed, and
        progress is logged every 10 chunks.
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()

    # Progress log cadence (one message per this many chunks).
    log_every: int = 10

    for index, chunk in enumerate(chunks):
        if chunk.get("concepts"):
            continue  # already enriched

        text: str = chunk.get("text", "")
        if len(text) < 100:
            chunk["concepts"] = []
            continue

        if index % log_every == 0:
            logger.info(f"Enrichissement concepts: chunks {index} à {min(index+log_every, len(chunks))}")

        prompt: str = f"""Extrait 3-5 concepts clés de ce texte.
Réponds avec une liste JSON: ["concept1", "concept2", ...]

Texte:
{text[:1000]}

Concepts:"""

        try:
            answer: str = call_llm(
                prompt, model=model, provider=provider, temperature=0.1, timeout=30
            )

            # Grab the first JSON list in the reply, cap at 5 concepts.
            found: Optional[Match[str]] = re.search(r'\[.*?\]', answer, re.DOTALL)
            if found:
                extracted: List[str] = json.loads(found.group())
                chunk["concepts"] = extracted[:5]
            else:
                chunk["concepts"] = []

        except Exception as exc:
            logger.warning(f"Erreur extraction concepts chunk {index}: {exc}")
            chunk["concepts"] = []

    return chunks
|
||||
|
||||
|
||||
def clean_validation_annotations(text: str) -> str:
    """Remove LLM-generated validation annotations from text.

    Cleans common annotation patterns that LLMs may add when validating
    or correcting text, such as confidence markers or verification notes.

    Patterns removed:
        - A trailing parenthesised note containing "correct", "à confirmer"
          / "a confirmer", "possiblement", or "probablement"
        - Isolated "(correct)" or "(à confirmer)" / "(a confirmer)" mid-text

    Both accented and unaccented spellings of "à confirmer" are accepted,
    since OCR and LLM output frequently drop diacritics. (The previous
    pattern only matched the accented form, so its own docstring example
    with "(a confirmer)" did not actually clean anything.)

    Args:
        text: Text potentially containing LLM annotation artifacts.

    Returns:
        Cleaned text with annotations removed and whitespace normalized.
        Returns the input unchanged if it is None or empty.

    Example:
        >>> clean_validation_annotations("Phenomenologie (a confirmer)")
        'Phenomenologie'
        >>> clean_validation_annotations("G.W.F. Hegel (correct)")
        'G.W.F. Hegel'
    """
    if not text:
        return text

    # [àa] accepts both the accented and accent-stripped spellings.
    trailing_markers = r'(?:correct|[àa] confirmer|possiblement|probablement)'

    # Remove a trailing parenthesised annotation.
    text = re.sub(
        rf'\s*\([^)]*{trailing_markers}[^)]*\)\s*$',
        '',
        text,
        flags=re.IGNORECASE,
    )

    # Also remove isolated "(correct)" / "(à confirmer)" annotations mid-text.
    text = re.sub(r'\s*\((?:correct|[àa] confirmer)\)\s*', ' ', text, flags=re.IGNORECASE)

    return text.strip()
|
||||
|
||||
|
||||
def apply_corrections(
    parsed_doc: Dict[str, Any],
    validation_result: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Apply validation corrections to a parsed document's metadata.

    Applies the title/author corrections suggested by validate_document()
    (keeping the previous values under ``original_<field>``), then scrubs
    LLM annotation artifacts from whatever metadata remains.

    Args:
        parsed_doc: Parsed document with a "metadata" dict (title, author,
            optionally "work" used as a fallback title source).
        validation_result: Output of validate_document(); only its
            "corrections" mapping is consumed. May be None.

    Returns:
        The same ``parsed_doc``, mutated in place: corrected/cleaned
        metadata plus a "validation" key holding ``validation_result``.

    Note:
        If the title itself contains validation phrases ("à confirmer",
        etc.) and a "work" field exists, the work value replaces the title.
    """
    corrections: Dict[str, str] = {}
    if validation_result:
        corrections = validation_result.get("corrections", {})

    metadata: Dict[str, Any] = parsed_doc.get("metadata", {})

    # Apply LLM-suggested corrections, preserving the previous values.
    field_labels: Dict[str, str] = {"title": "Titre", "author": "Auteur"}
    for field, label in field_labels.items():
        suggested: Optional[str] = corrections.get(field)
        if not suggested:
            continue
        previous: Optional[str] = metadata.get(field)
        cleaned: str = clean_validation_annotations(suggested)
        metadata[field] = cleaned
        metadata[f"original_{field}"] = previous
        logger.info(f"{label} corrigé: '{previous}' -> '{cleaned}'")

    # Scrub annotation artifacts from the metadata even without corrections.
    current_title: Optional[str] = metadata.get("title")
    if current_title:
        validation_phrases: List[str] = ["à confirmer", "confirmer avec", "vérifier"]
        lowered_title: str = current_title.lower()
        if any(phrase in lowered_title for phrase in validation_phrases):
            # The title reads like a validation note: prefer the "work"
            # field when available, otherwise strip the annotations.
            if metadata.get("work"):
                logger.info(f"Titre remplacé par 'work': '{current_title}' -> '{metadata['work']}'")
                metadata["original_title"] = current_title
                metadata["title"] = metadata["work"]
            else:
                metadata["title"] = clean_validation_annotations(current_title)

    current_author: Optional[str] = metadata.get("author")
    if current_author:
        metadata["author"] = clean_validation_annotations(current_author)

    parsed_doc["metadata"] = metadata
    parsed_doc["validation"] = validation_result

    return parsed_doc
|
||||
|
||||
141
generations/library_rag/utils/markdown_builder.py
Normal file
141
generations/library_rag/utils/markdown_builder.py
Normal file
@@ -0,0 +1,141 @@
|
||||
"""Markdown document builder from OCR API responses.
|
||||
|
||||
This module transforms Mistral OCR API responses into structured Markdown text.
|
||||
It handles text extraction, page marker insertion, and image processing
|
||||
(either base64 embedding or disk-based storage with relative path references).
|
||||
|
||||
The builder is a core component of the PDF processing pipeline, sitting between
|
||||
OCR extraction and hierarchical parsing.
|
||||
|
||||
Pipeline Position:
|
||||
PDF → OCR (mistral_client) → **Markdown Builder** → Hierarchy Parser → Chunks
|
||||
|
||||
Features:
|
||||
- Page markers: Inserts HTML comments (<!-- Page N -->) for traceability
|
||||
- Image handling: Supports both inline base64 and external file references
|
||||
- Type safety: Uses Protocol-based typing for OCR response structures
|
||||
|
||||
Workflow:
|
||||
1. Iterate through pages in the OCR response
|
||||
2. Extract Markdown content from each page
|
||||
3. Process images (embed as base64 or save via ImageWriter callback)
|
||||
4. Assemble the complete Markdown document
|
||||
|
||||
Image Handling Modes:
|
||||
1. **No images**: Set embed_images=False and image_writer=None
|
||||
2. **Inline base64**: Set embed_images=True (large file size)
|
||||
3. **External files**: Provide image_writer callback (recommended)
|
||||
|
||||
Example:
|
||||
>>> from pathlib import Path
|
||||
>>> from utils.image_extractor import create_image_writer
|
||||
>>>
|
||||
>>> # Create image writer for output directory
|
||||
>>> writer = create_image_writer(Path("output/my_doc/images"))
|
||||
>>>
|
||||
>>> # Build markdown with external image references
|
||||
>>> markdown = build_markdown(
|
||||
... ocr_response,
|
||||
... embed_images=False,
|
||||
... image_writer=writer
|
||||
... )
|
||||
>>> print(markdown[:100])
|
||||
<!-- Page 1 -->
|
||||
# Document Title
|
||||
...
|
||||
|
||||
Note:
|
||||
- Page indices are 1-based for human readability
|
||||
- The OCR response must follow the Mistral API structure
|
||||
- Empty pages produce only the page marker comment
|
||||
|
||||
See Also:
|
||||
- utils.mistral_client: OCR API client for obtaining responses
|
||||
- utils.image_extractor: Image writer factory and extraction
|
||||
- utils.hierarchy_parser: Next step in pipeline (structure parsing)
|
||||
"""
|
||||
|
||||
from typing import Any, Callable, List, Optional, Protocol
|
||||
|
||||
|
||||
# Image-writer callback: (page_index, image_index, base64_data) -> relative
# path of the saved file, or None when the image should be skipped.
ImageWriterCallable = Callable[[int, int, str], Optional[str]]
|
||||
|
||||
|
||||
class OCRImage(Protocol):
    """Protocol for a single image extracted by the OCR API."""

    # Base64-encoded image payload; may be None/absent in the API response.
    image_base64: Optional[str]
|
||||
|
||||
|
||||
class OCRPage(Protocol):
    """Protocol for one page extracted by the OCR API."""

    # Markdown rendering of the page text, if any.
    markdown: Optional[str]
    # Images detected on the page, if any.
    images: Optional[List[OCRImage]]
|
||||
|
||||
|
||||
class OCRResponseProtocol(Protocol):
    """Protocol for the complete Mistral OCR API response."""

    # Extracted pages, in document order.
    pages: List[OCRPage]
|
||||
|
||||
|
||||
def build_markdown(
    ocr_response: OCRResponseProtocol,
    embed_images: bool = False,
    image_writer: Optional[ImageWriterCallable] = None,
) -> str:
    """Build the full Markdown text from a Mistral OCR response.

    Args:
        ocr_response: Mistral OCR API response holding the extracted pages.
        embed_images: Embed images inline as base64 data URIs.
        image_writer: Callback used to persist images to disk.
            Signature: (page_idx, img_idx, base64_data) -> relative path.

    Returns:
        Complete Markdown text of the document with page markers and images.

    Example:
        >>> markdown = build_markdown(
        ...     ocr_response,
        ...     embed_images=False,
        ...     image_writer=lambda p, i, b64: f"images/p{p}_i{i}.png"
        ... )
    """
    md_parts: List[str] = []

    # Pages are numbered from 1 for human-readable markers.
    for page_index, page in enumerate(ocr_response.pages, start=1):
        # Page marker comment, for traceability back to the source PDF page.
        md_parts.append(f"<!-- Page {page_index} -->\n\n")

        # Markdown content of the page (an empty page emits only the marker).
        page_markdown: Optional[str] = getattr(page, "markdown", None)
        if page_markdown:
            md_parts.append(page_markdown)
            md_parts.append("\n\n")

        # Image handling: inline base64, or disk-backed via image_writer.
        page_images: Optional[List[OCRImage]] = getattr(page, "images", None)
        if page_images:
            for img_idx, img in enumerate(page_images, start=1):
                image_b64: Optional[str] = getattr(img, "image_base64", None)
                if not image_b64:
                    continue

                if embed_images:
                    # Inline base64 image (large output size).
                    data_uri: str = f"data:image/png;base64,{image_b64}"
                    # NOTE(review): this f-string emits only blank lines and
                    # data_uri is unused — the Markdown image reference (e.g.
                    # ![...](<data_uri>)) appears truncated in this copy;
                    # confirm against the original source.
                    md_parts.append(
                        f"\n\n"
                    )
                elif image_writer:
                    # Image saved to disk; reference it by relative path.
                    rel_path: Optional[str] = image_writer(page_index, img_idx, image_b64)
                    if rel_path:
                        # NOTE(review): same truncation concern as above —
                        # rel_path is computed but never interpolated here.
                        md_parts.append(
                            f"\n\n"
                        )

    return "".join(md_parts)
|
||||
169
generations/library_rag/utils/mistral_client.py
Normal file
169
generations/library_rag/utils/mistral_client.py
Normal file
@@ -0,0 +1,169 @@
|
||||
"""Mistral API Client Management.
|
||||
|
||||
This module provides utilities for managing the Mistral API client,
|
||||
including API key retrieval and OCR cost estimation. It serves as the
|
||||
foundation for all Mistral API interactions in the Library RAG pipeline.
|
||||
|
||||
Key Features:
|
||||
- Automatic API key discovery from multiple sources
|
||||
- Client instantiation with proper authentication
|
||||
- OCR cost estimation for budget planning
|
||||
|
||||
API Key Priority:
|
||||
The module searches for the Mistral API key in this order:
|
||||
1. Explicit argument passed to functions
|
||||
2. MISTRAL_API_KEY environment variable
|
||||
3. .env file in the project root
|
||||
|
||||
Cost Estimation:
|
||||
Mistral OCR pricing (as of 2024):
|
||||
- Standard OCR: ~1 EUR per 1000 pages (0.001 EUR/page)
|
||||
- OCR with annotations: ~3 EUR per 1000 pages (0.003 EUR/page)
|
||||
|
||||
Example:
|
||||
Basic client creation and usage::
|
||||
|
||||
from utils.mistral_client import create_client, estimate_ocr_cost
|
||||
|
||||
# Create authenticated client
|
||||
client = create_client()
|
||||
|
||||
# Estimate cost for a 100-page document
|
||||
cost = estimate_ocr_cost(100, use_annotations=False)
|
||||
print(f"Estimated cost: {cost:.2f} EUR") # Output: Estimated cost: 0.10 EUR
|
||||
|
||||
Using explicit API key::
|
||||
|
||||
client = create_client(api_key="your-api-key-here")
|
||||
|
||||
See Also:
|
||||
- :mod:`utils.ocr_processor`: OCR execution functions using this client
|
||||
- :mod:`utils.pdf_uploader`: PDF upload utilities for OCR processing
|
||||
|
||||
Note:
|
||||
Ensure MISTRAL_API_KEY is set before using this module in production.
|
||||
The API key can be obtained from the Mistral AI platform dashboard.
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from mistralai import Mistral
|
||||
|
||||
|
||||
def get_api_key(api_key: Optional[str] = None) -> str:
    """Resolve the Mistral API key from the available sources.

    Sources are consulted in priority order:

    1. The explicit ``api_key`` argument.
    2. The ``MISTRAL_API_KEY`` environment variable.
    3. A ``.env`` file loaded from the project root.

    Args:
        api_key: Optional key supplied directly by the caller. A non-blank
            value short-circuits every other source.

    Returns:
        The API key, stripped of surrounding whitespace.

    Raises:
        RuntimeError: If no key can be found in any source.

    Example:
        >>> get_api_key("my-api-key")
        'my-api-key'
    """
    # Source 1: explicit argument.
    if api_key is not None:
        candidate = api_key.strip()
        if candidate:
            return candidate

    # Source 2: process environment.
    from_env = os.getenv("MISTRAL_API_KEY", "").strip()
    if from_env:
        return from_env

    # Source 3: .env file — load_dotenv() populates the environment, retry.
    load_dotenv()
    from_env = os.getenv("MISTRAL_API_KEY", "").strip()
    if from_env:
        return from_env

    raise RuntimeError(
        "MISTRAL_API_KEY manquante. "
        "Definissez la variable d environnement ou creez un fichier .env"
    )
|
||||
|
||||
|
||||
def create_client(api_key: Optional[str] = None) -> Mistral:
    """Build an authenticated Mistral client.

    Primary entry point for obtaining a client instance usable for OCR,
    chat completions and other Mistral API features. The key is resolved
    through :func:`get_api_key`, so it may come from the explicit argument,
    the environment, or a ``.env`` file.

    Args:
        api_key: Optional API key. When omitted, resolution falls back to
            the environment / .env file.

    Returns:
        A ready-to-use authenticated Mistral client.

    Raises:
        RuntimeError: If no API key can be resolved (from get_api_key).

    Example:
        >>> client = create_client()
        >>> response = client.ocr.process(...)  # doctest: +SKIP
    """
    return Mistral(api_key=get_api_key(api_key))
|
||||
|
||||
|
||||
def estimate_ocr_cost(nb_pages: int, use_annotations: bool = False) -> float:
    """Estimate the OCR processing cost for a document, in euros.

    Useful for budget planning before processing large document collections.

    Pricing model (Mistral OCR):
        - Standard OCR: 0.001 EUR per page (~1 EUR / 1000 pages).
        - OCR with annotations: 0.003 EUR per page (~3 EUR / 1000 pages),
          roughly 3x the standard rate, in exchange for structural metadata
          useful for TOC extraction.

    Args:
        nb_pages: Number of pages in the document to process.
        use_annotations: If True, apply the higher annotation-mode rate.

    Returns:
        Estimated cost in euros.

    Example:
        >>> estimate_ocr_cost(100)
        0.1
        >>> estimate_ocr_cost(100, use_annotations=True)
        0.3
        >>> estimate_ocr_cost(10000)
        10.0
    """
    per_page_rate = 0.003 if use_annotations else 0.001
    return nb_pages * per_page_rate
|
||||
312
generations/library_rag/utils/ocr_processor.py
Normal file
312
generations/library_rag/utils/ocr_processor.py
Normal file
@@ -0,0 +1,312 @@
|
||||
"""OCR Processing via Mistral API.
|
||||
|
||||
This module provides functions for executing OCR (Optical Character Recognition)
|
||||
on PDF documents using the Mistral API. It handles both standard OCR and advanced
|
||||
OCR with structured annotations for better document understanding.
|
||||
|
||||
Key Features:
|
||||
- Standard OCR for text extraction with optional image embedding
|
||||
- Advanced OCR with document and bounding box annotations
|
||||
- Response serialization for JSON storage and further processing
|
||||
- Support for page-by-page processing
|
||||
|
||||
OCR Modes:
|
||||
1. **Standard OCR** (run_ocr):
|
||||
- Extracts text and optionally images
|
||||
- Cost: ~1 EUR per 1000 pages (0.001 EUR/page)
|
||||
- Best for: Simple text extraction, content indexing
|
||||
|
||||
2. **OCR with Annotations** (run_ocr_with_annotations):
|
||||
- Extracts text with structural metadata (bounding boxes, document structure)
|
||||
- Cost: ~3 EUR per 1000 pages (0.003 EUR/page)
|
||||
- Best for: TOC extraction, layout analysis, structured documents
|
||||
- Document annotations limited to 8 pages max
|
||||
- Bounding box annotations have no page limit
|
||||
|
||||
Response Structure:
|
||||
The OCR response contains:
|
||||
- pages: List of page objects with text content
|
||||
- images: Optional base64-encoded images (if include_images=True)
|
||||
- annotations: Structural metadata (if using annotation mode)
|
||||
|
||||
Example:
|
||||
Basic OCR processing::
|
||||
|
||||
from utils.mistral_client import create_client
|
||||
from utils.ocr_processor import run_ocr, serialize_ocr_response
|
||||
|
||||
# Create client and read PDF
|
||||
client = create_client()
|
||||
with open("document.pdf", "rb") as f:
|
||||
pdf_bytes = f.read()
|
||||
|
||||
# Run OCR
|
||||
response = run_ocr(client, pdf_bytes, "document.pdf")
|
||||
|
||||
# Serialize for storage
|
||||
ocr_dict = serialize_ocr_response(response)
|
||||
print(f"Extracted {len(ocr_dict['pages'])} pages")
|
||||
|
||||
Cost Considerations:
|
||||
- Always estimate costs before batch processing with estimate_ocr_cost()
|
||||
- Use pages parameter to limit processing when full document is not needed
|
||||
- Annotation mode is 3x more expensive - use only when structure is needed
|
||||
- Cache OCR results to avoid reprocessing (saved in output/<doc>/<doc>.json)
|
||||
|
||||
See Also:
|
||||
- utils.mistral_client: Client creation and cost estimation
|
||||
- utils.pdf_uploader: PDF upload utilities
|
||||
- utils.pdf_pipeline: Full pipeline orchestration
|
||||
|
||||
Note:
|
||||
OCR responses are Pydantic models from the Mistral SDK. Use
|
||||
serialize_ocr_response() to convert to dictionaries before JSON storage.
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Any, Dict, List, Optional, Type
|
||||
|
||||
from mistralai import Mistral
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .pdf_uploader import upload_pdf
|
||||
from .types import OCRResponse
|
||||
|
||||
|
||||
def run_ocr(
    client: Mistral,
    file_bytes: bytes,
    filename: str,
    include_images: bool = True,
) -> Any:
    """Run standard OCR on a PDF document via the Mistral API.

    The document is first uploaded to Mistral storage, then processed with
    the ``mistral-ocr-latest`` model. This is the cheapest OCR mode
    (~0.001 EUR/page), suited to plain text extraction and content indexing.

    Args:
        client: Authenticated Mistral client (see utils.mistral_client).
        file_bytes: Raw binary content of the PDF file.
        filename: Original file name, used to identify the upload.
        include_images: When True, each page's images are returned as
            base64 strings; set to False to shrink the response when
            images are not needed.

    Returns:
        The raw OCR response (Pydantic model) with a ``pages`` list and,
        optionally, embedded base64 images. Use serialize_ocr_response()
        to turn it into a plain dictionary.

    Raises:
        RuntimeError: If the Mistral client is not properly authenticated.
        HTTPError: If the API request fails (network issues, rate limits).

    Example:
        >>> from utils.mistral_client import create_client
        >>> client = create_client()
        >>> with open("document.pdf", "rb") as f:
        ...     pdf_bytes = f.read()
        >>> response = run_ocr(client, pdf_bytes, "document.pdf")
        >>> first_page_text = response.pages[0].markdown  # doctest: +SKIP

    Note:
        The uploaded file is cleaned up by Mistral after processing.
    """
    # Upload first: the OCR endpoint consumes a signed document URL.
    signed_url: str = upload_pdf(client, file_bytes, filename)

    return client.ocr.process(
        model="mistral-ocr-latest",
        document={"type": "document_url", "document_url": signed_url},
        include_image_base64=include_images,
    )
|
||||
|
||||
|
||||
def run_ocr_with_annotations(
    client: Mistral,
    file_bytes: bytes,
    filename: str,
    include_images: bool = True,
    document_annotation_format: Optional[Type[BaseModel]] = None,
    bbox_annotation_format: Optional[Type[BaseModel]] = None,
    pages: Optional[List[int]] = None,
) -> Any:
    """Run OCR with structured annotations on a PDF document.

    In addition to plain text, this mode extracts structured metadata whose
    shape is described by Pydantic schemas (converted to JSON schema before
    being sent to the API). It costs roughly 3x the standard OCR rate
    (~0.003 EUR/page).

    Two annotation flavours can be combined:
    - Document annotations: document-level structure, limited to 8 pages.
    - Bounding-box annotations: per-element positions, no page limit.

    Args:
        client: Authenticated Mistral client.
        file_bytes: Raw binary content of the PDF file.
        filename: Original file name, used to identify the upload.
        include_images: When True, page images are included as base64.
        document_annotation_format: Pydantic model describing the expected
            document-level annotations (8-page maximum applies).
        bbox_annotation_format: Pydantic model describing the expected
            bounding-box annotations (no page limit).
        pages: Optional list of 0-indexed pages to process; None processes
            the whole document. Use it to cap cost and processing time.

    Returns:
        The raw OCR response including an ``annotations`` payload matching
        the supplied schemas. Use serialize_ocr_response() for a dict.

    Raises:
        RuntimeError: If the Mistral client is not properly authenticated.
        HTTPError: If the API request fails (network issues, rate limits).
        ValueError: If document annotations are requested for more than
            8 pages.

    Example:
        >>> response = run_ocr_with_annotations(
        ...     client,
        ...     pdf_bytes,
        ...     "book.pdf",
        ...     document_annotation_format=TOCEntry,
        ...     pages=[0, 1, 2, 3, 4, 5, 6, 7],
        ... )  # doctest: +SKIP
        >>> toc_data = response.annotations  # doctest: +SKIP
    """
    # Imported at call time (mirrors the SDK's optional "extra" helpers)
    # to keep the module import light.
    from mistralai.extra import response_format_from_pydantic_model

    # Upload first: the OCR endpoint consumes a signed document URL.
    signed_url: str = upload_pdf(client, file_bytes, filename)

    # Assemble the call incrementally: optional arguments are only added
    # when provided so the API receives a minimal payload.
    call_args: Dict[str, Any] = {
        "model": "mistral-ocr-latest",
        "document": {"type": "document_url", "document_url": signed_url},
        "include_image_base64": include_images,
    }
    if pages is not None:
        call_args["pages"] = pages
    if document_annotation_format is not None:
        call_args["document_annotation_format"] = response_format_from_pydantic_model(
            document_annotation_format
        )
    if bbox_annotation_format is not None:
        call_args["bbox_annotation_format"] = response_format_from_pydantic_model(
            bbox_annotation_format
        )

    return client.ocr.process(**call_args)
|
||||
|
||||
|
||||
def serialize_ocr_response(response: Any) -> Dict[str, Any]:
    """Convert a Mistral OCR response object into a plain dictionary.

    The SDK returns Pydantic model objects; this helper normalises them to
    a JSON-serialisable dict so results can be written to disk
    (output/<doc>/<doc>.json) or passed further down the pipeline.

    Serialization methods are tried in order of preference:

    1. ``model_dump()`` — Pydantic v2 (preferred).
    2. ``dict()`` — Pydantic v1 compatibility.
    3. ``json()`` — fallback, parsed back into a dict.

    Args:
        response: OCR response object exposing at least one of the methods
            above.

    Returns:
        A dictionary representation of the response, typically containing
        pages, optional base64 images, the model name and usage statistics.

    Raises:
        TypeError: If none of the supported serialization methods exist.

    Example:
        >>> ocr_dict = serialize_ocr_response(response)  # doctest: +SKIP
        >>> num_pages = len(ocr_dict["pages"])  # doctest: +SKIP
    """
    # Methods that directly produce a dict, most recent API first.
    for method_name in ("model_dump", "dict"):
        if hasattr(response, method_name):
            result: Dict[str, Any] = getattr(response, method_name)()
            return result

    # Last resort: serialize to a JSON string, then parse it back.
    if hasattr(response, "json"):
        return json.loads(response.json())

    raise TypeError("Réponse OCR non sérialisable")
|
||||
|
||||
|
||||
55
generations/library_rag/utils/ocr_schemas.py
Normal file
55
generations/library_rag/utils/ocr_schemas.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""Schémas Pydantic pour l'extraction structurée via OCR avec annotations.
|
||||
|
||||
Utilisés avec document_annotation_format et bbox_annotation_format de l'API Mistral.
|
||||
"""
|
||||
|
||||
from typing import List, Optional
|
||||
from pydantic import BaseModel, Field
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class TocEntryType(str, Enum):
    """Kind of a table-of-contents entry.

    Subclasses ``str`` so members compare and serialize as their plain
    string values (useful when the schema is converted to JSON for the
    annotation API).
    """
    CHAPTER = "chapter"          # numbered chapter
    SECTION = "section"          # regular section (default in TocEntry)
    SUBSECTION = "subsection"    # nested sub-section
    PREAMBLE = "preamble"        # preface / introduction material
    APPENDIX = "appendix"        # annex / appendix
|
||||
|
||||
|
||||
class TocEntry(BaseModel):
    """A single table-of-contents entry with hierarchy information.

    NOTE(review): the French ``description`` strings below are emitted in
    the JSON schema sent to the Mistral annotation API and effectively act
    as extraction prompts — do not translate or reword them without
    re-validating extraction quality.
    """
    # Exact section title as printed in the table of contents.
    title: str = Field(..., description="Titre exact de la section tel qu'il apparaît dans la table des matières")
    # Printed page number (not the sequential PDF index).
    page_number: int = Field(..., description="Numéro de page réel tel qu'imprimé/affiché dans le livre (PAS l'index séquentiel du PDF, mais le numéro visible sur la page elle-même)")
    # Hierarchy level inferred from the visual indentation (1 = top level).
    level: int = Field(..., description="""Niveau hiérarchique détecté VISUELLEMENT dans la mise en page de la table des matières:
- level=1 si le titre est aligné à gauche SANS indentation (titres principaux)
- level=2 si le titre a une PETITE indentation ou est légèrement décalé vers la droite
- level=3 si le titre a une DOUBLE indentation ou est très décalé vers la droite
Regardez attentivement l'alignement horizontal et les espaces avant chaque titre pour déterminer le niveau.""")
    # Semantic kind of the entry; defaults to a generic section.
    entry_type: TocEntryType = Field(default=TocEntryType.SECTION, description="Type d'entrée: 'preamble' pour préfaces/introductions, 'chapter' pour chapitres, 'section' pour sections, 'subsection' pour sous-sections, 'appendix' pour annexes")
    # Title of the direct parent entry when level > 1.
    parent_title: Optional[str] = Field(None, description="Si level > 1, indiquer le titre du parent direct (l'entrée de level=1 sous laquelle cette entrée est indentée)")
|
||||
|
||||
|
||||
class DocumentTOC(BaseModel):
    """Complete table of contents of a document.

    NOTE(review): the French ``description`` strings are extraction
    prompts for the Mistral annotation API — keep them verbatim.
    """
    # Ordered list of every TOC entry found in the document.
    entries: List[TocEntry] = Field(..., description="""Liste COMPLÈTE de TOUTES les entrées de la table des matières dans l'ordre d'apparition.
IMPORTANT : Analysez attentivement l'indentation/alignement horizontal de chaque titre pour assigner le bon niveau hiérarchique:
- Les titres alignés à gauche (non indentés) = level 1
- Les titres légèrement indentés/décalés vers la droite = level 2 (sous-sections du titre level 1 précédent)
- Les titres avec double indentation = level 3 (sous-sections du titre level 2 précédent)
Chaque entrée doit avoir son vrai numéro de page tel qu'imprimé dans le livre.""")
    # Whether an explicit, visible TOC exists in the document.
    has_explicit_toc: bool = Field(..., description="Le document contient-il une table des matières explicite et visible ? (généralement en début de document)")
    # Page numbers on which the TOC itself is printed.
    toc_page_numbers: List[int] = Field(..., description="Liste des numéros de pages où se trouve la table des matières (généralement pages 2-5)")
|
||||
|
||||
|
||||
class DocumentMetadata(BaseModel):
    """Enriched metadata for a document, extracted via OCR annotations.

    NOTE(review): the French ``description`` strings are extraction
    prompts for the Mistral annotation API — keep them verbatim.
    """
    # Full document title.
    title: str = Field(..., description="Titre complet du document")
    # Main author of the document.
    author: str = Field(..., description="Auteur principal du document")
    # ISO 639-1 language codes present in the document (e.g. ['fr', 'en']).
    languages: List[str] = Field(..., description="Liste des langues présentes dans le document (codes ISO 639-1, ex: ['fr', 'en'])")
    # Two-to-three sentence summary.
    summary: str = Field(..., description="Résumé du document en 2-3 phrases maximum")
    # Editorial collection / series, when applicable.
    collection: Optional[str] = Field(None, description="Nom de la collection ou série éditoriale")
    # Publisher name, when known.
    publisher: Optional[str] = Field(None, description="Nom de l'éditeur")
    # Publication year, when known.
    year: Optional[int] = Field(None, description="Année de publication")
    # Total page count of the document.
    total_pages: int = Field(..., description="Nombre total de pages dans le document")
    # Structured TOC with hierarchy and real printed page numbers.
    toc: DocumentTOC = Field(..., description="Table des matières structurée avec hiérarchie et numéros de page réels")
|
||||
1439
generations/library_rag/utils/pdf_pipeline.py
Normal file
1439
generations/library_rag/utils/pdf_pipeline.py
Normal file
File diff suppressed because it is too large
Load Diff
31
generations/library_rag/utils/pdf_uploader.py
Normal file
31
generations/library_rag/utils/pdf_uploader.py
Normal file
@@ -0,0 +1,31 @@
|
||||
"""Upload de fichiers PDF vers l'API Mistral."""
|
||||
|
||||
from mistralai import Mistral
|
||||
|
||||
|
||||
def upload_pdf(client: Mistral, file_bytes: bytes, filename: str) -> str:
    """Upload a PDF to Mistral storage and return its signed URL.

    The file is uploaded with purpose ``"ocr"`` and a signed download URL
    is then retrieved so the OCR endpoint can reference the document.

    Args:
        client: Authenticated Mistral client.
        file_bytes: Raw binary content of the PDF file.
        filename: File name forwarded to the upload API.

    Returns:
        The signed URL of the uploaded document.
    """
    uploaded_file = client.files.upload(
        file={"file_name": filename, "content": file_bytes},
        purpose="ocr",
    )
    # Exchange the file id for a signed, downloadable URL.
    return client.files.get_signed_url(file_id=uploaded_file.id).url
|
||||
|
||||
|
||||
382
generations/library_rag/utils/toc_enricher.py
Normal file
382
generations/library_rag/utils/toc_enricher.py
Normal file
@@ -0,0 +1,382 @@
|
||||
"""TOC Enrichment Module for Chunk Metadata Enhancement.
|
||||
|
||||
This module provides functions to enrich chunk metadata with hierarchical
|
||||
information from the table of contents (TOC). It matches chunks to their
|
||||
corresponding TOC entries and extracts:
|
||||
- Full hierarchical paths (e.g., "Peirce: CP 1.628 > 628. It is...")
|
||||
- Chapter titles
|
||||
- Canonical academic references (e.g., "CP 1.628", "Ménon 80a")
|
||||
|
||||
The enrichment happens before Weaviate ingestion to ensure chunks have
|
||||
complete metadata for rigorous academic citation.
|
||||
|
||||
Usage:
|
||||
>>> from utils.toc_enricher import enrich_chunks_with_toc
|
||||
>>> enriched_chunks = enrich_chunks_with_toc(chunks, toc, hierarchy)
|
||||
|
||||
See Also:
|
||||
- utils.types: FlatTOCEntryEnriched type definition
|
||||
- utils.weaviate_ingest: Integration point for enrichment
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from .types import FlatTOCEntryEnriched
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def flatten_toc_with_paths(
    toc: List[Dict[str, Any]],
    hierarchy: Dict[str, Any],
) -> List[FlatTOCEntryEnriched]:
    """Flatten a hierarchical or flat TOC and build full paths with metadata.

    Handles both hierarchical TOCs (with 'children' keys) and flat TOCs
    (where parent-child relationships are inferred from the 'level' field).

    Traverses the TOC structure and creates enriched flat entries with:
    - Full hierarchical path (e.g., "Peirce: CP 1.628 > 628. It is...")
    - Canonical reference extraction (e.g., "CP 1.628")
    - Chapter title tracking (first level 1 ancestor)
    - Parent title list for context

    Args:
        toc: TOC structure with 'title' and 'level' fields, optionally 'children'
        hierarchy: Document hierarchy (currently unused, reserved for future)

    Returns:
        List of enriched flat TOC entries with full metadata.

    Example:
        >>> toc = [
        ...     {"title": "Peirce: CP 1.628", "level": 1},
        ...     {"title": "628. It is the instincts...", "level": 2}
        ... ]
        >>> flat = flatten_toc_with_paths(toc, {})
        >>> flat[1]["full_path"]
        'Peirce: CP 1.628 > 628. It is the instincts...'
        >>> flat[1]["canonical_ref"]
        'CP 1.628'
    """
    flat_toc: List[FlatTOCEntryEnriched] = []

    # A single 'children' key anywhere marks the TOC as hierarchical;
    # otherwise the hierarchy is reconstructed from each entry's 'level'.
    is_hierarchical = any("children" in entry for entry in toc if entry)

    if is_hierarchical:
        # Original recursive approach for hierarchical TOCs.
        def traverse(
            entries: List[Dict[str, Any]],
            parent_titles: List[str],
            current_chapter: str,
            current_canonical: Optional[str],
        ) -> None:
            """Recursively traverse TOC entries and append to flat_toc (closure)."""
            for entry in entries:
                title = entry.get("title", "")
                level = entry.get("level", 0)
                children = entry.get("children", [])

                # Build full path from parents + current title.
                full_path_parts = parent_titles + [title]
                full_path = " > ".join(full_path_parts)

                # Extract canonical reference if present in the title.
                # CP pattern: Peirce "Collected Papers" refs like "CP 1.628".
                # Stephanus pattern: word + number + letter, e.g. "Ménon 80a".
                # NOTE(review): the Stephanus regex is loose (any word + "12a"
                # shape matches) — presumably acceptable for this corpus.
                canonical_ref = current_canonical
                cp_match = re.search(r'CP\s+(\d+\.\d+)', title)
                stephanus_match = re.search(r'(\w+\s+\d+[a-z])', title)

                if cp_match:
                    canonical_ref = f"CP {cp_match.group(1)}"
                elif stephanus_match:
                    canonical_ref = stephanus_match.group(1)

                # Update chapter title when entering a level-1 entry;
                # descendants inherit it via the recursive call below.
                chapter_title = current_chapter
                if level == 1:
                    chapter_title = title

                # Create the enriched flat entry. index_in_flat_list records
                # the position at append time, used later for proximity matching.
                enriched_entry: FlatTOCEntryEnriched = {
                    "title": title,
                    "level": level,
                    "full_path": full_path,
                    "chapter_title": chapter_title,
                    "canonical_ref": canonical_ref,
                    "parent_titles": parent_titles.copy(),
                    "index_in_flat_list": len(flat_toc),
                }
                flat_toc.append(enriched_entry)

                # Recursively process children with this entry as parent.
                if children:
                    traverse(
                        children,
                        parent_titles + [title],
                        chapter_title,
                        canonical_ref,
                    )

        traverse(toc, [], "", None)
    else:
        # Iterative approach for flat TOCs: infer hierarchy from levels
        # using a stack of ancestors (dicts of level/title/canonical_ref).
        parent_stack: List[Dict[str, Any]] = []
        current_chapter = ""
        current_canonical: Optional[str] = None

        for entry in toc:
            title = entry.get("title", "")
            level = entry.get("level", 1)

            # Pop ancestors at the same or deeper level: the current entry
            # becomes a sibling/uncle of those, not a child.
            while parent_stack and parent_stack[-1]["level"] >= level:
                parent_stack.pop()

            # Build parent titles list from the remaining ancestors.
            parent_titles = [p["title"] for p in parent_stack]

            # Build full path.
            full_path_parts = parent_titles + [title]
            full_path = " > ".join(full_path_parts)

            # Extract canonical reference if present in the title
            # (same CP / Stephanus patterns as the recursive branch).
            cp_match = re.search(r'CP\s+(\d+\.\d+)', title)
            stephanus_match = re.search(r'(\w+\s+\d+[a-z])', title)

            if cp_match:
                current_canonical = f"CP {cp_match.group(1)}"
            elif stephanus_match:
                current_canonical = stephanus_match.group(1)
            elif level == 1:
                # Reset the canonical ref at level 1 if none was found.
                # NOTE(review): entries at level > 1 without their own ref
                # keep the value set by a previous sibling rather than
                # resetting — looks intentional (inheritance), but confirm.
                current_canonical = None

            # Inherit the canonical ref from the nearest ancestor when the
            # entry has none of its own.
            if not current_canonical and parent_stack:
                current_canonical = parent_stack[-1].get("canonical_ref")

            # Update the running chapter title when at level 1.
            if level == 1:
                current_chapter = title

            # Create the enriched flat entry (see notes in the recursive
            # branch about index_in_flat_list).
            enriched_entry: FlatTOCEntryEnriched = {
                "title": title,
                "level": level,
                "full_path": full_path,
                "chapter_title": current_chapter,
                "canonical_ref": current_canonical,
                "parent_titles": parent_titles.copy(),
                "index_in_flat_list": len(flat_toc),
            }
            flat_toc.append(enriched_entry)

            # Push the current entry so following deeper entries see it
            # as their parent.
            parent_stack.append({
                "level": level,
                "title": title,
                "canonical_ref": current_canonical,
            })

    return flat_toc
|
||||
|
||||
|
||||
def extract_paragraph_number(section_text: str) -> Optional[str]:
    """Extract an academic paragraph number from a section title.

    Recognized formats, checked in order:
        - Leading number: "628. Text..." -> "628"
        - Section symbol: "§42 Text..." -> "42"
        - Stephanus pagination: "80a. Text..." -> "80a"
        - Collected Papers ref: "CP 5.628. Text..." -> "628"

    Args:
        section_text: Section title or path text (may be empty).

    Returns:
        The extracted paragraph number as a string, or None when no
        format matches.

    Example:
        >>> extract_paragraph_number("628. It is the instincts...")
        '628'
        >>> extract_paragraph_number("§42 On the nature of...")
        '42'
        >>> extract_paragraph_number("80a. SOCRATE: Sais-tu...")
        '80a'
    """
    if not section_text:
        return None

    # All patterns anchor at the start of the text; the first that
    # matches wins, mirroring the priority of the formats above.
    numbering_patterns = (
        r'^(\d+[a-z]?)\.\s',      # "628. Text" / "80a. Text"
        r'^§\s*(\d+[a-z]?)\s',    # "§42 Text"
        r'^CP\s+\d+\.(\d+)\.\s',  # "CP 5.628. Text" -> paragraph part only
    )
    for pattern in numbering_patterns:
        found = re.match(pattern, section_text)
        if found:
            return found.group(1)

    return None
|
||||
|
||||
|
||||
def find_matching_toc_entry(
    chunk: Dict[str, Any],
    flat_toc: List[FlatTOCEntryEnriched],
) -> Optional[FlatTOCEntryEnriched]:
    """Find the TOC entry that best corresponds to a chunk.

    Matching strategies, tried in priority order:

    1. **Exact title match**: the chunk's section equals a TOC title.
    2. **Paragraph-number match**: paragraph numbers extracted from both
       sides are compared (level-2 entries only), with a light text
       similarity check to disambiguate duplicated numbers.
    3. **Proximity match**: fallback heuristic picking the TOC entry whose
       flat index is closest to the chunk's ``order_index``.

    Args:
        chunk: Chunk dict with 'section' (or 'sectionPath') and optionally
            'order_index' fields.
        flat_toc: Flattened TOC produced by :func:`flatten_toc_with_paths`.

    Returns:
        The best matching enriched TOC entry, or None when the TOC is
        empty or the chunk carries no section information.

    Example:
        >>> chunk = {"section": "628. It is the instincts...", "order_index": 42}
        >>> toc_entry = find_matching_toc_entry(chunk, flat_toc)  # doctest: +SKIP
        >>> toc_entry["canonical_ref"]  # doctest: +SKIP
        'CP 1.628'
    """
    if not flat_toc:
        return None

    section_name = chunk.get("section", chunk.get("sectionPath", ""))
    if not section_name:
        return None

    # Strategy 1: exact title match.
    exact_hit = next((e for e in flat_toc if e["title"] == section_name), None)
    if exact_hit is not None:
        return exact_hit

    # Strategy 2: paragraph-number match, restricted to level-2 entries
    # (the actual content sections).
    section_para = extract_paragraph_number(section_name)
    if section_para:
        for entry in flat_toc:
            if entry["level"] != 2:
                continue
            if extract_paragraph_number(entry["title"]) != section_para:
                continue
            # Disambiguate identical paragraph numbers by checking whether
            # the first significant word of the chunk appears in the title.
            section_words = [w for w in section_name.split() if len(w) > 3]
            entry_words = [w for w in entry["title"].split() if len(w) > 3]
            if not (section_words and entry_words):
                # Nothing to compare textually: accept the number match.
                return entry
            if section_words[0].lower() in entry["title"].lower():
                return entry

    # Strategy 3: proximity heuristic — assume the TOC order roughly
    # follows the chunk order and pick the entry with the closest index.
    chunk_order = chunk.get("order_index")
    if chunk_order is not None:
        return min(
            flat_toc,
            key=lambda e: abs(e["index_in_flat_list"] - chunk_order),
        )

    return None
|
||||
|
||||
|
||||
def enrich_chunks_with_toc(
    chunks: List[Dict[str, Any]],
    toc: List[Dict[str, Any]],
    hierarchy: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """Enrich chunks with hierarchical metadata derived from the TOC.

    Orchestrates the enrichment pass:
    1. Bails out early when no TOC is available (graceful no-op).
    2. Flattens the hierarchical TOC once so per-chunk matching stays cheap.
    3. Matches every chunk against the flattened TOC.
    4. Writes ``sectionPath``, ``chapterTitle`` and ``canonicalReference``
       into each matched chunk (in place).

    Args:
        chunks: Chunk dicts produced by pdf_pipeline.
        toc: Hierarchical TOC structure (may be empty).
        hierarchy: Document hierarchy dict (may be empty).

    Returns:
        The same chunk list, metadata updated in place. When the TOC is
        empty or cannot be flattened, the chunks are returned untouched
        (no regression).
    """
    # Graceful fallback: without a TOC there is nothing to enrich.
    if not toc:
        logger.info("No TOC available, skipping chunk enrichment")
        return chunks

    logger.info(f"Enriching {len(chunks)} chunks with TOC metadata...")

    # Flatten once up front; a failure here degrades to a no-op.
    try:
        flat_toc = flatten_toc_with_paths(toc, hierarchy)
        logger.info(f"Flattened TOC: {len(flat_toc)} entries")
    except Exception as e:
        logger.error(f"Failed to flatten TOC: {e}")
        return chunks  # Fallback on error

    enriched_count = 0
    for chunk in chunks:
        entry = find_matching_toc_entry(chunk, flat_toc)
        if not entry:
            continue

        # Full hierarchical path and owning chapter.
        chunk["sectionPath"] = entry["full_path"]
        chunk["chapterTitle"] = entry["chapter_title"]

        # The canonical reference is optional on the TOC side.
        if entry["canonical_ref"]:
            chunk["canonicalReference"] = entry["canonical_ref"]

        enriched_count += 1

    if chunks:
        logger.info(
            f"Enriched {enriched_count}/{len(chunks)} chunks "
            f"({100 * enriched_count / len(chunks):.1f}%)"
        )
    else:
        logger.info("No chunks to enrich")

    return chunks
|
||||
260
generations/library_rag/utils/toc_extractor.py
Normal file
260
generations/library_rag/utils/toc_extractor.py
Normal file
@@ -0,0 +1,260 @@
|
||||
"""Table of Contents (TOC) extraction using Mistral OCR with annotations.
|
||||
|
||||
This module is the **primary entry point** for TOC extraction in the Library RAG
|
||||
pipeline. It provides intelligent routing between two extraction strategies:
|
||||
|
||||
1. **Visual (bbox) Analysis** (default, recommended): Uses bounding box coordinates
|
||||
to detect indentation and hierarchy based on horizontal positioning.
|
||||
2. **Semantic (annotation) Analysis**: Uses Mistral's document_annotation_format
|
||||
for structured metadata and TOC extraction.
|
||||
|
||||
The visual approach is more reliable for philosophical texts with complex
|
||||
hierarchies (parts, chapters, sections, subsections).
|
||||
|
||||
Extraction Strategies:
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ extract_toc_from_annotations(use_visual_bbox=True) │
|
||||
│ ↓ (default) │
|
||||
│ toc_extractor_visual.py → X-coordinate based hierarchy │
|
||||
│ │
|
||||
│ extract_toc_from_annotations(use_visual_bbox=False) │
|
||||
│ ↓ │
|
||||
│ DocumentMetadata Pydantic schema → Structured extraction │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
|
||||
Cost Considerations:
|
||||
- Annotated OCR: ~0.003€/page (3x standard OCR cost)
|
||||
- Only first N pages are processed (default: 8)
|
||||
- Total cost: max_toc_pages × 0.003€
|
||||
|
||||
Output Structure:
|
||||
{
|
||||
"success": bool,
|
||||
"metadata": {...}, # Document metadata
|
||||
"toc": [...], # Hierarchical TOC (nested children)
|
||||
"toc_flat": [...], # Flat list with levels
|
||||
"cost_ocr_annotated": float
|
||||
}
|
||||
|
||||
Example:
|
||||
>>> from pathlib import Path
|
||||
>>> from utils.toc_extractor import extract_toc_from_annotations
|
||||
>>>
|
||||
>>> # Extract TOC using visual analysis (recommended)
|
||||
>>> result = extract_toc_from_annotations(
|
||||
... pdf_path=Path("input/philosophy_book.pdf"),
|
||||
... max_toc_pages=8,
|
||||
... use_visual_bbox=True # default
|
||||
... )
|
||||
>>> if result["success"]:
|
||||
... for entry in result["toc"]:
|
||||
... print(f"{entry['title']} (p.{entry['page']})")
|
||||
|
||||
Functions:
|
||||
- extract_toc_from_annotations(): Main entry point with strategy routing
|
||||
- build_hierarchical_toc(): Converts flat TOC entries to nested structure
|
||||
- map_toc_to_content(): Associates TOC entries with document content
|
||||
|
||||
See Also:
|
||||
- utils.toc_extractor_visual: Visual/bbox-based extraction (default)
|
||||
- utils.toc_extractor_markdown: Markdown indentation-based extraction
|
||||
- utils.llm_toc: LLM-based TOC extraction (alternative approach)
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional, Union, cast
|
||||
from pathlib import Path
|
||||
|
||||
from .ocr_schemas import DocumentMetadata, TocEntry
|
||||
from .ocr_processor import run_ocr_with_annotations
|
||||
from .mistral_client import create_client
|
||||
|
||||
logger: logging.Logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Lightweight pseudo-alias for hierarchical TOC nodes.
# NOTE(review): subclassing Dict[str, Any] enforces nothing about the keys,
# at runtime or statically; the expected shape (per build_hierarchical_toc)
# is {"title": str, "page": int, "level": int, "type": str, "children": list}.
class TOCNode(Dict[str, Any]):
    """Type alias for TOC node structure with title, page, level, type, children.

    Instances behave exactly like plain dicts; this class exists only to
    make signatures more readable.
    """
    pass
|
||||
|
||||
|
||||
def extract_toc_from_annotations(
    pdf_path: Path,
    api_key: Optional[str] = None,
    max_toc_pages: int = 8,
    use_visual_bbox: bool = True,  # NEW: use visual (bbox) analysis by default
) -> Dict[str, Any]:
    """Extract the structured TOC via OCR with annotations.

    Cost: 3€/1000 pages for annotated pages (vs 1€/1000 for basic OCR).

    Args:
        pdf_path: Path of the PDF file.
        api_key: Mistral API key (optional, otherwise loaded from .env).
        max_toc_pages: Max number of pages to annotate (default 8, API limit
            for document_annotation).
        use_visual_bbox: If True, use visual bounding-box analysis (more
            reliable); otherwise fall back to the semantic
            document_annotation_format strategy.

    Returns:
        Dict with:
            - success: bool
            - metadata: dict of enriched metadata
            - toc: hierarchical list [{title, page, level, children}]
            - toc_flat: flat list [{title, page, level, type, parent_title}]
            - cost_ocr_annotated: float (cost in €)
            - error: str (on failure)
    """
    # If requested, delegate entirely to the visual (bbox) approach.
    if use_visual_bbox:
        logger.info("Utilisation de l'analyse visuelle (bbox) pour extraction TOC")
        # Function-scope import — presumably deferred to avoid a circular
        # import with toc_extractor_visual; confirm before hoisting.
        from .toc_extractor_visual import extract_toc_with_visual_analysis
        return cast(Dict[str, Any], extract_toc_with_visual_analysis(pdf_path, api_key, max_toc_pages))

    # Otherwise continue with the semantic approach (document_annotation_format).
    try:
        client = create_client(api_key)
        pdf_bytes = pdf_path.read_bytes()
    except Exception as e:
        logger.error(f"Erreur initialisation client/lecture PDF : {e}")
        return {"success": False, "error": f"Initialisation échouée : {str(e)}"}

    # Phase 1: annotate the first pages to extract TOC + metadata.
    logger.info(f"Extraction TOC avec annotations sur {max_toc_pages} premières pages")

    try:
        annotated_response = run_ocr_with_annotations(
            client=client,
            file_bytes=pdf_bytes,
            filename=pdf_path.name,
            include_images=False,  # Images are not needed for the TOC
            document_annotation_format=DocumentMetadata,
            pages=list(range(max_toc_pages)),  # Pages 0 to max_toc_pages-1
        )
    except Exception as e:
        logger.error(f"Erreur appel OCR avec annotations : {e}")
        return {"success": False, "error": f"Appel OCR échoué : {str(e)}"}

    # Pull the document-level annotations off the response.
    doc_annotation = getattr(annotated_response, "document_annotation", None)

    if not doc_annotation:
        return {"success": False, "error": "Aucune annotation retournée par l'API"}

    # Convert to a dictionary (the API may return a JSON string or a dict).
    try:
        if isinstance(doc_annotation, str):
            metadata_dict = json.loads(doc_annotation)
        else:
            metadata_dict = doc_annotation
    except Exception as e:
        logger.error(f"Erreur parsing annotations : {e}")
        return {"success": False, "error": f"Parsing annotations échoué : {str(e)}"}

    # Validate with Pydantic.
    try:
        metadata = DocumentMetadata(**metadata_dict)
        toc_entries = metadata.toc.entries

        logger.info(f"TOC extraite : {len(toc_entries)} entrées")

        # Build the hierarchical TOC from the flat, levelled entries.
        hierarchical_toc = build_hierarchical_toc(toc_entries)

        return {
            "success": True,
            "metadata": metadata.model_dump(),
            "toc": hierarchical_toc,
            "toc_flat": [entry.model_dump() for entry in toc_entries],
            "cost_ocr_annotated": max_toc_pages * 0.003,  # 3€/1000 pages
        }
    except Exception as e:
        logger.error(f"Erreur validation annotations : {e}")
        return {"success": False, "error": f"Validation Pydantic échouée : {str(e)}"}
|
||||
|
||||
|
||||
def build_hierarchical_toc(entries: List[TocEntry]) -> List[Dict[str, Any]]:
    """Build a hierarchical TOC from flat entries carrying levels.

    A stack of currently-open nodes tracks the active branch: each entry
    is attached as a child of the nearest preceding entry with a strictly
    smaller level, or to the root when no such entry exists.

    Args:
        entries: TocEntry items with level (1 = root, 2 = child of 1, ...).

    Returns:
        Nested TOC with structure [{title, page, level, type, children: [...]}].
    """
    if not entries:
        return []

    roots: List[Dict[str, Any]] = []
    open_branch: List[Dict[str, Any]] = []  # nodes from root down to current leaf

    for entry in entries:
        node: Dict[str, Any] = {
            "title": entry.title,
            "page": entry.page_number,
            "level": entry.level,
            "type": entry.entry_type.value,
            "children": [],
        }

        # Close every branch at the same or a deeper level; whatever is
        # left on top of the stack (if anything) is this entry's parent.
        while open_branch and open_branch[-1]["level"] >= entry.level:
            open_branch.pop()

        if open_branch:
            open_branch[-1]["children"].append(node)
        else:
            roots.append(node)

        # Keep this node open so following deeper entries attach to it.
        open_branch.append(node)

    return roots
|
||||
|
||||
|
||||
def map_toc_to_content(
    toc_entries: List[TocEntry],
    all_pages_markdown: str,
) -> Dict[str, str]:
    """Associate TOC entries with the actual document content.

    Splits the full markdown on its ``<!-- Page N -->`` markers and slices
    the resulting page list using each entry's real page number: a section
    spans from its own page up to (excluding) the next entry's page; the
    last section runs to the end of the document.

    Args:
        toc_entries: TOC entries carrying real page numbers.
        all_pages_markdown: Full document markdown with ``<!-- Page N -->``
            markers.

    Returns:
        Mapping {section_title: content_text}.
    """
    # Split on the marker prefix; pages[k] then starts with "k -->...".
    pages: List[str] = all_pages_markdown.split("<!-- Page ")

    def page_body(raw: str) -> str:
        # Drop the remainder of the marker ("N -->") and surrounding blanks.
        return raw.split("-->", 1)[-1].strip()

    content_map: Dict[str, str] = {}
    total = len(toc_entries)

    for i, entry in enumerate(toc_entries):
        first = entry.page_number
        # End boundary: next entry's page number, or the end of the document.
        last = toc_entries[i + 1].page_number if i < total - 1 else len(pages)

        pieces = [
            page_body(pages[idx])
            for idx in range(first, last)
            if idx < len(pages)
        ]
        content_map[entry.title] = "\n\n".join(pieces)

    return content_map
|
||||
303
generations/library_rag/utils/toc_extractor_markdown.py
Normal file
303
generations/library_rag/utils/toc_extractor_markdown.py
Normal file
@@ -0,0 +1,303 @@
|
||||
"""TOC extraction via Markdown indentation analysis.
|
||||
|
||||
This module provides a **cost-free** TOC extraction strategy that works on
|
||||
already-generated Markdown text. Unlike the OCR annotation approach, this
|
||||
method doesn't require additional API calls.
|
||||
|
||||
Strategy:
|
||||
1. Search for "Table des matières" heading in the first N lines
|
||||
2. Parse lines matching pattern: "Title.....Page" or "Title Page"
|
||||
3. Detect hierarchy from leading whitespace (indentation)
|
||||
4. Build nested TOC structure using stack-based algorithm
|
||||
|
||||
When to Use:
|
||||
- When OCR has already been performed (markdown available)
|
||||
- When cost optimization is critical (no additional API calls)
|
||||
- For documents with clear indentation in the TOC
|
||||
|
||||
Limitations:
|
||||
- Requires French "Table des matières" header (can be extended)
|
||||
- Indentation detection may be less accurate than visual/bbox analysis
|
||||
- Only works if OCR preserved whitespace accurately
|
||||
|
||||
Indentation Levels:
|
||||
- 0-2 spaces: Level 1 (main chapters/parts)
|
||||
- 3-6 spaces: Level 2 (sections)
|
||||
- 7+ spaces: Level 3 (subsections)
|
||||
|
||||
Output Structure:
|
||||
{
|
||||
"success": bool,
|
||||
"toc": [...], # Hierarchical TOC
|
||||
"toc_flat": [...], # Flat entries with levels
|
||||
"cost_ocr_annotated": 0.0, # No additional cost
|
||||
"method": "markdown_indentation"
|
||||
}
|
||||
|
||||
Example:
|
||||
>>> from utils.toc_extractor_markdown import extract_toc_from_markdown
|
||||
>>>
|
||||
>>> markdown = '''
|
||||
... # Table des matières
|
||||
... Introduction.............................5
|
||||
... Première partie..........................10
|
||||
... Chapitre 1............................15
|
||||
... Chapitre 2............................25
|
||||
... Deuxième partie..........................50
|
||||
... '''
|
||||
>>> result = extract_toc_from_markdown(markdown)
|
||||
>>> if result["success"]:
|
||||
... print(f"Found {len(result['toc_flat'])} entries")
|
||||
Found 5 entries
|
||||
|
||||
Functions:
|
||||
- extract_toc_from_markdown(): Main extraction from markdown text
|
||||
- build_hierarchy(): Converts flat entries to nested structure
|
||||
|
||||
See Also:
|
||||
- utils.toc_extractor: Main entry point (routes to visual by default)
|
||||
- utils.toc_extractor_visual: More accurate X-position based extraction
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional, TypedDict, Union
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Type definitions for internal data structures
|
||||
class MarkdownTOCEntryRaw(TypedDict):
    """Raw TOC entry extracted from markdown with indentation info."""
    title: str           # Entry title, leader dots and page number stripped
    page_number: int     # Printed page number parsed from the end of the line
    level: int           # Hierarchical level (1-3) derived from leading_spaces
    leading_spaces: int  # Raw indentation width; also emitted in debug logs
|
||||
|
||||
|
||||
class MarkdownTOCNode(TypedDict):
    """Hierarchical TOC node with children."""
    title: str                       # Entry title
    page: int                        # Printed page number
    level: int                       # Hierarchical level (1 = root)
    type: str                        # Entry kind; build_hierarchy always sets "section"
    children: List[MarkdownTOCNode]  # Nested child nodes (possibly empty)
|
||||
|
||||
|
||||
class MarkdownTOCFlatEntry(TypedDict):
    """Flat TOC entry with parent information."""
    title: str                   # Entry title
    page_number: int             # Printed page number
    level: int                   # Hierarchical level (1-3)
    entry_type: str              # Entry kind; markdown extraction always uses "section"
    parent_title: Optional[str]  # Parent title; markdown extraction always emits None
|
||||
|
||||
|
||||
class MarkdownTOCResultSuccess(TypedDict):
    """Successful TOC extraction result."""
    success: bool                         # Runtime discriminant; always True in this variant
    metadata: Dict[str, Any]              # Empty dict (parity with the other extractors)
    toc: List[MarkdownTOCNode]            # Hierarchical (nested) TOC
    toc_flat: List[MarkdownTOCFlatEntry]  # Flat entries with levels
    cost_ocr_annotated: float             # 0.0 — no additional API call is made
    method: str                           # Always "markdown_indentation"
|
||||
|
||||
|
||||
class MarkdownTOCResultError(TypedDict):
    """Failed TOC extraction result."""
    success: bool  # Runtime discriminant; always False in this variant
    error: str     # Human-readable failure message (French)
|
||||
|
||||
|
||||
# Union type for function return
|
||||
MarkdownTOCResult = Union[MarkdownTOCResultSuccess, MarkdownTOCResultError]
|
||||
|
||||
|
||||
def extract_toc_from_markdown(
    markdown_text: str,
    max_lines: int = 200,
) -> MarkdownTOCResult:
    """Extract table of contents by analyzing raw markdown text.

    Detects hierarchy by counting leading spaces (indentation) at the
    beginning of each line. This is a cost-free alternative to OCR
    annotation-based extraction.

    Args:
        markdown_text: Complete markdown text of the document.
        max_lines: Maximum number of lines to analyze (searches TOC at start).

    Returns:
        Dictionary with hierarchical TOC structure. On success, includes:
            - success: True
            - metadata: Empty dict (for consistency with other extractors)
            - toc: Hierarchical nested TOC structure
            - toc_flat: Flat list of entries with levels
            - cost_ocr_annotated: 0.0 (no additional cost)
            - method: "markdown_indentation"
        On failure, includes:
            - success: False
            - error: Error message string

    Example:
        >>> markdown = '''
        ... # Table des matières
        ... Introduction.....5
        ... Part One........10
        ...     Chapter 1.....15
        ... '''
        >>> result = extract_toc_from_markdown(markdown)
        >>> if result["success"]:
        ...     print(len(result["toc_flat"]))
        3
    """
    logger.info("Extraction TOC depuis markdown (analyse indentation)")

    lines: List[str] = markdown_text.split('\n')[:max_lines]

    # Find the "Table des matières" heading within the analyzed window.
    toc_start: Optional[int] = None
    for i, line in enumerate(lines):
        if re.search(r'table\s+des\s+mati[èe]res', line, re.IGNORECASE):
            toc_start = i + 1
            logger.info(f"TOC trouvée à la ligne {i}")
            break

    if toc_start is None:
        logger.warning("Aucune table des matières trouvée dans le markdown")
        return MarkdownTOCResultError(
            success=False,
            error="Table des matières introuvable"
        )

    # Extract TOC entries from the lines following the heading.
    entries: List[MarkdownTOCEntryRaw] = []

    for line in lines[toc_start:toc_start + 100]:  # Max 100 lines of TOC
        line_stripped: str = line.strip()
        if not line_stripped or line_stripped.startswith('#') or line_stripped.startswith('---'):
            continue

        # Indentation must be measured BEFORE strip(). `line` already is the
        # original, un-stripped line, so count its leading spaces directly.
        # (Bug fix: the previous `lines[lines.index(line)]` lookup returned
        # the FIRST occurrence of a duplicated line, yielding a wrong
        # indentation — and therefore a wrong level — for repeated titles.)
        leading_spaces: int = len(line) - len(line.lstrip())

        # Pattern "Title.....Page" (dotted leaders), then a fallback
        # without leaders: "Title Page".
        match: Optional[re.Match[str]] = re.match(r'^(.+?)\s*\.{2,}\s*(\d+)\s*$', line_stripped)
        if not match:
            match = re.match(r'^(.+?)\s+(\d+)\s*$', line_stripped)

        if match:
            title: str = match.group(1).strip()
            page: int = int(match.group(2))

            # Ignore lines too short or that don't look like titles.
            if len(title) < 3 or title.isdigit():
                continue

            # Map indentation width to a hierarchical level:
            # 0-2 spaces = level 1, 3-6 spaces = level 2, 7+ spaces = level 3.
            level: int
            if leading_spaces <= 2:
                level = 1
            elif leading_spaces <= 6:
                level = 2
            else:
                level = 3

            entries.append(MarkdownTOCEntryRaw(
                title=title,
                page_number=page,
                level=level,
                leading_spaces=leading_spaces,
            ))

            logger.debug(f"  '{title}' → {leading_spaces} espaces → level {level} (page {page})")

    if not entries:
        logger.warning("Aucune entrée TOC extraite")
        return MarkdownTOCResultError(
            success=False,
            error="Aucune entrée TOC trouvée"
        )

    logger.info(f"✅ {len(entries)} entrées extraites depuis markdown")

    # Build hierarchy
    toc: List[MarkdownTOCNode] = build_hierarchy(entries)

    return MarkdownTOCResultSuccess(
        success=True,
        metadata={},
        toc=toc,
        toc_flat=[
            MarkdownTOCFlatEntry(
                title=e["title"],
                page_number=e["page_number"],
                level=e["level"],
                entry_type="section",
                parent_title=None,
            )
            for e in entries
        ],
        cost_ocr_annotated=0.0,  # No additional cost, uses existing OCR
        method="markdown_indentation",
    )
|
||||
|
||||
|
||||
def build_hierarchy(entries: List[MarkdownTOCEntryRaw]) -> List[MarkdownTOCNode]:
    """Build hierarchical structure from flat entries based on levels.

    Walks the flat list once while keeping a stack of currently-open
    nodes: each new entry closes every node at its own level or deeper,
    then attaches itself to the node left on top (its parent), or to the
    root when the stack is empty.

    Args:
        entries: List of raw TOC entries with title, page, and level.

    Returns:
        Nested list of TOC nodes where each node contains children.

    Example:
        >>> entries = [
        ...     {"title": "Part 1", "page_number": 1, "level": 1, "leading_spaces": 0},
        ...     {"title": "Chapter 1", "page_number": 5, "level": 2, "leading_spaces": 4},
        ... ]
        >>> hierarchy = build_hierarchy(entries)
        >>> len(hierarchy[0]["children"])
        1
    """
    roots: List[MarkdownTOCNode] = []
    open_nodes: List[MarkdownTOCNode] = []

    for raw in entries:
        node = MarkdownTOCNode(
            title=raw["title"],
            page=raw["page_number"],
            level=raw["level"],
            type="section",
            children=[],
        )

        # Close siblings and deeper branches; the parent (if any) remains on top.
        while open_nodes and open_nodes[-1]["level"] >= node["level"]:
            open_nodes.pop()

        target = open_nodes[-1]["children"] if open_nodes else roots
        target.append(node)

        # Leave this node open so deeper entries can attach to it.
        open_nodes.append(node)

    return roots
|
||||
512
generations/library_rag/utils/toc_extractor_visual.py
Normal file
512
generations/library_rag/utils/toc_extractor_visual.py
Normal file
@@ -0,0 +1,512 @@
|
||||
"""Visual TOC extraction using bounding box X-coordinate analysis.
|
||||
|
||||
This module provides the **most accurate** TOC extraction strategy for
|
||||
philosophical texts by analyzing the horizontal position (X-coordinate)
|
||||
of each TOC entry. This approach is more reliable than text indentation
|
||||
analysis because it directly measures visual layout.
|
||||
|
||||
How It Works:
|
||||
1. OCR with annotations extracts text + bounding box positions
|
||||
2. Pydantic schema (TocEntryBbox) captures title, page, and x_position
|
||||
3. X-coordinates are clustered to identify distinct indentation levels
|
||||
4. Hierarchy is built based on relative X-positions
|
||||
|
||||
X-Position Interpretation:
|
||||
The x_position is normalized between 0.0 (left edge) and 1.0 (right edge):
|
||||
|
||||
- x ≈ 0.05-0.12: Level 1 (no indentation, main parts/chapters)
|
||||
- x ≈ 0.13-0.22: Level 2 (small indentation, sections)
|
||||
- x ≈ 0.23-0.35: Level 3 (double indentation, subsections)
|
||||
|
||||
Positions within 0.03 tolerance are grouped into the same level.
|
||||
|
||||
Advantages over Markdown Analysis:
|
||||
- Works regardless of OCR whitespace accuracy
|
||||
- More reliable for complex hierarchies
|
||||
- Handles both printed and handwritten indentation
|
||||
|
||||
Cost:
|
||||
- Uses OCR with annotations: ~0.003€/page
|
||||
- Only processes first N pages (default: 8)
|
||||
|
||||
Pydantic Schemas:
|
||||
- TocEntryBbox: Single TOC entry with text, page_number, x_position
|
||||
- DocumentTocBbox: Container for list of entries
|
||||
|
||||
Output Structure:
|
||||
{
|
||||
"success": bool,
|
||||
"metadata": {...},
|
||||
"toc": [...], # Hierarchical TOC
|
||||
"toc_flat": [...], # Flat entries with levels
|
||||
"cost_ocr_annotated": float,
|
||||
"method": "visual_x_position"
|
||||
}
|
||||
|
||||
Example:
|
||||
>>> from pathlib import Path
|
||||
>>> from utils.toc_extractor_visual import extract_toc_with_visual_analysis
|
||||
>>>
|
||||
>>> result = extract_toc_with_visual_analysis(
|
||||
... pdf_path=Path("input/philosophy_book.pdf"),
|
||||
... max_toc_pages=8
|
||||
... )
|
||||
>>> if result["success"]:
|
||||
... for entry in result["toc"]:
|
||||
... indent = " " * (entry["level"] - 1)
|
||||
... print(f"{indent}{entry['title']} (p.{entry['page']})")
|
||||
|
||||
Algorithm Details:
|
||||
1. Collect all x_position values from OCR response
|
||||
2. Sort and cluster positions (tolerance: 0.03)
|
||||
3. Compute cluster centroids as level thresholds
|
||||
4. Assign level to each entry based on nearest centroid
|
||||
5. Build hierarchy using stack-based approach
|
||||
|
||||
Functions:
|
||||
- extract_toc_with_visual_analysis(): Main extraction function
|
||||
- build_hierarchy_from_bbox(): Converts entries with X-positions to hierarchy
|
||||
- flatten_toc(): Flattens hierarchical TOC for storage
|
||||
|
||||
See Also:
|
||||
- utils.toc_extractor: Main entry point (routes here by default)
|
||||
- utils.toc_extractor_markdown: Alternative cost-free extraction
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Type, TypedDict, Union
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from .mistral_client import create_client
|
||||
from .ocr_processor import run_ocr_with_annotations
|
||||
|
||||
logger: logging.Logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TocEntryBbox(BaseModel):
    """TOC entry with bounding box for visual detection.

    Attributes:
        text: Complete entry text as it appears in the table of contents.
            Example: 'Presentation' or 'What is virtue?' or 'Meno or on virtue'.
            DO NOT include leader dots or page number in this field.
        page_number: Actual page number as printed in the book (the visible number
            on the right in the TOC). Example: if the line says 'Presentation.....3',
            extract the number 3. This is the BOOK page number, not the PDF index.
        x_position: Horizontal position (X coordinate) of the text start, normalized
            between 0 and 1. This is the CRUCIAL COORDINATE for detecting indentation:
            - x ≈ 0.05-0.12 = left-aligned title, NOT indented (hierarchical level 1)
            - x ≈ 0.13-0.22 = title with SMALL indentation (hierarchical level 2)
            - x ≈ 0.23-0.35 = title with DOUBLE indentation (hierarchical level 3)
            Measure precisely where the first character of the title begins.
    """
    # NOTE(review): the French Field descriptions below are runtime strings —
    # the schema is presumably passed as an annotation format so these texts
    # act as extraction instructions for the model; keep them verbatim and
    # do not translate them (confirm with ocr_processor before changing).
    text: str = Field(..., description="""Texte COMPLET de l'entrée tel qu'il apparaît dans la table des matières.
Exemple: 'Présentation' ou 'Qu'est-ce que la vertu ?' ou 'Ménon ou de la vertu'.
NE PAS inclure les points de suite ni le numéro de page dans ce champ.""")
    page_number: int = Field(..., description="""Numéro de page réel tel qu'imprimé dans le livre (le numéro visible à droite dans la TOC).
Exemple: si la ligne dit 'Présentation.....3', extraire le nombre 3.
C'est le numéro de page du LIVRE, pas l'index PDF.""")
    x_position: float = Field(..., description="""Position horizontale (coordonnée X) du début du texte, normalisée entre 0 et 1.
C'est LA COORDONNÉE CRUCIALE pour détecter l'indentation:
- x ≈ 0.05-0.12 = titre aligné à gauche, NON indenté (niveau hiérarchique 1)
- x ≈ 0.13-0.22 = titre avec PETITE indentation (niveau hiérarchique 2)
- x ≈ 0.23-0.35 = titre avec DOUBLE indentation (niveau hiérarchique 3)
Mesurer précisément où commence le premier caractère du titre.""")
|
||||
|
||||
|
||||
class DocumentTocBbox(BaseModel):
    """Schema for extracting all TOC entries with their positions.

    Attributes:
        entries: Complete list of ALL entries found in the table of contents.
            For EACH line in the TOC, extract:
            1. The title text (without leader dots)
            2. The page number (the number on the right)
            3. The exact horizontal X position of the title start (to detect indentation)

            Include ALL entries, even those that appear to be at the same visual level.
    """

    # NOTE(review): the Field description is a runtime string that presumably
    # serves as extraction instructions for the annotation model — keep it
    # verbatim; edits here change model behavior, not just documentation.
    entries: List[TocEntryBbox] = Field(
        ...,
        description="""Complete list of ALL entries found in the table of contents.
For EACH line in the TOC, extract:
1. The title text (without leader dots)
2. The page number (the number on the right)
3. The exact horizontal X position of the title start (to detect indentation)

Include ALL entries, even those that appear to be at the same visual level.""",
    )
|
||||
|
||||
|
||||
# TypedDict classes for structured return types
|
||||
class VisualTOCMetadata(TypedDict):
    """Metadata extracted from the document.

    Attributes:
        title: Document title.
        author: Document author.
        languages: List of languages present in the document.
        summary: Brief document summary.
    """

    title: str            # Document title
    author: str           # Document author
    languages: List[str]  # Languages present in the document
    summary: str          # Brief document summary
|
||||
|
||||
|
||||
class VisualTOCNode(TypedDict):
    """Hierarchical TOC node.

    Attributes:
        title: Entry title text.
        page: Page number in the book.
        level: Hierarchical level (1 = top level, 2 = subsection, etc.).
        type: Entry type (e.g., "section", "chapter").
        children: List of child nodes.
    """

    title: str
    page: int
    level: int
    type: str
    children: List[VisualTOCNode]  # Recursive: nested sub-entries (may be empty)
|
||||
|
||||
|
||||
class VisualTOCFlatEntry(TypedDict):
    """Flattened TOC entry for storage.

    Attributes:
        title: Entry title text.
        page_number: Page number in the book.
        level: Hierarchical level.
        entry_type: Entry type (e.g., "section", "chapter").
        parent_title: Title of the parent entry, if any.
    """

    title: str
    page_number: int
    level: int
    entry_type: str
    parent_title: Optional[str]  # None when the entry sits at root level
|
||||
|
||||
|
||||
class VisualTOCResultSuccess(TypedDict):
    """Successful TOC extraction result.

    Attributes:
        success: Always True for the success case.
        metadata: Document metadata (fallback values in visual mode).
        toc: Hierarchical TOC structure.
        toc_flat: Flattened TOC entries.
        cost_ocr_annotated: Estimated OCR processing cost in euros
            (0.003 per analyzed page).
        method: Extraction method identifier ("visual_x_position").
    """

    success: bool
    metadata: VisualTOCMetadata
    toc: List[VisualTOCNode]
    toc_flat: List[VisualTOCFlatEntry]
    cost_ocr_annotated: float
    method: str
|
||||
|
||||
|
||||
class VisualTOCResultError(TypedDict):
    """Failed TOC extraction result.

    Attributes:
        success: Always False for the error case.
        error: Error message describing the failure.
    """

    success: bool
    error: str
|
||||
|
||||
|
||||
# Union type for the function return: extract_toc_with_visual_analysis()
# returns either a success payload or an error payload, discriminated at
# runtime by the "success" key.
VisualTOCResult = Union[VisualTOCResultSuccess, VisualTOCResultError]
|
||||
|
||||
|
||||
class VisualTOCEntryInternal(TypedDict):
    """Internal representation of a TOC entry during processing.

    Attributes:
        text: Entry title text.
        page_number: Page number in the book.
        x_position: Normalized X position (0.0 to 1.0).
        x_start: Same value as x_position (working alias used while
            clustering indentation levels).
        page: Same value as page_number (working alias).
        level: Computed hierarchical level.
    """

    text: str
    page_number: int
    x_position: float
    x_start: float
    page: int
    level: int
|
||||
|
||||
|
||||
def extract_toc_with_visual_analysis(
    pdf_path: Path,
    api_key: Optional[str] = None,
    max_toc_pages: int = 8,
) -> VisualTOCResult:
    """Extract a TOC by visually analyzing bounding boxes.

    Runs OCR with document annotations on the first ``max_toc_pages`` pages,
    then derives the hierarchy of the entries from their horizontal (X)
    alignment: entries indented further to the right become subsections.

    Args:
        pdf_path: Path to the PDF file.
        api_key: Mistral API key (optional, uses environment variable if
            not provided).
        max_toc_pages: Number of leading pages to analyze (default: 8).

    Returns:
        On success: dict with metadata, hierarchical ``toc``, flat
        ``toc_flat``, estimated ``cost_ocr_annotated`` and ``method``.
        On failure: dict with ``success=False`` and an ``error`` message.
        Errors are always returned in the result dict, never raised.

    Example:
        >>> from pathlib import Path
        >>> result = extract_toc_with_visual_analysis(Path("book.pdf"))
        >>> if result["success"]:
        ...     print(f"Extracted {len(result['toc'])} top-level entries")
        ... else:
        ...     print(f"Error: {result['error']}")
    """
    # Initialization: API client plus raw PDF bytes.
    try:
        mistral = create_client(api_key)
        raw_pdf: bytes = pdf_path.read_bytes()
    except Exception as e:
        logger.error(f"Initialization error: {e}")
        return {"success": False, "error": str(e)}

    logger.info(f"Visual TOC extraction on {max_toc_pages} pages")

    # OCR pass with document_annotation_format to obtain entry bboxes.
    try:
        ocr_response = run_ocr_with_annotations(
            client=mistral,
            file_bytes=raw_pdf,
            filename=pdf_path.name,
            include_images=False,
            document_annotation_format=DocumentTocBbox,
            pages=list(range(max_toc_pages)),
        )
    except Exception as e:
        logger.error(f"OCR with annotations error: {e}")
        return {"success": False, "error": f"OCR failed: {e}"}

    annotation: Any = getattr(ocr_response, "document_annotation", None)
    if not annotation:
        return {"success": False, "error": "No annotation returned"}

    # Parse annotation payload and build the hierarchy from X coordinates.
    try:
        payload: Any = (
            json.loads(annotation) if isinstance(annotation, str) else annotation
        )
        raw_entries: List[Dict[str, Any]] = (
            payload.get("entries", []) if isinstance(payload, dict) else payload
        )

        hierarchy_nodes: List[VisualTOCNode] = build_hierarchy_from_bbox(raw_entries)
        logger.info(f"TOC extracted visually: {len(hierarchy_nodes)} entries")

        # Visual mode yields no enriched metadata; fall back to basic values.
        basic_metadata: VisualTOCMetadata = {
            "title": pdf_path.stem,
            "author": "Unknown author",
            "languages": [],
            "summary": "",
        }

        success_result: VisualTOCResultSuccess = {
            "success": True,
            "metadata": basic_metadata,
            "toc": hierarchy_nodes,
            "toc_flat": flatten_toc(hierarchy_nodes),
            "cost_ocr_annotated": max_toc_pages * 0.003,
            "method": "visual_x_position",
        }
        return success_result
    except Exception as e:
        logger.error(f"Bbox parsing error: {e}")
        return {"success": False, "error": f"Parsing failed: {e}"}
|
||||
|
||||
|
||||
def build_hierarchy_from_bbox(entries: List[Dict[str, Any]]) -> List[VisualTOCNode]:
    """Build TOC hierarchy from X positions (indentation).

    Detects the hierarchical level by analyzing the horizontal X coordinate.
    Clusters nearby X positions (tolerance 0.03) to identify distinct
    indentation levels, then builds a tree using a stack-based approach.

    Fix over the previous version: the caller's ``entries`` dicts are no
    longer mutated (the old code wrote ``x_start``/``page``/``level`` keys
    back into the input); all processing happens on internal copies.

    Args:
        entries: List of entries with x_position field. Each entry should have:
            - text: Entry title
            - page_number: Page number
            - x_position: Normalized X coordinate (0.0 to 1.0)

    Returns:
        Hierarchical TOC structure as a list of nodes. Each node contains:
        - title: Entry title (stripped)
        - page: Page number
        - level: Hierarchical level (1, 2, 3, ...)
        - type: Entry type (always "section")
        - children: List of child nodes

    Example:
        >>> entries = [
        ...     {"text": "Chapter 1", "page_number": 1, "x_position": 0.1},
        ...     {"text": "Section 1.1", "page_number": 2, "x_position": 0.2},
        ... ]
        >>> hierarchy = build_hierarchy_from_bbox(entries)
        >>> hierarchy[0]["children"][0]["title"]
        'Section 1.1'
    """
    # Local import keeps the block self-contained; getLogger(__name__)
    # returns the same logger the module uses elsewhere.
    import logging

    log = logging.getLogger(__name__)

    if not entries:
        return []

    # Normalize entries into internal records WITHOUT mutating caller dicts.
    entry_list: List[VisualTOCEntryInternal] = []
    for entry in entries:
        x_start: float = entry.get("x_position", 0.1)
        page_num: int = entry.get("page_number", 0)
        entry_list.append(
            {
                "text": entry.get("text", ""),
                "page_number": page_num,
                "x_position": x_start,
                "x_start": x_start,
                "page": page_num,
                "level": 0,  # assigned after clustering below
            }
        )

    # Find unique indentation thresholds.
    x_positions: List[float] = sorted({e["x_start"] for e in entry_list})

    if not x_positions:
        # Defensive: cannot happen with a non-empty entry_list, kept for safety.
        log.warning("No X position detected")
        return []

    # Group nearby positions (tolerance 0.03 to normalize small variations);
    # each group's mean becomes one indentation level.
    x_levels: List[float] = []
    current_group: List[float] = [x_positions[0]]

    for x in x_positions[1:]:
        if x - current_group[-1] < 0.03:
            current_group.append(x)
        else:
            x_levels.append(sum(current_group) / len(current_group))
            current_group = [x]

    if current_group:
        x_levels.append(sum(current_group) / len(current_group))

    log.info(
        f"Indentation levels detected (X positions): {[f'{x:.3f}' for x in x_levels]}"
    )

    # Assign each entry the level whose cluster mean is closest to its X.
    for entry_item in entry_list:
        x_val: float = entry_item["x_start"]
        level: int = (
            min(range(len(x_levels)), key=lambda i: abs(x_levels[i] - x_val)) + 1
        )
        entry_item["level"] = level
        log.debug(f" '{entry_item.get('text', '')}' -> X={x_val:.3f} -> level {level}")

    # Build hierarchy: a stack holds the current ancestor chain.
    toc: List[VisualTOCNode] = []
    stack: List[VisualTOCNode] = []

    for entry_item in entry_list:
        node: VisualTOCNode = {
            "title": entry_item.get("text", "").strip(),
            "page": entry_item["page"],
            "level": entry_item["level"],
            "type": "section",
            "children": [],
        }

        # Pop until the stack top is a strict ancestor (smaller level).
        while stack and stack[-1]["level"] >= node["level"]:
            stack.pop()

        if stack:
            stack[-1]["children"].append(node)
        else:
            toc.append(node)

        stack.append(node)

    return toc
|
||||
|
||||
|
||||
def flatten_toc(toc: List[VisualTOCNode]) -> List[VisualTOCFlatEntry]:
    """Flatten a hierarchical TOC into an ordered list.

    Converts a nested TOC structure into a flat list of entries in
    document (preorder) order, preserving parent-child relationships
    through the parent_title field.

    Args:
        toc: Hierarchical TOC structure (list of VisualTOCNode).

    Returns:
        Flat list of TOC entries with parent references.

    Example:
        >>> toc = [{
        ...     "title": "Chapter 1",
        ...     "page": 1,
        ...     "level": 1,
        ...     "type": "section",
        ...     "children": [{
        ...         "title": "Section 1.1",
        ...         "page": 2,
        ...         "level": 2,
        ...         "type": "section",
        ...         "children": []
        ...     }]
        ... }]
        >>> flat = flatten_toc(toc)
        >>> len(flat)
        2
        >>> flat[1]["parent_title"]
        'Chapter 1'
    """
    flat: List[VisualTOCFlatEntry] = []

    # Depth-first traversal with an explicit stack of (node, parent_title)
    # pairs; children are pushed in reverse so they pop in original order.
    pending: List[tuple] = [(node, None) for node in reversed(toc)]
    while pending:
        node, parent = pending.pop()
        flat.append(
            {
                "title": node["title"],
                "page_number": node["page"],
                "level": node["level"],
                "entry_type": node["type"],
                "parent_title": parent,
            }
        )
        for child in reversed(node.get("children") or []):
            pending.append((child, node["title"]))

    return flat
|
||||
|
||||
1218
generations/library_rag/utils/types.py
Normal file
1218
generations/library_rag/utils/types.py
Normal file
File diff suppressed because it is too large
Load Diff
815
generations/library_rag/utils/weaviate_ingest.py
Normal file
815
generations/library_rag/utils/weaviate_ingest.py
Normal file
@@ -0,0 +1,815 @@
|
||||
"""Weaviate document ingestion module for the Library RAG pipeline.
|
||||
|
||||
This module handles the ingestion of processed documents (chunks, metadata,
|
||||
summaries) into the Weaviate vector database. It supports the V3.0 schema
|
||||
with nested objects for efficient semantic search.
|
||||
|
||||
Architecture:
|
||||
The module uses four Weaviate collections:
|
||||
|
||||
- **Work**: Represents a literary/philosophical work (title, author, year)
|
||||
- **Document**: A specific edition/version of a work (sourceId, pages, TOC)
|
||||
- **Chunk**: Text chunks with vectorized content for semantic search
|
||||
- **Summary**: Section summaries with vectorized concepts
|
||||
|
||||
Chunks and Summaries use nested objects to reference their parent
|
||||
Work and Document, avoiding data duplication while enabling
|
||||
efficient filtering.
|
||||
|
||||
Batch Operations:
|
||||
The module uses Weaviate insert_many() for efficient batch insertion.
|
||||
Chunks are prepared as a list and inserted in a single operation,
|
||||
which is significantly faster than individual insertions.
|
||||
|
||||
Nested Objects:
|
||||
Each Chunk contains nested work and document objects::
|
||||
|
||||
{
|
||||
"text": "La justice est une vertu...",
|
||||
"work": {"title": "La Republique", "author": "Platon"},
|
||||
"document": {"sourceId": "platon_republique", "edition": "GF"}
|
||||
}
|
||||
|
||||
This enables filtering like: document.sourceId == "platon_republique"
|
||||
|
||||
Typical Usage:
|
||||
>>> from utils.weaviate_ingest import ingest_document, delete_document_chunks
|
||||
>>>
|
||||
>>> # Ingest a processed document
|
||||
>>> result = ingest_document(
|
||||
... doc_name="platon_republique",
|
||||
... chunks=[{"text": "La justice est...", "section": "Livre I"}],
|
||||
... metadata={"title": "La Republique", "author": "Platon"},
|
||||
... language="fr",
|
||||
... )
|
||||
>>> print(f"Ingested {result['count']} chunks")
|
||||
|
||||
Connection:
|
||||
The module connects to a local Weaviate instance using:
|
||||
|
||||
- HTTP port: 8080
|
||||
- gRPC port: 50051
|
||||
|
||||
Ensure Weaviate is running via: docker-compose up -d
|
||||
|
||||
See Also:
|
||||
- schema.py: Weaviate schema definitions
|
||||
- pdf_pipeline.py: Document processing pipeline
|
||||
- flask_app.py: Web interface for search
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from contextlib import contextmanager
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Dict, Generator, List, Optional, TypedDict
|
||||
|
||||
import weaviate
|
||||
from weaviate import WeaviateClient
|
||||
from weaviate.collections import Collection
|
||||
import weaviate.classes.query as wvq
|
||||
|
||||
# Import type definitions from central types module
|
||||
from utils.types import WeaviateIngestResult as IngestResult
|
||||
|
||||
# Import TOC enrichment functions
|
||||
from .toc_enricher import enrich_chunks_with_toc
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Type Definitions (module-specific, not exported to utils.types)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class SummaryObject(TypedDict):
    """Weaviate Summary object structure for section summaries.

    Defines the payload inserted into the Summary collection by
    ingest_summaries(). Summaries are vectorized and can be searched
    semantically.

    Attributes:
        sectionPath: Full hierarchical path (e.g., "Livre I > Chapitre 2").
        title: Section title.
        level: Hierarchy level (1 = top level, 2 = subsection, etc.).
        text: Summary text content (vectorized for search); falls back to
            the section title when no summary text is available.
        concepts: List of key concepts extracted from the section.
        chunksCount: Number of chunks in this section (always 0 at
            ingestion time; see ingest_summaries).
        document: Nested object with the parent document reference (sourceId).
    """

    sectionPath: str
    title: str
    level: int
    text: str
    concepts: List[str]
    chunksCount: int
    document: Dict[str, str]
|
||||
|
||||
|
||||
class ChunkObject(TypedDict, total=False):
    """Weaviate Chunk object structure for text chunks.

    Defines the payload inserted into the Chunk collection by
    ingest_document(). The text and keywords fields are vectorized for
    semantic search.

    Attributes:
        text: Chunk text content (vectorized for search).
        sectionPath: Full hierarchical path (e.g., "Livre I > Chapitre 2").
        sectionLevel: Hierarchy level (1 = top level).
        chapterTitle: Title of the containing chapter.
        canonicalReference: Canonical academic reference (e.g., "CP 1.628",
            "Ménon 80a").
        unitType: Type of argumentative unit (main_content, exposition, etc.).
        keywords: List of keywords/concepts (vectorized for search).
        language: Language code (e.g., "fr", "en").
        orderIndex: Position in document for ordering.
        work: Nested object with work metadata (title, author).
        document: Nested object with document reference (sourceId, edition).

    Note:
        Uses total=False because some fields are optional during creation.
    """

    text: str
    sectionPath: str
    sectionLevel: int
    chapterTitle: str
    canonicalReference: str
    unitType: str
    keywords: List[str]
    language: str
    orderIndex: int
    work: Dict[str, str]
    document: Dict[str, str]
|
||||
|
||||
|
||||
class InsertedChunkSummary(TypedDict):
    """Summary of an inserted chunk for display purposes.

    Provides a preview of inserted chunks (the first 10 are collected by
    ingest_document()), useful for displaying ingestion results to users.

    Attributes:
        chunk_id: Generated chunk identifier (e.g., "chunk_00003").
        sectionPath: Hierarchical path of the chunk.
        work: Title of the work.
        author: Author name.
        text_preview: First 150 characters of chunk text (ellipsized).
        unitType: Type of argumentative unit.
    """

    chunk_id: str
    sectionPath: str
    work: str
    author: str
    text_preview: str
    unitType: str
|
||||
|
||||
|
||||
# Note: IngestResult is imported from utils.types as WeaviateIngestResult
|
||||
|
||||
|
||||
class DeleteResult(TypedDict, total=False):
    """Result from a document deletion operation.

    Contains the outcome of delete_document_chunks(), including counts of
    deleted objects from each collection. Uses total=False since the error
    key is only present on failure.

    Attributes:
        success: Whether deletion succeeded.
        error: Error message if deletion failed.
        deleted_chunks: Number of chunks deleted from the Chunk collection.
        deleted_summaries: Number of summaries deleted from the Summary
            collection.
        deleted_document: Whether the Document object was deleted.

    Example:
        >>> result = delete_document_chunks("platon_republique")
        >>> print(f"Deleted {result['deleted_chunks']} chunks")
    """

    success: bool
    error: str
    deleted_chunks: int
    deleted_summaries: int
    deleted_document: bool
|
||||
|
||||
|
||||
class DocumentStats(TypedDict, total=False):
    """Document statistics retrieved from Weaviate.

    Contains statistics about a document stored in Weaviate, obtained by
    querying the Chunk collection. Uses total=False since the error key is
    only present on failure.

    Attributes:
        success: Whether stats retrieval succeeded.
        error: Error message if retrieval failed.
        sourceId: Document identifier.
        chunks_count: Total number of chunks for this document.
        work: Title of the work (from the first chunk).
        author: Author name (from the first chunk).

    Example:
        >>> stats = get_document_stats("platon_republique")
        >>> print(f"Document has {stats['chunks_count']} chunks")
    """

    success: bool
    error: str
    sourceId: str
    chunks_count: int
    work: Optional[str]
    author: Optional[str]
|
||||
|
||||
|
||||
# Logger
|
||||
logger: logging.Logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@contextmanager
def get_weaviate_client() -> Generator[Optional[WeaviateClient], None, None]:
    """Yield a local Weaviate client, cleaning up the connection on exit.

    Opens a connection to the local Weaviate instance (HTTP 8080, gRPC
    50051) with generous operation timeouts. Connection failures are not
    raised: the context manager yields ``None`` instead, so callers must
    check the yielded value before using it.

    Yields:
        A connected ``WeaviateClient``, or ``None`` when the connection
        could not be established.

    Example:
        >>> with get_weaviate_client() as client:
        ...     if client is not None:
        ...         chunks = client.collections.get("Chunk")
        ...     else:
        ...         print("Connection failed")

    Note:
        Weaviate must already be running (docker-compose up -d).
    """
    connection: Optional[WeaviateClient] = None
    try:
        # Long operation timeouts because vectorizing very large texts
        # (e.g., Peirce CP 3.403, CP 8.388, Menon chunk 10) can far exceed
        # the default 60 s.
        from weaviate.classes.init import AdditionalConfig, Timeout

        extended_timeouts = AdditionalConfig(
            timeout=Timeout(init=30, query=600, insert=600)  # 10 min for insert/query
        )
        connection = weaviate.connect_to_local(
            host="localhost",
            port=8080,
            grpc_port=50051,
            additional_config=extended_timeouts,
        )
        yield connection
    except Exception as e:
        logger.error(f"Erreur connexion Weaviate: {e}")
        yield None
    finally:
        if connection:
            connection.close()
|
||||
|
||||
|
||||
def ingest_document_metadata(
    client: WeaviateClient,
    doc_name: str,
    metadata: Dict[str, Any],
    toc: List[Dict[str, Any]],
    hierarchy: Dict[str, Any],
    chunks_count: int,
    pages: int,
) -> Optional[str]:
    """Insert document-level metadata into the Document collection.

    Stores a single Document object describing a processed document: its
    identity (sourceId, title, author), serialized TOC and hierarchy, and
    basic statistics (page and chunk counts).

    Args:
        client: Active Weaviate client connection.
        doc_name: Unique document identifier (sourceId).
        metadata: Extracted metadata dict with keys: title, author, language.
        toc: Table of contents as a hierarchical list of dicts.
        hierarchy: Complete document hierarchy structure.
        chunks_count: Total number of chunks in the document.
        pages: Number of pages in the source PDF.

    Returns:
        UUID string of the created Document object, or None if insertion
        failed (missing collection or insert error; both are logged).

    Example:
        >>> with get_weaviate_client() as client:
        ...     uuid = ingest_document_metadata(
        ...         client,
        ...         doc_name="platon_republique",
        ...         metadata={"title": "La Republique", "author": "Platon"},
        ...         toc=[{"title": "Livre I", "level": 1}],
        ...         hierarchy={},
        ...         chunks_count=150,
        ...         pages=300,
        ...     )

    Note:
        TOC and hierarchy are serialized to JSON strings for storage, and
        createdAt is set to the current UTC timestamp.
    """
    try:
        documents: Collection[Any, Any] = client.collections.get("Document")
    except Exception as e:
        logger.warning(f"Collection Document non trouvée: {e}")
        return None

    try:
        payload: Dict[str, Any] = {
            "sourceId": doc_name,
            "title": metadata.get("title") or doc_name,
            "author": metadata.get("author") or "Inconnu",
            "toc": json.dumps(toc, ensure_ascii=False) if toc else "[]",
            "hierarchy": json.dumps(hierarchy, ensure_ascii=False) if hierarchy else "{}",
            "pages": pages,
            "chunksCount": chunks_count,
            "language": metadata.get("language", "fr"),
            "createdAt": datetime.now(timezone.utc).isoformat(),
        }

        new_uuid = documents.data.insert(payload)
        logger.info(f"Document metadata ingéré: {doc_name}")
        return str(new_uuid)

    except Exception as e:
        logger.warning(f"Erreur ingestion document metadata: {e}")
        return None
|
||||
|
||||
|
||||
def ingest_summaries(
    client: WeaviateClient,
    doc_name: str,
    toc: List[Dict[str, Any]],
    summaries_content: Dict[str, str],
) -> int:
    """Insert section summaries into the Summary collection.

    Walks the table of contents depth-first and creates one Summary object
    per TOC entry. Summaries are vectorized and can be searched
    semantically.

    Args:
        client: Active Weaviate client connection.
        doc_name: Document identifier for linking summaries.
        toc: Hierarchical table of contents list.
        summaries_content: Mapping of section titles to summary text.
            When a title is missing from this dict, the title itself is
            used as the summary text.

    Returns:
        Number of summaries successfully inserted.

    Example:
        >>> with get_weaviate_client() as client:
        ...     count = ingest_summaries(
        ...         client,
        ...         doc_name="platon_republique",
        ...         toc=[{"title": "Livre I", "level": 1}],
        ...         summaries_content={"Livre I": "Discussion sur la justice..."},
        ...     )
        ...     print(f"Inserted {count} summaries")

    Note:
        Insertion is batched (50 objects per insert_many() call) to avoid
        timeouts; a failing batch is logged and skipped, not retried.
        Nested TOC entries (children) are processed recursively in
        document order.
    """
    try:
        summary_collection: Collection[Any, Any] = client.collections.get("Summary")
    except Exception as e:
        logger.warning(f"Collection Summary non trouvée: {e}")
        return 0

    # Flatten the TOC depth-first into Summary payloads. An explicit stack
    # of (entry, parent_path) pairs replaces recursion; children are pushed
    # in reverse so they pop in original document order.
    summaries_to_insert: List[SummaryObject] = []
    pending: List[tuple] = [(item, "") for item in reversed(toc)]
    while pending:
        item, parent_path = pending.pop()
        title: str = item.get("title", "")
        path: str = f"{parent_path} > {title}" if parent_path else title

        summaries_to_insert.append(
            {
                "sectionPath": path,
                "title": title,
                "level": item.get("level", 1),
                "text": summaries_content.get(title, title),
                "concepts": item.get("concepts", []),
                "chunksCount": 0,
                "document": {
                    "sourceId": doc_name,
                },
            }
        )

        if "children" in item:
            for child in reversed(item["children"]):
                pending.append((child, path))

    if not summaries_to_insert:
        return 0

    # Insert in small batches to avoid timeouts.
    BATCH_SIZE = 50
    total_inserted = 0

    try:
        logger.info(f"Ingesting {len(summaries_to_insert)} summaries in batches of {BATCH_SIZE}...")

        for start in range(0, len(summaries_to_insert), BATCH_SIZE):
            batch = summaries_to_insert[start:start + BATCH_SIZE]

            try:
                summary_collection.data.insert_many(batch)
                total_inserted += len(batch)
                logger.info(f" Batch {start//BATCH_SIZE + 1}: Inserted {len(batch)} summaries ({total_inserted}/{len(summaries_to_insert)})")
            except Exception as batch_error:
                logger.warning(f" Batch {start//BATCH_SIZE + 1} failed: {batch_error}")
                continue

        logger.info(f"{total_inserted} résumés ingérés pour {doc_name}")
        return total_inserted
    except Exception as e:
        logger.warning(f"Erreur ingestion résumés: {e}")
        return 0
|
||||
|
||||
|
||||
def ingest_document(
    doc_name: str,
    chunks: List[Dict[str, Any]],
    metadata: Dict[str, Any],
    language: str = "fr",
    toc: Optional[List[Dict[str, Any]]] = None,
    hierarchy: Optional[Dict[str, Any]] = None,
    pages: int = 0,
    ingest_document_collection: bool = True,
    ingest_summary_collection: bool = False,
) -> IngestResult:
    """Ingest document chunks into Weaviate with nested objects.

    Main ingestion function that inserts chunks into the Chunk collection
    with nested Work and Document references. Optionally also creates
    entries in the Document and Summary collections.

    This function uses batch insertion for optimal performance and
    constructs proper nested objects for filtering capabilities.

    Args:
        doc_name: Unique document identifier (used as sourceId).
        chunks: List of chunk dicts, each containing at minimum:
            - text: The chunk text content
            - section (optional): Section path string
            - hierarchy (optional): Dict with part/chapter/section
            - type (optional): Argumentative unit type
            - concepts/keywords (optional): List of keywords
        metadata: Document metadata dict with keys:
            - title: Work title
            - author: Author name
            - edition (optional): Edition identifier
        language: ISO language code. Defaults to "fr".
        toc: Optional table of contents for Document/Summary collections.
        hierarchy: Optional complete document hierarchy structure.
        pages: Number of pages in source document. Defaults to 0.
        ingest_document_collection: If True, also insert into Document
            collection. Defaults to True.
        ingest_summary_collection: If True, also insert into Summary
            collection (requires toc). Defaults to False.

    Returns:
        IngestResult dict containing:
        - success: True if ingestion succeeded
        - count: Number of chunks inserted
        - inserted: Preview of first 10 inserted chunks
        - work: Work title
        - author: Author name
        - document_uuid: UUID of Document object (if created)
        - all_objects: Complete list of inserted ChunkObjects
        - error: Error message (if failed)

    Raises:
        No exceptions are raised; errors are returned in the result dict.

    Example:
        >>> result = ingest_document(
        ...     doc_name="platon_republique",
        ...     chunks=[{"text": "La justice est...", "section": "Livre I"}],
        ...     metadata={"title": "La Republique", "author": "Platon"},
        ...     language="fr",
        ...     pages=450,
        ... )
        >>> if result["success"]:
        ...     print(f"Ingested {result['count']} chunks")

    Note:
        Empty chunks (no text or whitespace-only) are automatically skipped.
        The function logs progress and errors using the module logger.
    """
    try:
        with get_weaviate_client() as client:
            if client is None:
                return IngestResult(
                    success=False,
                    error="Connexion Weaviate impossible",
                    inserted=[],
                )

            # Fetch the Chunk collection; bail out early if the schema is missing.
            try:
                chunk_collection: Collection[Any, Any] = client.collections.get("Chunk")
            except Exception as e:
                return IngestResult(
                    success=False,
                    error=f"Collection Chunk non trouvée: {e}",
                    inserted=[],
                )

            # Insert the document metadata (optional).
            doc_uuid: Optional[str] = None
            if ingest_document_collection:
                doc_uuid = ingest_document_metadata(
                    client, doc_name, metadata, toc or [], hierarchy or {},
                    len(chunks), pages
                )

            # Insert the summaries (optional).
            if ingest_summary_collection and toc:
                ingest_summaries(client, doc_name, toc, {})

            # NEW: enrich chunks with TOC metadata when both TOC and
            # hierarchy are available.
            if toc and hierarchy:
                logger.info(f"Enriching {len(chunks)} chunks with TOC metadata...")
                chunks = enrich_chunks_with_toc(chunks, toc, hierarchy)
            else:
                logger.info("No TOC/hierarchy available, using basic metadata")

            # Prepare the Chunk objects to insert, with nested objects.
            objects_to_insert: List[ChunkObject] = []

            title: str = metadata.get("title") or metadata.get("work") or doc_name
            author: str = metadata.get("author") or "Inconnu"
            edition: str = metadata.get("edition", "")

            for idx, chunk in enumerate(chunks):
                # Extract the chunk text; skip empty/whitespace-only chunks.
                text: str = chunk.get("text", "")
                if not text or not text.strip():
                    continue

                # Use the enriched sectionPath when available, otherwise
                # fall back to the legacy resolution logic.
                section_path: str = chunk.get("sectionPath", "")
                if not section_path:
                    section_path = chunk.get("section", "")
                    if not section_path:
                        chunk_hierarchy: Dict[str, Any] = chunk.get("hierarchy", {})
                        section_parts: List[str] = []
                        if chunk_hierarchy.get("part"):
                            section_parts.append(chunk_hierarchy["part"])
                        if chunk_hierarchy.get("chapter"):
                            section_parts.append(chunk_hierarchy["chapter"])
                        if chunk_hierarchy.get("section"):
                            section_parts.append(chunk_hierarchy["section"])
                        section_path = " > ".join(section_parts) if section_parts else chunk.get("title", f"Section {idx}")

                # Use the enriched chapterTitle when available.
                chapter_title: str = chunk.get("chapterTitle", chunk.get("chapter_title", ""))

                # Use the enriched canonicalReference when available.
                canonical_ref: str = chunk.get("canonicalReference", "")

                # Build the Chunk object with nested work/document objects.
                chunk_obj: ChunkObject = {
                    "text": text,
                    "sectionPath": section_path,
                    "sectionLevel": chunk.get("section_level", chunk.get("level", 1)),
                    "chapterTitle": chapter_title,
                    "canonicalReference": canonical_ref,
                    "unitType": chunk.get("type", "main_content"),
                    "keywords": chunk.get("concepts", chunk.get("keywords", [])),
                    "language": language,
                    "orderIndex": idx,
                    "work": {
                        "title": title,
                        "author": author,
                    },
                    "document": {
                        "sourceId": doc_name,
                        "edition": edition,
                    },
                }

                objects_to_insert.append(chunk_obj)

            if not objects_to_insert:
                return IngestResult(
                    success=True,
                    message="Aucun chunk à insérer",
                    inserted=[],
                    count=0,
                )

            # Insert the objects in small batches to avoid timeouts.
            BATCH_SIZE = 50  # Process 50 chunks at a time
            total_inserted = 0

            logger.info(f"Ingesting {len(objects_to_insert)} chunks in batches of {BATCH_SIZE}...")

            for batch_start in range(0, len(objects_to_insert), BATCH_SIZE):
                batch_end = min(batch_start + BATCH_SIZE, len(objects_to_insert))
                batch = objects_to_insert[batch_start:batch_end]

                try:
                    _response = chunk_collection.data.insert_many(objects=batch)
                    total_inserted += len(batch)
                    logger.info(f" Batch {batch_start//BATCH_SIZE + 1}: Inserted {len(batch)} chunks ({total_inserted}/{len(objects_to_insert)})")
                except Exception as batch_error:
                    logger.error(f" Batch {batch_start//BATCH_SIZE + 1} failed: {batch_error}")
                    # Continue with next batch instead of failing completely
                    continue

            # Build a preview of the first inserted objects for display.
            inserted_summary: List[InsertedChunkSummary] = []
            for i, obj in enumerate(objects_to_insert[:10]):
                text_content: str = obj.get("text", "")
                work_obj: Dict[str, str] = obj.get("work", {})
                inserted_summary.append(InsertedChunkSummary(
                    chunk_id=f"chunk_{i:05d}",
                    sectionPath=obj.get("sectionPath", ""),
                    work=work_obj.get("title", ""),
                    author=work_obj.get("author", ""),
                    text_preview=text_content[:150] + "..." if len(text_content) > 150 else text_content,
                    unitType=obj.get("unitType", ""),
                ))

            logger.info(f"Ingestion réussie: {total_inserted} chunks insérés pour {doc_name}")

            return IngestResult(
                success=True,
                count=total_inserted,
                inserted=inserted_summary,
                work=title,
                author=author,
                document_uuid=doc_uuid,
                all_objects=objects_to_insert,
            )

    except Exception as e:
        logger.error(f"Erreur ingestion: {e}")
        return IngestResult(
            success=False,
            error=str(e),
            inserted=[],
        )
|
||||
|
||||
|
||||
def delete_document_chunks(doc_name: str) -> DeleteResult:
    """Remove every Weaviate object belonging to one document.

    Purges the document's chunks and summaries (matched through the
    nested ``document.sourceId`` property) and then the Document record
    itself (matched on ``sourceId``). Each collection is purged
    independently: a failure on one is logged as a warning and the
    others are still attempted, so a partially-indexed document can
    always be cleaned up before re-ingestion.

    Args:
        doc_name: Document identifier (sourceId) whose data is removed.

    Returns:
        DeleteResult dict with:
            - success: True when the overall operation completed
              (even if zero objects matched)
            - deleted_chunks: number of Chunk objects removed
            - deleted_summaries: number of Summary objects removed
            - deleted_document: True when the Document object was removed
            - error: message when the whole operation failed

    Example:
        >>> result = delete_document_chunks("platon_republique")
        >>> if result["success"]:
        ...     print(f"Deleted {result['deleted_chunks']} chunks")
        ...     # Now safe to re-ingest
        ...     ingest_document("platon_republique", new_chunks, metadata)

    Note:
        Relies on ``delete_many()`` with filters over nested object
        properties; per-collection errors are logged, not raised.
    """
    try:
        with get_weaviate_client() as client:
            if client is None:
                return DeleteResult(success=False, error="Connexion Weaviate impossible")

            def _purge(collection_name: str, where: Any, label: str) -> int:
                """delete_many on one collection; warn and return 0 on failure."""
                try:
                    coll: Collection[Any, Any] = client.collections.get(collection_name)
                    outcome = coll.data.delete_many(where=where)
                    return outcome.successful
                except Exception as e:
                    logger.warning(f"Erreur suppression {label}: {e}")
                    return 0

            # Chunks and summaries both hang off the nested document reference.
            nested_filter = wvq.Filter.by_property("document.sourceId").equal(doc_name)
            n_chunks: int = _purge("Chunk", nested_filter, "chunks")
            n_summaries: int = _purge("Summary", nested_filter, "summaries")

            # The Document record is matched on its own top-level sourceId.
            doc_removed: bool = _purge(
                "Document",
                wvq.Filter.by_property("sourceId").equal(doc_name),
                "document",
            ) > 0

            logger.info(f"Suppression: {n_chunks} chunks, {n_summaries} summaries pour {doc_name}")

            return DeleteResult(
                success=True,
                deleted_chunks=n_chunks,
                deleted_summaries=n_summaries,
                deleted_document=doc_removed,
            )

    except Exception as e:
        logger.error(f"Erreur suppression: {e}")
        return DeleteResult(success=False, error=str(e))
|
||||
|
||||
|
||||
def get_document_stats(doc_name: str) -> DocumentStats:
    """Look up chunk count and work metadata for one document.

    Fetches the Chunk objects whose nested ``document.sourceId`` matches
    *doc_name* and reports how many were found, along with the work
    title and author taken from the first chunk's ``work`` object when
    it is present and dict-shaped.

    Args:
        doc_name: Document identifier (sourceId) to query.

    Returns:
        DocumentStats dict with:
            - success: True when the query completed
            - sourceId: the queried document identifier
            - chunks_count: number of chunks found
            - work: work title from the first chunk, or None
            - author: author name from the first chunk, or None
            - error: message when the query failed

    Example:
        >>> stats = get_document_stats("platon_republique")
        >>> if stats["success"]:
        ...     print(f"Document: {stats['work']} by {stats['author']}")
        ...     print(f"Chunks: {stats['chunks_count']}")

    Note:
        The fetch is capped at 1000 objects, so chunks_count saturates
        there; use Weaviate aggregate queries for larger documents.
    """
    try:
        with get_weaviate_client() as client:
            if client is None:
                return DocumentStats(success=False, error="Connexion Weaviate impossible")

            # Fetch the matching chunks (nested document.sourceId filter).
            collection: Collection[Any, Any] = client.collections.get("Chunk")
            response = collection.query.fetch_objects(
                filters=wvq.Filter.by_property("document.sourceId").equal(doc_name),
                limit=1000,
            )
            found = response.objects

            # Work metadata comes from the first chunk, when any exist
            # and the nested "work" property really is a dict.
            title: Optional[str] = None
            writer: Optional[str] = None
            if found:
                work_obj: Any = found[0].properties.get("work", {})
                if isinstance(work_obj, dict):
                    title = work_obj.get("title")
                    writer = work_obj.get("author")

            return DocumentStats(
                success=True,
                sourceId=doc_name,
                chunks_count=len(found),
                work=title,
                author=writer,
            )

    except Exception as e:
        logger.error(f"Erreur stats document: {e}")
        return DocumentStats(success=False, error=str(e))
|
||||
Reference in New Issue
Block a user