"""Text cleaning and validation for OCR-extracted content.
|
|
|
|
This module provides utilities for cleaning OCR artifacts from extracted text,
|
|
validating chunk content, and optionally using LLM for intelligent corrections.
|
|
It handles common OCR issues like page markers, isolated page numbers,
|
|
repeated headers/footers, and character recognition errors.
|
|
|
|
Overview:
|
|
The module offers three levels of cleaning:
|
|
|
|
1. **Basic cleaning** (clean_page_markers, clean_ocr_artifacts):
|
|
Fast regex-based cleaning for common issues. Always applied.
|
|
|
|
2. **LLM-enhanced cleaning** (clean_content_with_llm):
|
|
Uses an LLM to correct subtle OCR errors while preserving meaning.
|
|
Only applied when explicitly requested and for medium-length texts.
|
|
|
|
3. **Validation** (is_chunk_valid):
|
|
Checks if a text chunk contains meaningful content.
|
|
|
|
Cleaning Operations:
|
|
- Remove page markers (<!-- Page X -->)
|
|
- Remove isolated page numbers
|
|
- Remove short/repetitive header/footer lines
|
|
- Normalize multiple spaces and blank lines
|
|
- Correct obvious OCR character errors (LLM mode)
|
|
- Preserve citations, technical vocabulary, paragraph structure
|
|
|
|
Validation Criteria:
|
|
- Minimum character count (default: 20)
|
|
- Minimum word count (default: 5)
|
|
- Not pure metadata (URLs, ISBNs, DOIs, copyright notices)
|
|
|
|
LLM Provider Support:
|
|
- ollama: Local LLM (free, slower, default)
|
|
- mistral: Mistral API (faster, requires API key)
|
|
|
|
Example:
|
|
>>> from utils.llm_cleaner import clean_chunk, is_chunk_valid
|
|
>>>
|
|
>>> # Clean a chunk with basic cleaning only
|
|
>>> text = "<!-- Page 42 --> Some philosophical content..."
|
|
>>> cleaned = clean_chunk(text)
|
|
>>> print(cleaned)
|
|
'Some philosophical content...'
|
|
>>>
|
|
>>> # Validate chunk before processing
|
|
>>> if is_chunk_valid(cleaned):
|
|
... process_chunk(cleaned)
|
|
|
|
See Also:
|
|
utils.llm_chunker: Semantic chunking of sections
|
|
utils.llm_validator: Document validation and concept extraction
|
|
utils.pdf_pipeline: Main pipeline orchestration
|
|
"""

from __future__ import annotations

import logging
import re
from typing import List, Optional, Pattern

from .llm_structurer import call_llm, _get_default_model, _get_default_mistral_model
from .types import LLMProvider

logger: logging.Logger = logging.getLogger(__name__)

# Type alias for compiled regex patterns
RegexPattern = Pattern[str]


def clean_page_markers(text: str) -> str:
    r"""Remove page markers and normalize blank lines from text.

    Page markers are HTML comments inserted during OCR processing to track
    page boundaries. This function removes them along with excessive blank
    lines that may result from the removal.

    Args:
        text: Text content potentially containing page markers like
            '<!-- Page 42 -->' and multiple consecutive newlines.

    Returns:
        Cleaned text with page markers removed and no more than two
        consecutive newlines. Text is stripped of leading/trailing whitespace.

    Example:
        >>> text = "<!-- Page 1 -->\nContent here\n\n\n\n<!-- Page 2 -->"
        >>> clean_page_markers(text)
        'Content here'
    """
    # Remove <!-- Page X --> markers
    text = re.sub(r'<!--\s*Page\s*\d+\s*-->', '', text)

    # Collapse runs of three or more newlines into a single blank line
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()


def clean_ocr_artifacts(text: str) -> str:
    r"""Remove common OCR artifacts without using an LLM.

    This function performs fast, rule-based cleaning of typical OCR issues:
    - Isolated page numbers (1-4 digits on their own line)
    - Very short lines likely to be headers/footers (<= 3 chars)
    - Multiple consecutive spaces
    - Excessive blank lines (> 2)

    Lines starting with '#' (markdown headers) are preserved regardless
    of length. Empty lines are preserved (single blank lines only).

    Args:
        text: Raw OCR-extracted text potentially containing artifacts
            like isolated page numbers, repeated headers, and irregular spacing.

    Returns:
        Cleaned text with artifacts removed and spacing normalized.
        Leading/trailing whitespace is stripped.

    Example:
        >>> text = "42\n\nActual content here\n\n\n\n\nMore text"
        >>> clean_ocr_artifacts(text)
        'Actual content here\n\nMore text'

    Note:
        This function is always called as part of clean_chunk() and provides
        a baseline level of cleaning even when LLM cleaning is disabled.
    """
    # Remove isolated page numbers
    text = re.sub(r'^\d{1,4}\s*$', '', text, flags=re.MULTILINE)

    # Remove repeated headers/footers (very short isolated lines)
    lines: List[str] = text.split('\n')
    cleaned_lines: List[str] = []
    for line in lines:
        # Keep non-empty, meaningful lines
        stripped: str = line.strip()
        if stripped and (len(stripped) > 3 or stripped.startswith('#')):
            cleaned_lines.append(line)
        elif not stripped:
            cleaned_lines.append('')  # Preserve single blank lines

    text = '\n'.join(cleaned_lines)

    # Normalize runs of spaces
    text = re.sub(r' {2,}', ' ', text)

    # Collapse runs of blank lines
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()


def clean_content_with_llm(
    text: str,
    context: Optional[str] = None,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.1,
) -> str:
    """Clean text content using an LLM for intelligent OCR error correction.

    Uses a language model to correct subtle OCR errors that rule-based
    cleaning cannot handle, such as misrecognized characters in context.
    The LLM is instructed to preserve the intellectual content exactly
    while fixing obvious technical errors.

    The function includes safeguards:
    - Texts < 50 chars: only basic cleaning (LLM skipped)
    - Texts > 3000 chars: only basic cleaning (timeout risk)
    - If the LLM changes the text by more than 50%: fallback to basic cleaning

    Args:
        text: Text content to clean. Should be between 50 and 3000 characters
            for LLM processing.
        context: Optional context about the document (title, subject) to
            help the LLM make better corrections. Example: "Heidegger's
            Being and Time, Chapter 2".
        model: LLM model name. If None, uses the provider default
            (qwen2.5:7b for ollama, mistral-small-latest for mistral).
        provider: LLM provider to use. Options: "ollama" (local, free)
            or "mistral" (API, faster).
        temperature: LLM temperature for response generation. Lower values
            (0.1) produce more deterministic corrections. Defaults to 0.1.

    Returns:
        Cleaned text with OCR errors corrected. If the LLM fails or produces
        suspicious output (too short/long), returns basic-cleaned text.

    Raises:
        No exceptions are raised; all errors are caught and handled with a
        fallback to basic cleaning.

    Example:
        >>> text = "Heidegger's concept of Dase1n is central..."  # '1' should be 'i'
        >>> clean_content_with_llm(text, context="Being and Time")
        "Heidegger's concept of Dasein is central..."

    Note:
        The LLM is explicitly instructed NOT to:
        - Modify meaning or intellectual content
        - Rephrase or summarize
        - Add any new content
        - Alter citations or technical vocabulary
    """
    if model is None:
        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()

    # Skip LLM processing for very short texts
    if len(text.strip()) < 50:
        return clean_page_markers(text)

    # Cap input size to avoid timeouts
    max_chars: int = 3000
    if len(text) > max_chars:
        # For long texts, clean without the LLM
        return clean_page_markers(clean_ocr_artifacts(text))

    context_info: str = f"Context: {context}\n" if context else ""

    prompt: str = f"""You are an expert at correcting OCR-extracted text.

TASK: Clean this text extracted by OCR.

{context_info}
ACTIONS TO PERFORM:
1. Remove page markers (<!-- Page X -->)
2. Correct OBVIOUS OCR errors (misrecognized characters)
3. Remove artifacts (isolated page numbers, repeated headers)
4. Normalize punctuation and spacing

STRICT RULES:
- Do NOT alter the meaning or intellectual content
- Do NOT rephrase or summarize
- Do NOT add content
- Preserve citations and technical vocabulary
- Keep the paragraph structure

TEXT TO CLEAN:
{text}

RESPOND ONLY with the cleaned text, without comments or markup."""

    try:
        response: str = call_llm(
            prompt, model=model, provider=provider, temperature=temperature, timeout=120
        )

        # Check that the response is valid
        cleaned: str = response.strip()

        # If the response diverges too much (the LLM over-edited), keep the
        # original with basic cleaning only
        if len(cleaned) < len(text) * 0.5 or len(cleaned) > len(text) * 1.5:
            logger.warning("LLM altered the text too much, falling back to basic cleaning")
            return clean_page_markers(clean_ocr_artifacts(text))

        return cleaned

    except Exception as e:
        logger.warning(f"LLM cleaning error: {e}, falling back to basic cleaning")
        return clean_page_markers(clean_ocr_artifacts(text))


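# The module docstring notes that "mistral" is faster but requires an API key,
# while "ollama" runs locally for free. Below is a minimal sketch of
# environment-driven provider selection: switch to the Mistral API when a key
# is configured, stay on the local default otherwise. The helper name and the
# MISTRAL_API_KEY variable are illustrative assumptions; this module itself
# never reads the environment, and nothing in the pipeline calls this helper.
def _example_pick_provider() -> LLMProvider:
    """Illustrative sketch only: choose the LLM provider from the environment."""
    import os  # local import keeps this illustrative helper self-contained

    return "mistral" if os.environ.get("MISTRAL_API_KEY") else "ollama"

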
def clean_chunk(
    chunk_text: str,
    use_llm: bool = False,
    context: Optional[str] = None,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
) -> str:
    r"""Clean a text chunk with optional LLM enhancement.

    This is the main entry point for chunk cleaning. It always applies
    basic cleaning (page markers, OCR artifacts) and optionally uses an
    LLM for more intelligent error correction.

    Cleaning pipeline:
    1. Remove page markers (always)
    2. Remove OCR artifacts (always)
    3. LLM correction (if use_llm=True and text >= 50 chars)

    Args:
        chunk_text: Raw text content of the chunk to clean.
        use_llm: Whether to use the LLM for enhanced cleaning. Defaults to
            False. Set to True for higher quality but slower processing.
        context: Optional document context (title, chapter) passed to the
            LLM for better corrections. Ignored if use_llm=False.
        model: LLM model name. If None, uses the provider default.
            Ignored if use_llm=False.
        provider: LLM provider ("ollama" or "mistral"). Defaults to
            "ollama". Ignored if use_llm=False.

    Returns:
        Cleaned chunk text ready for indexing or further processing.

    Example:
        >>> # Basic cleaning only (fast)
        >>> chunk = "<!-- Page 5 -->\n42\n\nThe concept of being..."
        >>> clean_chunk(chunk)
        'The concept of being...'
        >>>
        >>> # With LLM enhancement (slower, higher quality)
        >>> clean_chunk(chunk, use_llm=True, context="Heidegger analysis")
        'The concept of being...'

    See Also:
        is_chunk_valid: Validate cleaned chunks before processing
        clean_page_markers: Basic page marker removal
        clean_ocr_artifacts: Basic artifact removal
    """
    # Basic cleaning, always applied
    text: str = clean_page_markers(chunk_text)
    text = clean_ocr_artifacts(text)

    # Optional LLM cleaning
    if use_llm and len(text) >= 50:
        text = clean_content_with_llm(text, context=context, model=model, provider=provider)

    return text


def is_chunk_valid(chunk_text: str, min_chars: int = 20, min_words: int = 5) -> bool:
    """Check if a text chunk contains meaningful content.

    Validates that a chunk has sufficient length and is not purely
    metadata or boilerplate content. Used to filter out non-content
    chunks before indexing.

    Validation criteria:
    1. Character count >= min_chars (after page marker removal)
    2. Word count >= min_words
    3. Not matching metadata patterns (URLs, ISBNs, DOIs, dates, copyright)

    Args:
        chunk_text: Text content of the chunk to validate. Page markers
            are removed before validation.
        min_chars: Minimum number of characters required. Defaults to 20.
            Chunks shorter than this are considered invalid.
        min_words: Minimum number of words required. Defaults to 5.
            Chunks with fewer words are considered invalid.

    Returns:
        True if the chunk passes all validation criteria and contains
        meaningful content suitable for indexing. False otherwise.

    Example:
        >>> is_chunk_valid("The concept of Dasein is central to Heidegger.")
        True
        >>> is_chunk_valid("42")  # Too short
        False
        >>> is_chunk_valid("ISBN 978-0-123456-78-9")  # Metadata
        False
        >>> is_chunk_valid("https://example.com/page")  # URL
        False

    Note:
        Metadata patterns checked:
        - URLs (http://, https://)
        - Dates (YYYY-MM-DD format)
        - ISBN numbers
        - DOI identifiers
        - Copyright notices (©)
    """
    text: str = clean_page_markers(chunk_text).strip()

    # Check length
    if len(text) < min_chars:
        return False

    # Count words
    words: List[str] = text.split()
    if len(words) < min_words:
        return False

    # Check that the chunk is not just metadata
    metadata_patterns: List[str] = [
        r'^https?://',
        r'^\d{4}-\d{2}-\d{2}$',
        r'^ISBN',
        r'^DOI',
        r'^©',
    ]
    pattern: str
    for pattern in metadata_patterns:
        if re.match(pattern, text, re.IGNORECASE):
            return False

    return True
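

# A quick, self-contained smoke test of the regex-based cleaning and
# validation paths. It exercises only the non-LLM code, so no Ollama or
# Mistral backend is required; the sample strings below are made up for
# illustration.
if __name__ == "__main__":
    sample = "<!-- Page 5 -->\n42\n\nThe concept of being is examined here."
    cleaned = clean_chunk(sample)
    print(repr(cleaned))                             # 'The concept of being is examined here.'
    print(is_chunk_valid(cleaned))                   # True: meaningful content
    print(is_chunk_valid("ISBN 978-0-123456-78-9"))  # False: metadata / too few words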