"""Text cleaning and validation for OCR-extracted content. This module provides utilities for cleaning OCR artifacts from extracted text, validating chunk content, and optionally using LLM for intelligent corrections. It handles common OCR issues like page markers, isolated page numbers, repeated headers/footers, and character recognition errors. Overview: The module offers three levels of cleaning: 1. **Basic cleaning** (clean_page_markers, clean_ocr_artifacts): Fast regex-based cleaning for common issues. Always applied. 2. **LLM-enhanced cleaning** (clean_content_with_llm): Uses an LLM to correct subtle OCR errors while preserving meaning. Only applied when explicitly requested and for medium-length texts. 3. **Validation** (is_chunk_valid): Checks if a text chunk contains meaningful content. Cleaning Operations: - Remove page markers () - Remove isolated page numbers - Remove short/repetitive header/footer lines - Normalize multiple spaces and blank lines - Correct obvious OCR character errors (LLM mode) - Preserve citations, technical vocabulary, paragraph structure Validation Criteria: - Minimum character count (default: 20) - Minimum word count (default: 5) - Not pure metadata (URLs, ISBNs, DOIs, copyright notices) LLM Provider Support: - ollama: Local LLM (free, slower, default) - mistral: Mistral API (faster, requires API key) Example: >>> from utils.llm_cleaner import clean_chunk, is_chunk_valid >>> >>> # Clean a chunk with basic cleaning only >>> text = " Some philosophical content..." >>> cleaned = clean_chunk(text) >>> print(cleaned) 'Some philosophical content...' >>> >>> # Validate chunk before processing >>> if is_chunk_valid(cleaned): ... process_chunk(cleaned) See Also: utils.llm_chunker: Semantic chunking of sections utils.llm_validator: Document validation and concept extraction utils.pdf_pipeline: Main pipeline orchestration """ from __future__ import annotations import logging import re from typing import List, Optional, Pattern from .llm_structurer import call_llm, _get_default_model, _get_default_mistral_model from .types import LLMProvider logger: logging.Logger = logging.getLogger(__name__) # Type alias for compiled regex patterns RegexPattern = Pattern[str] def clean_page_markers(text: str) -> str: r"""Remove page markers and normalize blank lines from text. Page markers are HTML comments inserted during OCR processing to track page boundaries. This function removes them along with excessive blank lines that may result from the removal. Args: text: Text content potentially containing page markers like '' and multiple consecutive newlines. Returns: Cleaned text with page markers removed and no more than two consecutive newlines. Text is stripped of leading/trailing whitespace. Example: >>> text = "\nContent here\n\n\n\n" >>> clean_page_markers(text) 'Content here' """ # Supprimer les marqueurs text = re.sub(r'', '', text) # Supprimer les lignes vides multiples text = re.sub(r'\n{3,}', '\n\n', text) return text.strip() def clean_ocr_artifacts(text: str) -> str: r"""Remove common OCR artifacts without using LLM. This function performs fast, rule-based cleaning of typical OCR issues: - Isolated page numbers (1-4 digits on their own line) - Very short lines likely to be headers/footers (<=3 chars) - Multiple consecutive spaces - Excessive blank lines (>2) Lines starting with '#' (markdown headers) are preserved regardless of length. Empty lines are preserved (single blank lines only). Args: text: Raw OCR-extracted text potentially containing artifacts like isolated page numbers, repeated headers, and irregular spacing. Returns: Cleaned text with artifacts removed and spacing normalized. Leading/trailing whitespace is stripped. Example: >>> text = "42\n\nActual content here\n\n\n\n\nMore text" >>> clean_ocr_artifacts(text) 'Actual content here\n\nMore text' Note: This function is always called as part of clean_chunk() and provides a baseline level of cleaning even when LLM cleaning is disabled. """ # Supprimer les numéros de page isolés text = re.sub(r'^\d{1,4}\s*$', '', text, flags=re.MULTILINE) # Supprimer les en-têtes/pieds de page répétés (lignes très courtes isolées) lines: List[str] = text.split('\n') cleaned_lines: List[str] = [] for line in lines: # Garder les lignes non vides et significatives stripped: str = line.strip() if stripped and (len(stripped) > 3 or stripped.startswith('#')): cleaned_lines.append(line) elif not stripped: cleaned_lines.append('') # Préserver les lignes vides simples text = '\n'.join(cleaned_lines) # Normaliser les espaces text = re.sub(r' {2,}', ' ', text) # Supprimer les lignes vides multiples text = re.sub(r'\n{3,}', '\n\n', text) return text.strip() def clean_content_with_llm( text: str, context: Optional[str] = None, model: Optional[str] = None, provider: LLMProvider = "ollama", temperature: float = 0.1, ) -> str: """Clean text content using an LLM for intelligent OCR error correction. Uses a language model to correct subtle OCR errors that rule-based cleaning cannot handle, such as misrecognized characters in context. The LLM is instructed to preserve the intellectual content exactly while fixing obvious technical errors. The function includes safeguards: - Texts < 50 chars: Only basic cleaning (LLM skipped) - Texts > 3000 chars: Only basic cleaning (timeout risk) - If LLM changes text by >50%: Fallback to basic cleaning Args: text: Text content to clean. Should be between 50-3000 characters for LLM processing. context: Optional context about the document (title, subject) to help the LLM make better corrections. Example: "Heidegger's Being and Time, Chapter 2". model: LLM model name. If None, uses provider default (qwen2.5:7b for ollama, mistral-small-latest for mistral). provider: LLM provider to use. Options: "ollama" (local, free) or "mistral" (API, faster). temperature: LLM temperature for response generation. Lower values (0.1) produce more deterministic corrections. Defaults to 0.1. Returns: Cleaned text with OCR errors corrected. If LLM fails or produces suspicious output (too short/long), returns basic-cleaned text. Raises: No exceptions raised - all errors caught and handled with fallback. Example: >>> text = "Heidegger's concept of Dase1n is central..." # '1' should be 'i' >>> clean_content_with_llm(text, context="Being and Time") "Heidegger's concept of Dasein is central..." Note: The LLM is explicitly instructed NOT to: - Modify meaning or intellectual content - Rephrase or summarize - Add any new content - Alter citations or technical vocabulary """ if model is None: model = _get_default_mistral_model() if provider == "mistral" else _get_default_model() # Ne pas traiter les textes trop courts if len(text.strip()) < 50: return clean_page_markers(text) # Limiter la taille pour éviter les timeouts max_chars: int = 3000 if len(text) > max_chars: # Pour les longs textes, nettoyer sans LLM return clean_page_markers(clean_ocr_artifacts(text)) context_info: str = f"Contexte: {context}\n" if context else "" prompt: str = f"""Tu es un expert en correction de textes OCRisés. TÂCHE: Nettoyer ce texte extrait par OCR. {context_info} ACTIONS À EFFECTUER: 1. Supprimer les marqueurs de page () 2. Corriger les erreurs OCR ÉVIDENTES (caractères mal reconnus) 3. Supprimer les artefacts (numéros de page isolés, en-têtes répétés) 4. Normaliser la ponctuation et les espaces RÈGLES STRICTES: - NE PAS modifier le sens ou le contenu intellectuel - NE PAS reformuler ou résumer - NE PAS ajouter de contenu - Préserver les citations et le vocabulaire technique - Garder la structure des paragraphes TEXTE À NETTOYER: {text} RÉPONDS UNIQUEMENT avec le texte nettoyé, sans commentaires ni balises.""" try: response: str = call_llm( prompt, model=model, provider=provider, temperature=temperature, timeout=120 ) # Vérifier que la réponse est valide cleaned: str = response.strip() # Si la réponse est trop différente (LLM a trop modifié), garder l'original nettoyé basiquement if len(cleaned) < len(text) * 0.5 or len(cleaned) > len(text) * 1.5: logger.warning("LLM a trop modifié le texte, utilisation du nettoyage basique") return clean_page_markers(clean_ocr_artifacts(text)) return cleaned except Exception as e: logger.warning(f"Erreur nettoyage LLM: {e}, utilisation du nettoyage basique") return clean_page_markers(clean_ocr_artifacts(text)) def clean_chunk( chunk_text: str, use_llm: bool = False, context: Optional[str] = None, model: Optional[str] = None, provider: LLMProvider = "ollama", ) -> str: r"""Clean a text chunk with optional LLM enhancement. This is the main entry point for chunk cleaning. It always applies basic cleaning (page markers, OCR artifacts) and optionally uses LLM for more intelligent error correction. Cleaning pipeline: 1. Remove page markers (always) 2. Remove OCR artifacts (always) 3. LLM correction (if use_llm=True and text >= 50 chars) Args: chunk_text: Raw text content of the chunk to clean. use_llm: Whether to use LLM for enhanced cleaning. Defaults to False. Set to True for higher quality but slower processing. context: Optional document context (title, chapter) passed to LLM for better corrections. Ignored if use_llm=False. model: LLM model name. If None, uses provider default. Ignored if use_llm=False. provider: LLM provider ("ollama" or "mistral"). Defaults to "ollama". Ignored if use_llm=False. Returns: Cleaned chunk text ready for indexing or further processing. Example: >>> # Basic cleaning only (fast) >>> chunk = "\n42\n\nThe concept of being..." >>> clean_chunk(chunk) 'The concept of being...' >>> >>> # With LLM enhancement (slower, higher quality) >>> clean_chunk(chunk, use_llm=True, context="Heidegger analysis") 'The concept of being...' See Also: is_chunk_valid: Validate cleaned chunks before processing clean_page_markers: Basic page marker removal clean_ocr_artifacts: Basic artifact removal """ # Nettoyage de base toujours appliqué text: str = clean_page_markers(chunk_text) text = clean_ocr_artifacts(text) # Nettoyage LLM optionnel if use_llm and len(text) >= 50: text = clean_content_with_llm(text, context=context, model=model, provider=provider) return text def is_chunk_valid(chunk_text: str, min_chars: int = 20, min_words: int = 5) -> bool: """Check if a text chunk contains meaningful content. Validates that a chunk has sufficient length and is not purely metadata or boilerplate content. Used to filter out non-content chunks before indexing. Validation criteria: 1. Character count >= min_chars (after page marker removal) 2. Word count >= min_words 3. Not matching metadata patterns (URLs, ISBNs, DOIs, dates, copyright) Args: chunk_text: Text content of the chunk to validate. Page markers are removed before validation. min_chars: Minimum number of characters required. Defaults to 20. Chunks shorter than this are considered invalid. min_words: Minimum number of words required. Defaults to 5. Chunks with fewer words are considered invalid. Returns: True if the chunk passes all validation criteria and contains meaningful content suitable for indexing. False otherwise. Example: >>> is_chunk_valid("The concept of Dasein is central to Heidegger.") True >>> is_chunk_valid("42") # Too short False >>> is_chunk_valid("ISBN 978-0-123456-78-9") # Metadata False >>> is_chunk_valid("https://example.com/page") # URL False Note: Metadata patterns checked: - URLs (http://, https://) - Dates (YYYY-MM-DD format) - ISBN numbers - DOI identifiers - Copyright notices (©) """ text: str = clean_page_markers(chunk_text).strip() # Vérifier la longueur if len(text) < min_chars: return False # Compter les mots words: List[str] = text.split() if len(words) < min_words: return False # Vérifier que ce n'est pas juste des métadonnées metadata_patterns: List[str] = [ r'^https?://', r'^\d{4}-\d{2}-\d{2}$', r'^ISBN', r'^DOI', r'^©', ] pattern: str for pattern in metadata_patterns: if re.match(pattern, text, re.IGNORECASE): return False return True