Add Library RAG project and cleanup root directory

- Add complete Library RAG application (Flask + MCP server)
- PDF processing pipeline with OCR and LLM extraction
- Weaviate vector database integration (BGE-M3 embeddings)
- Flask web interface with search and document management
- MCP server for Claude Desktop integration
- Comprehensive test suite (134 tests)
- Clean up root directory
- Remove obsolete documentation files
- Remove backup and temporary files
- Update autonomous agent configuration
- Update prompts
- Enhance initializer bis prompt with better instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-30 11:57:12 +01:00
parent 48470236da
commit d2f7165120
84 changed files with 26517 additions and 2 deletions
--- a/generations/library_rag/utils/llm_cleaner.py
+++ b/generations/library_rag/utils/llm_cleaner.py
@@ -0,0 +1,389 @@
+"""Text cleaning and validation for OCR-extracted content.
+
+This module provides utilities for cleaning OCR artifacts from extracted text,
+validating chunk content, and optionally using LLM for intelligent corrections.
+It handles common OCR issues like page markers, isolated page numbers,
+repeated headers/footers, and character recognition errors.
+
+Overview:
+    The module offers three levels of cleaning:
+
+    1. **Basic cleaning** (clean_page_markers, clean_ocr_artifacts):
+       Fast regex-based cleaning for common issues. Always applied.
+
+    2. **LLM-enhanced cleaning** (clean_content_with_llm):
+       Uses an LLM to correct subtle OCR errors while preserving meaning.
+       Only applied when explicitly requested and for medium-length texts.
+
+    3. **Validation** (is_chunk_valid):
+       Checks if a text chunk contains meaningful content.
+
+Cleaning Operations:
+    - Remove page markers (<!-- Page X -->)
+    - Remove isolated page numbers
+    - Remove short/repetitive header/footer lines
+    - Normalize multiple spaces and blank lines
+    - Correct obvious OCR character errors (LLM mode)
+    - Preserve citations, technical vocabulary, paragraph structure
+
+Validation Criteria:
+    - Minimum character count (default: 20)
+    - Minimum word count (default: 5)
+    - Not pure metadata (URLs, ISBNs, DOIs, copyright notices)
+
+LLM Provider Support:
+    - ollama: Local LLM (free, slower, default)
+    - mistral: Mistral API (faster, requires API key)
+
+Example:
+    >>> from utils.llm_cleaner import clean_chunk, is_chunk_valid
+    >>> 
+    >>> # Clean a chunk with basic cleaning only
+    >>> text = "<!-- Page 42 --> Some philosophical content..."
+    >>> cleaned = clean_chunk(text)
+    >>> print(cleaned)
+    Some philosophical content...
+    >>> 
+    >>> # Validate chunk before processing
+    >>> if is_chunk_valid(cleaned):
+    ...     process_chunk(cleaned)
+
+See Also:
+    utils.llm_chunker: Semantic chunking of sections
+    utils.llm_validator: Document validation and concept extraction
+    utils.pdf_pipeline: Main pipeline orchestration
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+from typing import List, Optional, Pattern
+
+from .llm_structurer import call_llm, _get_default_model, _get_default_mistral_model
+from .types import LLMProvider
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+# Type alias for compiled regex patterns
+RegexPattern = Pattern[str]
+
+
+def clean_page_markers(text: str) -> str:
+    r"""Remove page markers and normalize blank lines from text.
+
+    Page markers are HTML comments inserted during OCR processing to track
+    page boundaries. This function removes them along with excessive blank
+    lines that may result from the removal.
+
+    Args:
+        text: Text content potentially containing page markers like
+            '<!-- Page 42 -->' and multiple consecutive newlines.
+
+    Returns:
+        Cleaned text with page markers removed and no more than two
+        consecutive newlines. Text is stripped of leading/trailing whitespace.
+
+    Example:
+        >>> text = "<!-- Page 1 -->\nContent here\n\n\n\n<!-- Page 2 -->"
+        >>> clean_page_markers(text)
+        'Content here'
+    """
+    # Supprimer les marqueurs <!-- Page X -->
+    text = re.sub(r'<!--\s*Page\s*\d+\s*-->', '', text)
+    
+    # Supprimer les lignes vides multiples
+    text = re.sub(r'\n{3,}', '\n\n', text)
+    
+    return text.strip()
+
+
+def clean_ocr_artifacts(text: str) -> str:
+    r"""Remove common OCR artifacts without using LLM.
+
+    This function performs fast, rule-based cleaning of typical OCR issues:
+    - Isolated page numbers (1-4 digits on their own line)
+    - Very short lines likely to be headers/footers (<=3 chars)
+    - Multiple consecutive spaces
+    - Excessive blank lines (>2)
+
+    Lines starting with '#' (markdown headers) are preserved regardless
+    of length. Empty lines are preserved (single blank lines only).
+
+    Args:
+        text: Raw OCR-extracted text potentially containing artifacts
+            like isolated page numbers, repeated headers, and irregular spacing.
+
+    Returns:
+        Cleaned text with artifacts removed and spacing normalized.
+        Leading/trailing whitespace is stripped.
+
+    Example:
+        >>> text = "42\n\nActual content here\n\n\n\n\nMore text"
+        >>> clean_ocr_artifacts(text)
+        'Actual content here\n\nMore text'
+
+    Note:
+        This function is always called as part of clean_chunk() and provides
+        a baseline level of cleaning even when LLM cleaning is disabled.
+    """
+    # Supprimer les numéros de page isolés
+    text = re.sub(r'^\d{1,4}\s*$', '', text, flags=re.MULTILINE)
+
+    # Supprimer les en-têtes/pieds de page répétés (lignes très courtes isolées)
+    lines: List[str] = text.split('\n')
+    cleaned_lines: List[str] = []
+    for line in lines:
+        # Garder les lignes non vides et significatives
+        stripped: str = line.strip()
+        if stripped and (len(stripped) > 3 or stripped.startswith('#')):
+            cleaned_lines.append(line)
+        elif not stripped:
+            cleaned_lines.append('')  # Préserver les lignes vides simples
+
+    text = '\n'.join(cleaned_lines)
+
+    # Normaliser les espaces
+    text = re.sub(r' {2,}', ' ', text)
+
+    # Supprimer les lignes vides multiples
+    text = re.sub(r'\n{3,}', '\n\n', text)
+
+    return text.strip()
+
+
+def clean_content_with_llm(
+    text: str,
+    context: Optional[str] = None,
+    model: Optional[str] = None,
+    provider: LLMProvider = "ollama",
+    temperature: float = 0.1,
+) -> str:
+    """Clean text content using an LLM for intelligent OCR error correction.
+
+    Uses a language model to correct subtle OCR errors that rule-based
+    cleaning cannot handle, such as misrecognized characters in context.
+    The LLM is instructed to preserve the intellectual content exactly
+    while fixing obvious technical errors.
+
+    The function includes safeguards:
+    - Texts < 50 chars: Only basic cleaning (LLM skipped)
+    - Texts > 3000 chars: Only basic cleaning (timeout risk)
+    - If LLM changes text by >50%: Fallback to basic cleaning
+
+    Args:
+        text: Text content to clean. Should be between 50-3000 characters
+            for LLM processing.
+        context: Optional context about the document (title, subject) to
+            help the LLM make better corrections. Example: "Heidegger's
+            Being and Time, Chapter 2".
+        model: LLM model name. If None, uses provider default
+            (qwen2.5:7b for ollama, mistral-small-latest for mistral).
+        provider: LLM provider to use. Options: "ollama" (local, free)
+            or "mistral" (API, faster).
+        temperature: LLM temperature for response generation. Lower values
+            (0.1) produce more deterministic corrections. Defaults to 0.1.
+
+    Returns:
+        Cleaned text with OCR errors corrected. If LLM fails or produces
+        suspicious output (too short/long), returns basic-cleaned text.
+
+    Raises:
+        No exceptions raised - all errors caught and handled with fallback.
+
+    Example:
+        >>> text = "Heidegger's concept of Dase1n is central..."  # '1' should be 'i'
+        >>> clean_content_with_llm(text, context="Being and Time")
+        "Heidegger's concept of Dasein is central..."
+
+    Note:
+        The LLM is explicitly instructed NOT to:
+        - Modify meaning or intellectual content
+        - Rephrase or summarize
+        - Add any new content
+        - Alter citations or technical vocabulary
+    """
+    if model is None:
+        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
+
+    # Ne pas traiter les textes trop courts
+    if len(text.strip()) < 50:
+        return clean_page_markers(text)
+
+    # Limiter la taille pour éviter les timeouts
+    max_chars: int = 3000
+    if len(text) > max_chars:
+        # Pour les longs textes, nettoyer sans LLM
+        return clean_page_markers(clean_ocr_artifacts(text))
+
+    context_info: str = f"Contexte: {context}\n" if context else ""
+
+    prompt: str = f"""Tu es un expert en correction de textes OCRisés.
+
+TÂCHE: Nettoyer ce texte extrait par OCR.
+
+{context_info}
+ACTIONS À EFFECTUER:
+1. Supprimer les marqueurs de page (<!-- Page X -->)
+2. Corriger les erreurs OCR ÉVIDENTES (caractères mal reconnus)
+3. Supprimer les artefacts (numéros de page isolés, en-têtes répétés)
+4. Normaliser la ponctuation et les espaces
+
+RÈGLES STRICTES:
+- NE PAS modifier le sens ou le contenu intellectuel
+- NE PAS reformuler ou résumer
+- NE PAS ajouter de contenu
+- Préserver les citations et le vocabulaire technique
+- Garder la structure des paragraphes
+
+TEXTE À NETTOYER:
+{text}
+
+RÉPONDS UNIQUEMENT avec le texte nettoyé, sans commentaires ni balises."""
+
+    try:
+        response: str = call_llm(
+            prompt, model=model, provider=provider, temperature=temperature, timeout=120
+        )
+
+        # Vérifier que la réponse est valide
+        cleaned: str = response.strip()
+
+        # Si la réponse est trop différente (LLM a trop modifié), garder l'original nettoyé basiquement
+        if len(cleaned) < len(text) * 0.5 or len(cleaned) > len(text) * 1.5:
+            logger.warning("LLM a trop modifié le texte, utilisation du nettoyage basique")
+            return clean_page_markers(clean_ocr_artifacts(text))
+
+        return cleaned
+
+    except Exception as e:
+        logger.warning(f"Erreur nettoyage LLM: {e}, utilisation du nettoyage basique")
+        return clean_page_markers(clean_ocr_artifacts(text))
+
+
+def clean_chunk(
+    chunk_text: str,
+    use_llm: bool = False,
+    context: Optional[str] = None,
+    model: Optional[str] = None,
+    provider: LLMProvider = "ollama",
+) -> str:
+    r"""Clean a text chunk with optional LLM enhancement.
+
+    This is the main entry point for chunk cleaning. It always applies
+    basic cleaning (page markers, OCR artifacts) and optionally uses
+    LLM for more intelligent error correction.
+
+    Cleaning pipeline:
+    1. Remove page markers (always)
+    2. Remove OCR artifacts (always)
+    3. LLM correction (if use_llm=True and text >= 50 chars)
+
+    Args:
+        chunk_text: Raw text content of the chunk to clean.
+        use_llm: Whether to use LLM for enhanced cleaning. Defaults to
+            False. Set to True for higher quality but slower processing.
+        context: Optional document context (title, chapter) passed to LLM
+            for better corrections. Ignored if use_llm=False.
+        model: LLM model name. If None, uses provider default.
+            Ignored if use_llm=False.
+        provider: LLM provider ("ollama" or "mistral"). Defaults to
+            "ollama". Ignored if use_llm=False.
+
+    Returns:
+        Cleaned chunk text ready for indexing or further processing.
+
+    Example:
+        >>> # Basic cleaning only (fast)
+        >>> chunk = "<!-- Page 5 -->\n42\n\nThe concept of being..."
+        >>> clean_chunk(chunk)
+        'The concept of being...'
+        >>>
+        >>> # With LLM enhancement (slower, higher quality)
+        >>> clean_chunk(chunk, use_llm=True, context="Heidegger analysis")
+        'The concept of being...'
+
+    See Also:
+        is_chunk_valid: Validate cleaned chunks before processing
+        clean_page_markers: Basic page marker removal
+        clean_ocr_artifacts: Basic artifact removal
+    """
+    # Nettoyage de base toujours appliqué
+    text: str = clean_page_markers(chunk_text)
+    text = clean_ocr_artifacts(text)
+
+    # Nettoyage LLM optionnel
+    if use_llm and len(text) >= 50:
+        text = clean_content_with_llm(text, context=context, model=model, provider=provider)
+
+    return text
+
+
+def is_chunk_valid(chunk_text: str, min_chars: int = 20, min_words: int = 5) -> bool:
+    """Check if a text chunk contains meaningful content.
+
+    Validates that a chunk has sufficient length and is not purely
+    metadata or boilerplate content. Used to filter out non-content
+    chunks before indexing.
+
+    Validation criteria:
+    1. Character count >= min_chars (after page marker removal)
+    2. Word count >= min_words
+    3. Not matching metadata patterns (URLs, ISBNs, DOIs, dates, copyright)
+
+    Args:
+        chunk_text: Text content of the chunk to validate. Page markers
+            are removed before validation.
+        min_chars: Minimum number of characters required. Defaults to 20.
+            Chunks shorter than this are considered invalid.
+        min_words: Minimum number of words required. Defaults to 5.
+            Chunks with fewer words are considered invalid.
+
+    Returns:
+        True if the chunk passes all validation criteria and contains
+        meaningful content suitable for indexing. False otherwise.
+
+    Example:
+        >>> is_chunk_valid("The concept of Dasein is central to Heidegger.")
+        True
+        >>> is_chunk_valid("42")  # Too short
+        False
+        >>> is_chunk_valid("ISBN 978-0-123456-78-9")  # Metadata
+        False
+        >>> is_chunk_valid("https://example.com/page")  # URL
+        False
+
+    Note:
+        Metadata patterns checked:
+        - URLs (http://, https://)
+        - Dates (YYYY-MM-DD format)
+        - ISBN numbers
+        - DOI identifiers
+        - Copyright notices (©)
+    """
+    text: str = clean_page_markers(chunk_text).strip()
+
+    # Vérifier la longueur
+    if len(text) < min_chars:
+        return False
+
+    # Compter les mots
+    words: List[str] = text.split()
+    if len(words) < min_words:
+        return False
+
+    # Vérifier que ce n'est pas juste des métadonnées
+    metadata_patterns: List[str] = [
+        r'^https?://',
+        r'^\d{4}-\d{2}-\d{2}$',
+        r'^ISBN',
+        r'^DOI',
+        r'^©',
+    ]
+    pattern: str
+    for pattern in metadata_patterns:
+        if re.match(pattern, text, re.IGNORECASE):
+            return False
+
+    return True
+