Add Library RAG project and cleanup root directory

- Add complete Library RAG application (Flask + MCP server) - PDF processing pipeline with OCR and LLM extraction - Weaviate vector database integration (BGE-M3 embeddings) - Flask web interface with search and document management - MCP server for Claude Desktop integration - Comprehensive test suite (134 tests) - Clean up root directory - Remove obsolete documentation files - Remove backup and temporary files - Update autonomous agent configuration - Update prompts - Enhance initializer bis prompt with better instructions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-30 11:57:12 +01:00
parent 48470236da
commit d2f7165120
84 changed files with 26517 additions and 2 deletions
--- a/generations/library_rag/utils/llm_metadata.py
+++ b/generations/library_rag/utils/llm_metadata.py
@@ -0,0 +1,294 @@
+r"""LLM-based bibliographic metadata extraction from documents.
+
+This module extracts bibliographic metadata (title, author, publisher, year, etc.)
+from document text using Large Language Models. It supports both local (Ollama)
+and cloud-based (Mistral API) LLM providers.
+
+The extraction process:
+    1. Takes the first N characters of the document markdown (typically first pages)
+    2. Sends a structured prompt to the LLM requesting JSON-formatted metadata
+    3. Parses the LLM response to extract the JSON data
+    4. Applies default values and cleanup for missing/invalid fields
+
+Supported metadata fields:
+    - title: Document title (including subtitle if present)
+    - author: Primary author name
+    - collection: Series or collection name
+    - publisher: Publisher name
+    - year: Publication year
+    - doi: Digital Object Identifier
+    - isbn: ISBN number
+    - language: ISO 639-1 language code (default: "fr")
+    - confidence: Dict of confidence scores per field (0.0-1.0)
+
+LLM Provider Differences:
+    - **Ollama** (local): Free, slower, requires local installation.
+      Uses models like "mistral", "llama2", "mixtral".
+    - **Mistral API** (cloud): Fast, paid (~0.002€/call for small prompts).
+      Uses models like "mistral-small-latest", "mistral-medium-latest".
+
+Cost Implications:
+    - Ollama: No API cost, only local compute resources
+    - Mistral API: ~0.002€ per metadata extraction call (small prompt)
+
+Example:
+    >>> from utils.llm_metadata import extract_metadata
+    >>>
+    >>> markdown = '''
+    ... # La technique et le temps
+    ... ## Tome 1 : La faute d'Épiméthée
+    ...
+    ... Bernard Stiegler
+    ...
+    ... Éditions Galilée, 1994
+    ... '''
+    >>>
+    >>> metadata = extract_metadata(markdown, provider="ollama")
+    >>> print(metadata)
+    {
+        'title': 'La technique et le temps. Tome 1 : La faute d\'Épiméthée',
+        'author': 'Bernard Stiegler',
+        'publisher': 'Éditions Galilée',
+        'year': 1994,
+        'language': 'fr',
+        'confidence': {'title': 0.95, 'author': 0.98}
+    }
+
+See Also:
+    - llm_toc: Table of contents extraction via LLM
+    - llm_structurer: Core LLM call infrastructure
+    - pdf_pipeline: Orchestration using this module (Step 4)
+"""
+
+import json
+import logging
+import re
+from typing import Any, Dict, Optional
+
+from .llm_structurer import (
+    _clean_json_string,
+    _get_default_mistral_model,
+    _get_default_model,
+    call_llm,
+)
+from .types import LLMProvider
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+def _extract_json_from_response(text: str) -> Dict[str, Any]:
+    """Extract JSON data from an LLM response string.
+
+    Attempts to parse JSON from the LLM response using two strategies:
+    1. First, looks for JSON enclosed in <JSON></JSON> tags (preferred format)
+    2. Falls back to finding the first {...} block in the response
+
+    The function applies JSON string cleaning to handle common LLM quirks
+    like trailing commas, unescaped quotes, etc.
+
+    Args:
+        text: Raw LLM response text that may contain JSON data.
+
+    Returns:
+        Parsed JSON as a dictionary. Returns empty dict if no valid
+        JSON could be extracted.
+
+    Example:
+        >>> response = '<JSON>{"title": "Test", "author": "Smith"}</JSON>'
+        >>> _extract_json_from_response(response)
+        {'title': 'Test', 'author': 'Smith'}
+
+        >>> response = 'Here is the metadata: {"title": "Test"}'
+        >>> _extract_json_from_response(response)
+        {'title': 'Test'}
+    """
+    # Chercher entre balises <JSON> et </JSON>
+    json_match: Optional[re.Match[str]] = re.search(r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL)
+    if json_match:
+        json_str: str = _clean_json_string(json_match.group(1))
+        try:
+            result: Dict[str, Any] = json.loads(json_str)
+            return result
+        except json.JSONDecodeError:
+            pass
+
+    # Fallback: chercher le premier objet JSON
+    start: int = text.find("{")
+    end: int = text.rfind("}")
+    if start != -1 and end > start:
+        json_str = _clean_json_string(text[start:end + 1])
+        try:
+            result = json.loads(json_str)
+            return result
+        except json.JSONDecodeError as e:
+            logger.warning(f"JSON invalide: {e}")
+
+    return {}
+
+
+def extract_metadata(
+    markdown: str,
+    model: Optional[str] = None,
+    provider: LLMProvider = "ollama",
+    temperature: float = 0.1,
+    max_chars: int = 6000,
+) -> Dict[str, Any]:
+    """Extract bibliographic metadata from a document using an LLM.
+
+    Analyzes the beginning of a document (typically first few pages) to extract
+    bibliographic metadata including title, author, publisher, year, and more.
+    Uses a structured prompt that guides the LLM to distinguish between
+    document title vs. collection name vs. publisher name.
+
+    The LLM is instructed to return confidence scores for extracted fields,
+    allowing downstream processing to handle uncertain extractions appropriately.
+
+    Args:
+        markdown: Document text in Markdown format. For best results, provide
+            at least the first 2-3 pages containing title page and colophon.
+        model: LLM model name to use. If None, uses the default model for the
+            selected provider (e.g., "mistral" for Ollama, "mistral-small-latest"
+            for Mistral API).
+        provider: LLM provider to use. Options are:
+            - "ollama": Local LLM (free, slower, requires Ollama installation)
+            - "mistral": Mistral API (fast, paid, requires API key)
+        temperature: Model temperature for generation. Lower values (0.0-0.3)
+            produce more consistent, deterministic results. Default 0.1.
+        max_chars: Maximum number of characters to send to the LLM. Longer
+            documents are truncated. Default 6000 (~2 pages).
+
+    Returns:
+        Dictionary containing extracted metadata with the following keys:
+            - title (str | None): Document title with subtitle if present
+            - author (str | None): Primary author name
+            - collection (str | None): Series or collection name
+            - publisher (str | None): Publisher name
+            - year (int | None): Publication year
+            - doi (str | None): Digital Object Identifier
+            - isbn (str | None): ISBN number
+            - language (str): ISO 639-1 language code (default "fr")
+            - confidence (dict): Confidence scores per field (0.0-1.0)
+            - error (str): Error message if extraction failed (only on error)
+
+    Raises:
+        No exceptions are raised; errors are captured in the return dict.
+
+    Note:
+        - Cost for Mistral API: ~0.002€ per call (6000 chars input)
+        - Ollama is free but requires local GPU/CPU resources
+        - The prompt is in French as most processed documents are French texts
+        - Low temperature (0.1) is used for consistent metadata extraction
+
+    Example:
+        >>> # Extract from first pages of a philosophy book
+        >>> markdown = Path("output/stiegler/stiegler.md").read_text()[:6000]
+        >>> metadata = extract_metadata(markdown, provider="ollama")
+        >>> print(f"Title: {metadata['title']}")
+        Title: La technique et le temps
+
+        >>> # Using Mistral API for faster extraction
+        >>> metadata = extract_metadata(markdown, provider="mistral")
+        >>> print(f"Author: {metadata['author']} (confidence: {metadata['confidence'].get('author', 'N/A')})")
+        Author: Bernard Stiegler (confidence: 0.98)
+    """
+    if model is None:
+        model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
+    
+    # Prendre les premières pages (métadonnées souvent au début)
+    content: str = markdown[:max_chars]
+    if len(markdown) > max_chars:
+        content += "\n\n[... document tronqué ...]"
+    
+    prompt: str = f"""Tu es un expert en bibliographie et édition scientifique.
+
+TÂCHE: Extraire les métadonnées bibliographiques de ce document.
+
+ATTENTION - PIÈGES COURANTS:
+- Le titre n'est PAS forcément le premier titre H1 (peut être le nom de la collection)
+- Le sous-titre fait partie du titre
+- L'auteur peut apparaître sous le titre, dans les métadonnées éditeur, ou ailleurs
+- Distingue bien: titre de l'œuvre ≠ nom de la collection/série ≠ nom de l'éditeur
+
+INDICES POUR TROUVER LE VRAI TITRE:
+- Souvent en plus grand / plus visible
+- Accompagné du nom de l'auteur juste après
+- Répété sur la page de garde et la page de titre
+- Peut contenir un sous-titre après ":"
+
+IMPORTANT - FORMAT DES DONNÉES:
+- N'ajoute JAMAIS d'annotations comme "(correct)", "(à confirmer)", "(possiblement)", etc.
+- Retourne uniquement les noms propres et titres sans commentaires
+- NE METS PAS de phrases comme "À confirmer avec...", "Vérifier si...", "Possiblement..."
+- Le champ "confidence" sert à exprimer ton niveau de certitude
+- Si tu n'es pas sûr du titre, mets le titre le plus probable ET un confidence faible
+- EXEMPLE CORRECT: "title": "La pensée-signe" avec "confidence": {{"title": 0.6}}
+- EXEMPLE INCORRECT: "title": "À confirmer avec le titre exact"
+
+RÉPONDS UNIQUEMENT avec un JSON entre balises <JSON></JSON>:
+
+<JSON>
+{{
+    "title": "Le vrai titre de l'œuvre (avec sous-titre si présent)",
+    "author": "Prénom Nom de l'auteur principal",
+    "collection": "Nom de la collection ou série (null si absent)",
+    "publisher": "Nom de l'éditeur",
+    "year": 2023,
+    "doi": "10.xxxx/xxxxx (null si absent)",
+    "isbn": "978-x-xxxx-xxxx-x (null si absent)",
+    "language": "fr",
+    "confidence": {{
+        "title": 0.95,
+        "author": 0.90
+    }}
+}}
+</JSON>
+
+DOCUMENT À ANALYSER:
+{content}
+
+Réponds UNIQUEMENT avec le JSON."""
+
+    logger.info(f"Extraction métadonnées via {provider.upper()} ({model})")
+    
+    try:
+        response: str = call_llm(prompt, model=model, provider=provider, temperature=temperature)
+        metadata: Dict[str, Any] = _extract_json_from_response(response)
+        
+        # Valeurs par défaut si non trouvées
+        defaults: Dict[str, Optional[str]] = {
+            "title": None,
+            "author": None,
+            "collection": None,
+            "publisher": None,
+            "year": None,
+            "doi": None,
+            "isbn": None,
+            "language": "fr",
+        }
+        
+        for key, default in defaults.items():
+            if key not in metadata or metadata[key] == "":
+                metadata[key] = default
+        
+        # Nettoyer les valeurs "null" string
+        for key in metadata:
+            if metadata[key] == "null" or metadata[key] == "None":
+                metadata[key] = None
+        
+        logger.info(f"Métadonnées extraites: titre='{metadata.get('title')}', auteur='{metadata.get('author')}'")
+        return metadata
+        
+    except Exception as e:
+        logger.error(f"Erreur extraction métadonnées: {e}")
+        return {
+            "title": None,
+            "author": None,
+            "collection": None,
+            "publisher": None,
+            "year": None,
+            "doi": None,
+            "isbn": None,
+            "language": "fr",
+            "error": str(e),
+        }
+