"""Document validation and enrichment using Large Language Models.

This module provides validation, correction, and enrichment functionality
for parsed documents. It uses LLMs to verify document coherence, detect
inconsistencies, suggest corrections, and extract key concepts from text
chunks.

Overview:
    The module performs three main functions:

    1. **Document Validation** (validate_document): Verifies the coherence
       of parsed documents by checking metadata, table of contents, and
       chunk content quality. Returns validation results with issues,
       corrections, and a confidence score.

    2. **Content Enrichment** (enrich_chunks_with_concepts,
       generate_section_summary): Enhances document content by extracting
       key concepts from chunks and generating concise summaries for
       sections.

    3. **Correction Application** (apply_corrections,
       clean_validation_annotations): Applies suggested corrections from
       validation results and cleans LLM-generated annotation artifacts
       from text.

Validation Criteria:
    The validator asks the LLM to check several aspects of document quality:

    - **Metadata Quality**: Title and author are correctly identified
      (not collection names, not "Unknown" when visible in text)
    - **TOC Coherence**: Duplicates, ordering
    - **Chunk Content**: Chunks contain substantive content, not just
      metadata fragments

Validation Result Structure:
    The ValidationResult TypedDict contains:

    - valid (bool): Overall validation pass/fail
    - errors (List[str]): Critical issues requiring attention
    - warnings (List[str]): Non-critical suggestions
    - corrections (Dict[str, str]): Suggested field corrections
    - concepts (List[str]): Extracted key concepts
    - score (float): Confidence score (0.0 to 1.0)

LLM Provider Support:
    - ollama: Local LLM (free, slower, privacy-preserving)
    - mistral: Mistral API (faster, requires API key, ~0.001 per validation)

Example:
    >>> from utils.llm_validator import validate_document, apply_corrections
    >>>
    >>> # Validate a parsed document
    >>> parsed_doc = {
    ...     "metadata": {"title": "Phenomenologie", "author": "Hegel"},
    ...     "toc": [{"title": "Preface", "level": 1, "page": 1}],
    ...     "chunks": [{"text": "La conscience...", "section_path": "Preface"}]
    ... }
    >>> result = validate_document(parsed_doc, provider="ollama")
    >>> print(f"Valid: {result['valid']}, Score: {result['score']}")
    Valid: True, Score: 0.85

See Also:
    utils.llm_cleaner: Text cleaning and validation
    utils.llm_chunker: Semantic chunking of sections
    utils.pdf_pipeline: Main pipeline orchestration
"""

from __future__ import annotations

import json
import logging
import re
from typing import Any, Dict, List, Optional, Match

from .llm_structurer import (
    call_llm,
    _get_default_model,
    _get_default_mistral_model,
    _clean_json_string,
)
from .types import LLMProvider, ValidationResult, ParsedDocument, ChunkData

logger: logging.Logger = logging.getLogger(__name__)

# NOTE(review): the tag name is assumed to be <JSON>…</JSON>. The validation
# prompt below asks the model to wrap its answer in this same tag, so the two
# stay consistent within this module — confirm against utils.llm_structurer.
_JSON_TAG_RE = re.compile(r'<JSON>\s*(.*?)\s*</JSON>', re.DOTALL | re.IGNORECASE)

# First flat JSON array in a response, e.g. ["a", "b"].
_CONCEPT_LIST_RE = re.compile(r'\[.*?\]', re.DOTALL)

# Parenthesised annotation at the very end of the text. "[àa]" accepts both
# the accented and the ASCII spelling of "à confirmer".
_TRAILING_ANNOTATION_RE = re.compile(
    r'\s*\([^)]*(?:correct|[àa] confirmer|possiblement|probablement)[^)]*\)\s*$',
    re.IGNORECASE,
)

# Stand-alone "(correct)" / "(à confirmer)" anywhere in the text.
_INLINE_ANNOTATION_RE = re.compile(
    r'\s*\((?:correct|[àa] confirmer)\)\s*',
    re.IGNORECASE,
)

# Issue severities that are reported as warnings rather than errors.
_NON_CRITICAL_SEVERITIES = frozenset({"warning", "info"})

# Phrases that mark a title as an unresolved LLM remark rather than a title.
_VALIDATION_PHRASES = ("à confirmer", "confirmer avec", "vérifier")


def _resolve_model(model: Optional[str], provider: LLMProvider) -> str:
    """Return *model*, or the provider's default model when it is None."""
    if model is not None:
        return model
    if provider == "mistral":
        return _get_default_mistral_model()
    return _get_default_model()


def _coerce_bool(value: Any, default: bool) -> bool:
    """Interpret a JSON value as a boolean, falling back to *default*."""
    if isinstance(value, bool):
        return value
    if isinstance(value, (int, float)):
        return bool(value)
    if isinstance(value, str):
        lowered = value.strip().lower()
        if lowered in ("true", "yes", "oui", "1"):
            return True
        if lowered in ("false", "no", "non", "0"):
            return False
    return default


def _coerce_score(value: Any, default: float = 0.5) -> float:
    """Convert *value* to a float clamped to [0.0, 1.0], else *default*."""
    try:
        score = float(value)
    except (TypeError, ValueError):
        return default
    if score != score:  # NaN compares unequal to itself
        return default
    return min(1.0, max(0.0, score))


def _split_issues(issues: Any) -> tuple:
    """Split LLM-reported issues into (errors, warnings) by severity.

    Each issue is rendered with ``str()``. Issues that are not dicts, or
    whose severity is missing or unrecognised, are treated as errors.
    """
    errors: List[str] = []
    warnings: List[str] = []
    if not isinstance(issues, list):
        return errors, warnings
    for issue in issues:
        severity = ""
        if isinstance(issue, dict):
            severity = str(issue.get("severity", "")).strip().lower()
        bucket = warnings if severity in _NON_CRITICAL_SEVERITIES else errors
        bucket.append(str(issue))
    return errors, warnings


def _extract_json_from_response(text: str) -> Dict[str, Any]:
    """Extract a JSON object from an LLM response text.

    Two strategies are tried in order:

    1. Content wrapped in ``<JSON>…</JSON>`` tags.
    2. The span from the first ``{`` to the last ``}``.

    Each candidate is passed through ``_clean_json_string`` before parsing.

    Args:
        text: LLM response text potentially containing JSON data. May
            include markdown, explanatory text, or XML-style tags.

    Returns:
        The parsed dictionary. An empty dict if no candidate parses to a
        JSON object.

    Example:
        >>> response = '<JSON>{"valid": true, "score": 0.9}</JSON>'
        >>> _extract_json_from_response(response)
        {'valid': True, 'score': 0.9}
    """
    if not text:
        return {}

    candidates: List[str] = []

    tagged: Optional[Match[str]] = _JSON_TAG_RE.search(text)
    if tagged:
        candidates.append(tagged.group(1))

    start: int = text.find("{")
    end: int = text.rfind("}")
    if start != -1 and end > start:
        candidates.append(text[start:end + 1])

    last_error: Optional[json.JSONDecodeError] = None
    for candidate in candidates:
        try:
            parsed = json.loads(_clean_json_string(candidate))
        except json.JSONDecodeError as e:
            last_error = e
            continue
        # Callers use .get() on the result, so only a JSON object is usable.
        if isinstance(parsed, dict):
            return parsed

    if last_error is not None:
        logger.warning("JSON invalide: %s", last_error)
    return {}


def validate_document(
    parsed_doc: Dict[str, Any],
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    temperature: float = 0.1,
) -> ValidationResult:
    """Validate a parsed document's coherence and suggest corrections.

    Sends a compact summary of the document (title, author, first TOC titles,
    first 100 characters of the first chunks) to an LLM and converts its JSON
    answer into a ValidationResult.

    Args:
        parsed_doc: Dictionary containing the parsed document with keys:
            - metadata: Dict with title, author, ...
            - toc: List of TOC entries (dicts with a "title")
            - chunks: List of chunk dicts with a "text"
            Missing or None values are treated as empty.
        model: LLM model name. If None, uses provider's default model.
        provider: LLM provider, either "ollama" (local) or "mistral" (API).
        temperature: Model temperature passed to the LLM call.

    Returns:
        ValidationResult TypedDict containing:
            - valid: The LLM's ``is_valid`` verdict (True when absent)
            - errors: Issues whose severity is not "warning"/"info"
            - warnings: Issues whose severity is "warning" or "info"
            - corrections: Dict mapping field names to suggested corrections
            - concepts: Always empty for this function
            - score: The LLM's ``confidence`` clamped to [0.0, 1.0]
              (0.5 when absent or not numeric)

    Note:
        The function returns a well-formed result even when the LLM call
        fails: ``valid`` stays True so a failed call is not reported as an
        invalid document, ``errors`` carries the exception text, and
        ``score`` is 0.0. Check the score to detect this case.
    """
    model = _resolve_model(model, provider)

    # Key fields; "or" also covers keys that are present but None.
    metadata: Dict[str, Any] = parsed_doc.get("metadata") or {}
    toc: List[Dict[str, Any]] = parsed_doc.get("toc") or []
    chunks: List[Dict[str, Any]] = parsed_doc.get("chunks") or []

    # Compact summary: enough for the LLM to judge, small enough for the prompt.
    doc_summary: Dict[str, Any] = {
        "title": metadata.get("title"),
        "author": metadata.get("author"),
        "toc_count": len(toc),
        "toc_preview": [entry.get("title") for entry in toc[:10]],
        "chunks_count": len(chunks),
        "first_chunks_preview": [
            (chunk.get("text") or "")[:100] for chunk in chunks[:5]
        ],
    }

    prompt: str = f"""Tu es un expert en validation de documents structurés.

TÂCHE: Vérifier la cohérence de ce document parsé et détecter les erreurs.

DOCUMENT PARSÉ:
{json.dumps(doc_summary, ensure_ascii=False, indent=2)}

VÉRIFICATIONS À EFFECTUER:
1. Le titre correspond-il au contenu? (pas le nom d'une collection)
2. L'auteur est-il correctement identifié? (pas "Inconnu" si visible)
3. La TOC est-elle cohérente? (pas de doublons, bon ordre)
4. Les chunks contiennent-ils du vrai contenu? (pas que des métadonnées)

RÉPONDS avec un JSON entre <JSON></JSON>:

<JSON>
{{
    "is_valid": true,
    "confidence": 0.85,
    "issues": [
        {{
            "field": "title",
            "severity": "warning",
            "message": "Le titre semble être le nom de la collection",
            "suggestion": "Vrai titre suggéré"
        }}
    ],
    "corrections": {{
        "title": "Titre corrigé si nécessaire",
        "author": "Auteur corrigé si nécessaire"
    }},
    "quality_score": {{
        "metadata": 0.8,
        "toc": 0.9,
        "chunks": 0.7
    }}
}}
</JSON>
"""

    logger.info("Validation du document parsé via %s", provider.upper())

    try:
        response: str = call_llm(
            prompt, model=model, provider=provider,
            temperature=temperature, timeout=180,
        )
        result: Dict[str, Any] = _extract_json_from_response(response)

        # Build the ValidationResult, coercing LLM output to the documented types.
        is_valid: bool = _coerce_bool(result.get("is_valid"), True)
        errors, warnings = _split_issues(result.get("issues"))
        raw_corrections = result.get("corrections")
        corrections: Dict[str, str] = (
            raw_corrections if isinstance(raw_corrections, dict) else {}
        )
        confidence: float = _coerce_score(result.get("confidence"), 0.5)

        logger.info(
            "Validation terminée: valid=%s, issues=%d",
            is_valid, len(errors) + len(warnings),
        )

        validation_result: ValidationResult = {
            "valid": is_valid,
            "errors": errors,
            "warnings": warnings,
            "corrections": corrections,
            "concepts": [],
            "score": confidence,
        }
        return validation_result

    except Exception as e:
        # Best-effort boundary: a failed LLM call must not abort the caller.
        logger.error("Erreur validation document: %s", e)
        error_result: ValidationResult = {
            "valid": True,
            "errors": [str(e)],
            "warnings": [],
            "corrections": {},
            "concepts": [],
            "score": 0.0,
        }
        return error_result


def generate_section_summary(
    section_content: str,
    section_title: str,
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
    max_words: int = 50,
) -> str:
    """Generate a concise summary for a document section using an LLM.

    For empty or very short sections (< 100 characters) the section title is
    returned without calling the LLM.

    Args:
        section_content: Full text content of the section to summarize.
        section_title: Title of the section, used as fallback if
            summarization fails or content is too short.
        model: LLM model name. If None, uses provider's default model.
        provider: LLM provider, either "ollama" (local) or "mistral" (API).
        max_words: Maximum number of words for the generated summary.

    Returns:
        The stripped LLM response, cut to ``max_words`` words with a trailing
        "..." when longer. Returns ``section_title`` if the content is too
        short, the response is empty, or the LLM call raises.

    Note:
        Only the first 2000 characters of section_content are sent to the
        LLM.
    """
    # Decide on the cheap path first: no model lookup or LLM call needed.
    if not section_content or len(section_content) < 100:
        return section_title

    model = _resolve_model(model, provider)

    prompt: str = f"""Résume cette section en maximum {max_words} mots.
Le résumé doit capturer l'idée principale.

Titre: {section_title}
Contenu:
{section_content[:2000]}

Résumé (en une phrase):"""

    try:
        response: str = call_llm(
            prompt, model=model, provider=provider,
            temperature=0.2, timeout=60,
        )

        summary: str = response.strip()

        # Enforce the word budget ourselves; the model may ignore it.
        words: List[str] = summary.split()
        if len(words) > max_words:
            summary = ' '.join(words[:max_words]) + '...'

        return summary or section_title

    except Exception as e:
        logger.warning("Erreur génération résumé: %s", e)
        return section_title


def enrich_chunks_with_concepts(
    chunks: List[Dict[str, Any]],
    model: Optional[str] = None,
    provider: LLMProvider = "ollama",
) -> List[Dict[str, Any]]:
    """Enrich text chunks with key concepts extracted by an LLM.

    Each chunk is sent to the LLM individually and its answer is stored in
    the chunk's 'concepts' field. Chunks that already have a non-empty
    'concepts' value are skipped; chunks shorter than 100 characters get an
    empty list without an LLM call.

    Args:
        chunks: List of chunk dictionaries, each with a 'text' entry and
            optionally an existing 'concepts' entry.
        model: LLM model name. If None, uses provider's default model.
        provider: LLM provider, either "ollama" (local) or "mistral" (API).

    Returns:
        The same list object; chunks are modified in place. Each processed
        chunk's 'concepts' is a list of at most 5 non-empty strings.

    Note:
        - Only the first 1000 characters of each chunk are analyzed.
        - Progress is logged for chunks whose index is a multiple of 10 and
          that actually reach the LLM; requests are not batched.
        - On any extraction error, 'concepts' is set to an empty list.
    """
    model = _resolve_model(model, provider)

    # Logging cadence only — every chunk is still a separate LLM call.
    batch_size: int = 10
    total: int = len(chunks)

    for i, chunk in enumerate(chunks):
        if chunk.get("concepts"):
            continue  # already enriched

        text: str = chunk.get("text") or ""
        if len(text) < 100:
            chunk["concepts"] = []
            continue

        if i % batch_size == 0:
            logger.info(
                "Enrichissement concepts: chunks %d à %d",
                i, min(i + batch_size, total),
            )

        prompt: str = f"""Extrait 3-5 concepts clés de ce texte.
Réponds avec une liste JSON: ["concept1", "concept2", ...]

Texte:
{text[:1000]}

Concepts:"""

        try:
            response: str = call_llm(
                prompt, model=model, provider=provider,
                temperature=0.1, timeout=30,
            )

            # Take the first JSON array found in the response.
            match: Optional[Match[str]] = _CONCEPT_LIST_RE.search(response)
            parsed = json.loads(match.group()) if match else []
            # Keep only real strings; the model may emit numbers or nulls.
            concepts: List[str] = [
                item.strip()
                for item in parsed
                if isinstance(item, str) and item.strip()
            ]
            chunk["concepts"] = concepts[:5]

        except Exception as e:
            logger.warning("Erreur extraction concepts chunk %d: %s", i, e)
            chunk["concepts"] = []

    return chunks


def clean_validation_annotations(text: str) -> str:
    """Remove LLM-generated validation annotations from text.

    Patterns removed (case-insensitive, "à" may be written "a"):
        - A trailing parenthesised remark containing "correct",
          "à confirmer", "possiblement" or "probablement", e.g.
          "(à confirmer comme titre principal)"
        - A stand-alone "(correct)" or "(à confirmer)" anywhere in the text,
          replaced by a single space

    Args:
        text: Text potentially containing LLM annotation artifacts.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
        Returns the input unchanged if it is None or empty.

    Example:
        >>> clean_validation_annotations("Phenomenologie (a confirmer)")
        'Phenomenologie'
        >>> clean_validation_annotations("G.W.F. Hegel (correct)")
        'G.W.F. Hegel'
    """
    if not text:
        return text

    # Annotation at the end of the text.
    text = _TRAILING_ANNOTATION_RE.sub('', text)

    # Isolated annotations in the middle of the text.
    text = _INLINE_ANNOTATION_RE.sub(' ', text)

    return text.strip()


def apply_corrections(
    parsed_doc: Dict[str, Any],
    validation_result: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Apply validation corrections to a parsed document.

    Applies the 'title' and 'author' corrections from ``validation_result``
    to the document's metadata, then cleans LLM annotation artifacts from
    the resulting title and author.

    Args:
        parsed_doc: Parsed document dictionary. Its 'metadata' dict may hold
            'title', 'author', and a 'work' entry used as fallback title.
        validation_result: Result from validate_document(); only its
            'corrections' dict is read. If None, only existing metadata is
            cleaned.

    Returns:
        The same ``parsed_doc`` object (modified in place) with:
            - Corrected title/author written to 'metadata'
            - The previous value kept under 'original_title' /
              'original_author' for each corrected field
            - Annotations cleaned from title and author
            - 'validation' set to ``validation_result``

    Note:
        - Corrections that are empty, not strings, or that become empty once
          annotations are removed are ignored.
        - If the title contains a validation phrase ("à confirmer",
          "confirmer avec", "vérifier") and metadata has a non-empty 'work',
          that value replaces the title; otherwise the title is cleaned.
    """
    raw_corrections = (
        validation_result.get("corrections") if validation_result else None
    )
    corrections: Dict[str, Any] = (
        raw_corrections if isinstance(raw_corrections, dict) else {}
    )

    metadata: Dict[str, Any] = parsed_doc.get("metadata") or {}

    # Apply the metadata corrections suggested by the validator.
    corrected_fields = set()
    for field, label in (("title", "Titre"), ("author", "Auteur")):
        suggestion = corrections.get(field)
        if not isinstance(suggestion, str) or not suggestion:
            continue
        cleaned: str = clean_validation_annotations(suggestion)
        if not cleaned:
            # The suggestion was nothing but an annotation; keep the old value.
            continue
        previous = metadata.get(field)
        metadata[field] = cleaned
        metadata[f"original_{field}"] = previous
        corrected_fields.add(field)
        logger.info("%s corrigé: '%s' -> '%s'", label, previous, cleaned)

    # Clean whatever title is now in place, corrected or not.
    title = metadata.get("title")
    if isinstance(title, str) and title:
        lowered = title.lower()
        work = metadata.get("work")
        if work and any(phrase in lowered for phrase in _VALIDATION_PHRASES):
            # The title is an unresolved LLM remark; prefer the 'work' field.
            logger.info("Titre remplacé par 'work': '%s' -> '%s'", title, work)
            if "title" not in corrected_fields:
                # Do not overwrite the real original saved just above.
                metadata["original_title"] = title
            metadata["title"] = work
        else:
            metadata["title"] = clean_validation_annotations(title)

    author = metadata.get("author")
    if isinstance(author, str) and author:
        metadata["author"] = clean_validation_annotations(author)

    parsed_doc["metadata"] = metadata
    parsed_doc["validation"] = validation_result

    return parsed_doc