Problème:
- AttributeError: 'NoneType' object has no attribute 'lower'
- Se produisait quand section.get("title") retournait None au lieu de ""
Corrections:
- llm_classifier.py:
* is_excluded_section(): (section.get("title") or "").lower()
* filter_indexable_sections(): (s.get("chapterTitle") or "").lower()
* validate_classified_sections(): Idem pour chapter_title et section_title
- llm_validator.py:
* apply_corrections(): Ajout de vérification "if title and ..."
- llm_chat.py:
* call_llm(): Ajout d'une exception si provider est None/vide
Pattern de correction:
AVANT: section.get("title", "").lower() # Échoue si None
APRÈS: (section.get("title") or "").lower() # Sûr avec None
Raison:
.get(key, default) retourne le default SEULEMENT si la clé n'existe pas.
Si la clé existe avec valeur None, .get() retourne None, pas le default!
Donc: {"title": None}.get("title", "") -> None (pas "")
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
514 lines
18 KiB
Python
514 lines
18 KiB
Python
"""Document validation and enrichment using Large Language Models.
|
|
|
|
This module provides comprehensive validation, correction, and enrichment
|
|
functionality for parsed documents. It uses LLMs to verify document coherence,
|
|
detect inconsistencies, suggest corrections, and extract key concepts from
|
|
text chunks.
|
|
|
|
Overview:
|
|
The module performs three main functions:
|
|
|
|
1. **Document Validation** (validate_document):
|
|
Verifies the coherence of parsed documents by checking metadata,
|
|
table of contents, and chunk content quality. Returns detailed
|
|
validation results with issues, corrections, and confidence scores.
|
|
|
|
2. **Content Enrichment** (enrich_chunks_with_concepts, generate_section_summary):
|
|
Enhances document content by extracting key philosophical concepts
|
|
from chunks and generating concise summaries for sections.
|
|
|
|
3. **Correction Application** (apply_corrections, clean_validation_annotations):
|
|
Applies suggested corrections from validation results and cleans
|
|
LLM-generated annotation artifacts from text.
|
|
|
|
Validation Criteria:
|
|
The validator checks several aspects of document quality:
|
|
|
|
- **Metadata Quality**: Verifies title and author are correctly identified
|
|
(not collection names, not "Unknown" when visible in text)
|
|
- **TOC Coherence**: Checks for duplicates, proper ordering, completeness
|
|
- **Chunk Content**: Ensures chunks contain substantive content, not just
|
|
metadata fragments or headers
|
|
|
|
Validation Result Structure:
|
|
The ValidationResult TypedDict contains:
|
|
|
|
- valid (bool): Overall validation pass/fail
|
|
- errors (List[str]): Critical issues requiring attention
|
|
- warnings (List[str]): Non-critical suggestions
|
|
- corrections (Dict[str, str]): Suggested field corrections
|
|
- concepts (List[str]): Extracted key concepts
|
|
- score (float): Confidence score (0.0 to 1.0)
|
|
|
|
LLM Provider Support:
|
|
- ollama: Local LLM (free, slower, privacy-preserving)
|
|
- mistral: Mistral API (faster, requires API key, ~0.001 per validation)
|
|
|
|
Example:
|
|
>>> from utils.llm_validator import validate_document, apply_corrections
|
|
>>>
|
|
>>> # Validate a parsed document
|
|
>>> parsed_doc = {
|
|
... "metadata": {"title": "Phenomenologie", "author": "Hegel"},
|
|
... "toc": [{"title": "Preface", "level": 1, "page": 1}],
|
|
... "chunks": [{"text": "La conscience...", "section_path": "Preface"}]
|
|
... }
|
|
>>> result = validate_document(parsed_doc, provider="ollama")
|
|
>>> print(f"Valid: {result['valid']}, Score: {result['score']}")
|
|
Valid: True, Score: 0.85
|
|
|
|
See Also:
|
|
utils.llm_cleaner: Text cleaning and validation
|
|
utils.llm_chunker: Semantic chunking of sections
|
|
utils.pdf_pipeline: Main pipeline orchestration
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List, Optional, Match
|
|
|
|
from .llm_structurer import call_llm, _get_default_model, _get_default_mistral_model, _clean_json_string
|
|
from .types import LLMProvider, ValidationResult, ParsedDocument, ChunkData
|
|
|
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _extract_json_from_response(text: str) -> Dict[str, Any]:
|
|
"""Extract JSON from an LLM response text.
|
|
|
|
Attempts to parse JSON from the response using two strategies:
|
|
1. Look for content wrapped in <JSON></JSON> tags
|
|
2. Find the first { and last } to extract raw JSON
|
|
|
|
Args:
|
|
text: LLM response text potentially containing JSON data.
|
|
May include markdown, explanatory text, or XML-style tags.
|
|
|
|
Returns:
|
|
Parsed dictionary from the JSON content. Returns an empty dict
|
|
if no valid JSON is found or parsing fails.
|
|
|
|
Example:
|
|
>>> response = '<JSON>{"valid": true, "score": 0.9}</JSON>'
|
|
>>> _extract_json_from_response(response)
|
|
{'valid': True, 'score': 0.9}
|
|
"""
|
|
json_match: Optional[Match[str]] = re.search(r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL)
|
|
if json_match:
|
|
json_str: str = _clean_json_string(json_match.group(1))
|
|
try:
|
|
result: Dict[str, Any] = json.loads(json_str)
|
|
return result
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
start: int = text.find("{")
|
|
end: int = text.rfind("}")
|
|
if start != -1 and end > start:
|
|
json_str = _clean_json_string(text[start:end + 1])
|
|
try:
|
|
result = json.loads(json_str)
|
|
return result
|
|
except json.JSONDecodeError as e:
|
|
logger.warning(f"JSON invalide: {e}")
|
|
|
|
return {}
|
|
|
|
|
|
def validate_document(
|
|
parsed_doc: Dict[str, Any],
|
|
model: Optional[str] = None,
|
|
provider: LLMProvider = "ollama",
|
|
temperature: float = 0.1,
|
|
) -> ValidationResult:
|
|
"""Validate a parsed document's coherence and suggest corrections.
|
|
|
|
Uses an LLM to analyze the document structure and content, checking
|
|
for common issues like incorrect metadata, inconsistent TOC, or
|
|
low-quality chunk content.
|
|
|
|
Args:
|
|
parsed_doc: Dictionary containing the parsed document with keys:
|
|
- metadata: Dict with title, author, year, language
|
|
- toc: List of TOC entries with title, level, page
|
|
- chunks: List of text chunks with content and metadata
|
|
model: LLM model name. If None, uses provider's default model.
|
|
provider: LLM provider, either "ollama" (local) or "mistral" (API).
|
|
temperature: Model temperature for response generation (0.0-1.0).
|
|
Lower values produce more deterministic results.
|
|
|
|
Returns:
|
|
ValidationResult TypedDict containing:
|
|
- valid: Overall validation status (True if no critical errors)
|
|
- errors: List of critical issues as strings
|
|
- warnings: List of non-critical suggestions
|
|
- corrections: Dict mapping field names to suggested corrections
|
|
- concepts: Extracted key concepts (empty for this function)
|
|
- score: Confidence score from 0.0 to 1.0
|
|
|
|
Note:
|
|
The function always returns a valid result, even on LLM errors.
|
|
Check the 'score' field - a score of 0.0 indicates an error occurred.
|
|
"""
|
|
if model is None:
|
|
model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
|
|
|
|
# Extraire les infos clés
|
|
metadata: Dict[str, Any] = parsed_doc.get("metadata", {})
|
|
toc: List[Dict[str, Any]] = parsed_doc.get("toc", [])
|
|
chunks: List[Dict[str, Any]] = parsed_doc.get("chunks", [])
|
|
|
|
# Préparer le résumé du document
|
|
doc_summary: Dict[str, Any] = {
|
|
"title": metadata.get("title"),
|
|
"author": metadata.get("author"),
|
|
"toc_count": len(toc),
|
|
"toc_preview": [t.get("title") for t in toc[:10]] if toc else [],
|
|
"chunks_count": len(chunks),
|
|
"first_chunks_preview": [
|
|
c.get("text", "")[:100] for c in chunks[:5]
|
|
] if chunks else [],
|
|
}
|
|
|
|
prompt: str = f"""Tu es un expert en validation de documents structurés.
|
|
|
|
TÂCHE: Vérifier la cohérence de ce document parsé et détecter les erreurs.
|
|
|
|
DOCUMENT PARSÉ:
|
|
{json.dumps(doc_summary, ensure_ascii=False, indent=2)}
|
|
|
|
VÉRIFICATIONS À EFFECTUER:
|
|
1. Le titre correspond-il au contenu? (pas le nom d'une collection)
|
|
2. L'auteur est-il correctement identifié? (pas "Inconnu" si visible)
|
|
3. La TOC est-elle cohérente? (pas de doublons, bon ordre)
|
|
4. Les chunks contiennent-ils du vrai contenu? (pas que des métadonnées)
|
|
|
|
RÉPONDS avec un JSON entre <JSON></JSON>:
|
|
|
|
<JSON>
|
|
{{
|
|
"is_valid": true,
|
|
"confidence": 0.85,
|
|
"issues": [
|
|
{{
|
|
"field": "title",
|
|
"severity": "warning",
|
|
"message": "Le titre semble être le nom de la collection",
|
|
"suggestion": "Vrai titre suggéré"
|
|
}}
|
|
],
|
|
"corrections": {{
|
|
"title": "Titre corrigé si nécessaire",
|
|
"author": "Auteur corrigé si nécessaire"
|
|
}},
|
|
"quality_score": {{
|
|
"metadata": 0.8,
|
|
"toc": 0.9,
|
|
"chunks": 0.7
|
|
}}
|
|
}}
|
|
</JSON>
|
|
"""
|
|
|
|
logger.info(f"Validation du document parsé via {provider.upper()}")
|
|
|
|
try:
|
|
response: str = call_llm(
|
|
prompt, model=model, provider=provider, temperature=temperature, timeout=180
|
|
)
|
|
result: Dict[str, Any] = _extract_json_from_response(response)
|
|
|
|
# Construire ValidationResult avec valeurs par défaut
|
|
is_valid: bool = result.get("is_valid", True)
|
|
issues: List[str] = result.get("issues", [])
|
|
corrections: Dict[str, str] = result.get("corrections", {})
|
|
confidence: float = result.get("confidence", 0.5)
|
|
|
|
logger.info(f"Validation terminée: valid={is_valid}, issues={len(issues)}")
|
|
|
|
validation_result: ValidationResult = {
|
|
"valid": is_valid,
|
|
"errors": [str(issue) for issue in issues] if issues else [],
|
|
"warnings": [],
|
|
"corrections": corrections,
|
|
"concepts": [],
|
|
"score": confidence,
|
|
}
|
|
return validation_result
|
|
|
|
except Exception as e:
|
|
logger.error(f"Erreur validation document: {e}")
|
|
error_result: ValidationResult = {
|
|
"valid": True,
|
|
"errors": [str(e)],
|
|
"warnings": [],
|
|
"corrections": {},
|
|
"concepts": [],
|
|
"score": 0.0,
|
|
}
|
|
return error_result
|
|
|
|
|
|
def generate_section_summary(
|
|
section_content: str,
|
|
section_title: str,
|
|
model: Optional[str] = None,
|
|
provider: LLMProvider = "ollama",
|
|
max_words: int = 50,
|
|
) -> str:
|
|
"""Generate a concise summary for a document section using LLM.
|
|
|
|
Creates a single-sentence summary capturing the main idea of the section.
|
|
For very short sections (< 100 characters), returns the section title
|
|
instead of calling the LLM.
|
|
|
|
Args:
|
|
section_content: Full text content of the section to summarize.
|
|
section_title: Title of the section, used as fallback if summarization
|
|
fails or content is too short.
|
|
model: LLM model name. If None, uses provider's default model.
|
|
provider: LLM provider, either "ollama" (local) or "mistral" (API).
|
|
max_words: Maximum number of words for the generated summary.
|
|
Defaults to 50 words.
|
|
|
|
Returns:
|
|
Generated summary string, truncated to max_words if necessary.
|
|
Returns section_title if content is too short or on error.
|
|
|
|
Note:
|
|
Only the first 2000 characters of section_content are sent to the LLM
|
|
to manage context window limits and costs.
|
|
"""
|
|
if model is None:
|
|
model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
|
|
|
|
if len(section_content) < 100:
|
|
return section_title
|
|
|
|
prompt: str = f"""Résume cette section en maximum {max_words} mots.
|
|
Le résumé doit capturer l'idée principale.
|
|
|
|
Titre: {section_title}
|
|
Contenu:
|
|
{section_content[:2000]}
|
|
|
|
Résumé (en une phrase):"""
|
|
|
|
try:
|
|
response: str = call_llm(
|
|
prompt, model=model, provider=provider, temperature=0.2, timeout=60
|
|
)
|
|
|
|
# Nettoyer la réponse
|
|
summary: str = response.strip()
|
|
|
|
# Limiter la longueur
|
|
words: List[str] = summary.split()
|
|
if len(words) > max_words:
|
|
summary = ' '.join(words[:max_words]) + '...'
|
|
|
|
return summary or section_title
|
|
|
|
except Exception as e:
|
|
logger.warning(f"Erreur génération résumé: {e}")
|
|
return section_title
|
|
|
|
|
|
def enrich_chunks_with_concepts(
|
|
chunks: List[Dict[str, Any]],
|
|
model: Optional[str] = None,
|
|
provider: LLMProvider = "ollama",
|
|
) -> List[Dict[str, Any]]:
|
|
"""Enrich text chunks with extracted key concepts using LLM.
|
|
|
|
Processes each chunk to extract 3-5 key philosophical or thematic
|
|
concepts, adding them to the chunk's 'concepts' field. Skips chunks
|
|
that already have concepts or are too short (< 100 characters).
|
|
|
|
Args:
|
|
chunks: List of chunk dictionaries, each containing at minimum:
|
|
- text: The chunk's text content
|
|
May also contain existing 'concepts' field (will be skipped).
|
|
model: LLM model name. If None, uses provider's default model.
|
|
provider: LLM provider, either "ollama" (local) or "mistral" (API).
|
|
|
|
Returns:
|
|
The same list of chunks, modified in-place with 'concepts' field
|
|
added to each chunk. Each concepts field is a list of 0-5 strings.
|
|
|
|
Note:
|
|
- Chunks are processed individually with logging every 10 chunks.
|
|
- Only the first 1000 characters of each chunk are analyzed.
|
|
- The function modifies chunks in-place AND returns them.
|
|
- On extraction error, sets concepts to an empty list.
|
|
"""
|
|
if model is None:
|
|
model = _get_default_mistral_model() if provider == "mistral" else _get_default_model()
|
|
|
|
# Limiter le nombre de chunks à traiter en une fois
|
|
batch_size: int = 10
|
|
|
|
i: int
|
|
chunk: Dict[str, Any]
|
|
for i, chunk in enumerate(chunks):
|
|
if "concepts" in chunk and chunk["concepts"]:
|
|
continue # Déjà enrichi
|
|
|
|
text: str = chunk.get("text", "")
|
|
if len(text) < 100:
|
|
chunk["concepts"] = []
|
|
continue
|
|
|
|
# Traiter par batch pour optimiser
|
|
if i % batch_size == 0:
|
|
logger.info(f"Enrichissement concepts: chunks {i} à {min(i+batch_size, len(chunks))}")
|
|
|
|
prompt: str = f"""Extrait 3-5 concepts clés de ce texte.
|
|
Réponds avec une liste JSON: ["concept1", "concept2", ...]
|
|
|
|
Texte:
|
|
{text[:1000]}
|
|
|
|
Concepts:"""
|
|
|
|
try:
|
|
response: str = call_llm(
|
|
prompt, model=model, provider=provider, temperature=0.1, timeout=30
|
|
)
|
|
|
|
# Chercher la liste JSON
|
|
match: Optional[Match[str]] = re.search(r'\[.*?\]', response, re.DOTALL)
|
|
if match:
|
|
concepts: List[str] = json.loads(match.group())
|
|
chunk["concepts"] = concepts[:5]
|
|
else:
|
|
chunk["concepts"] = []
|
|
|
|
except Exception as e:
|
|
logger.warning(f"Erreur extraction concepts chunk {i}: {e}")
|
|
chunk["concepts"] = []
|
|
|
|
return chunks
|
|
|
|
|
|
def clean_validation_annotations(text: str) -> str:
|
|
"""Remove LLM-generated validation annotations from text.
|
|
|
|
Cleans common annotation patterns that LLMs may add when validating
|
|
or correcting text, such as confidence markers or verification notes.
|
|
|
|
Patterns removed:
|
|
- "(correct)" or "(a confirmer)" at end of text
|
|
- "(a confirmer comme titre principal)"
|
|
- "(possiblement...)" or "(probablement...)"
|
|
- Isolated "(correct)" or "(a confirmer)" mid-text
|
|
|
|
Args:
|
|
text: Text potentially containing LLM annotation artifacts.
|
|
|
|
Returns:
|
|
Cleaned text with annotations removed and whitespace normalized.
|
|
Returns the original text if input is None or empty.
|
|
|
|
Example:
|
|
>>> clean_validation_annotations("Phenomenologie (a confirmer)")
|
|
"Phenomenologie"
|
|
>>> clean_validation_annotations("G.W.F. Hegel (correct)")
|
|
'G.W.F. Hegel'
|
|
"""
|
|
if not text:
|
|
return text
|
|
|
|
# Supprimer les annotations à la fin du texte
|
|
text = re.sub(
|
|
r'\s*\([^)]*(?:correct|à confirmer|possiblement|probablement)[^)]*\)\s*$',
|
|
'',
|
|
text,
|
|
flags=re.IGNORECASE
|
|
)
|
|
|
|
# Nettoyer aussi les annotations au milieu si elles sont isolées
|
|
text = re.sub(r'\s*\((?:correct|à confirmer)\)\s*', ' ', text, flags=re.IGNORECASE)
|
|
|
|
return text.strip()
|
|
|
|
|
|
def apply_corrections(
|
|
parsed_doc: Dict[str, Any],
|
|
validation_result: Optional[Dict[str, Any]] = None,
|
|
) -> Dict[str, Any]:
|
|
"""Apply validation corrections to a parsed document.
|
|
|
|
Takes the corrections suggested by validate_document() and applies them
|
|
to the document's metadata. Also cleans any LLM annotation artifacts
|
|
from existing metadata fields.
|
|
|
|
Args:
|
|
parsed_doc: Parsed document dictionary containing at minimum:
|
|
- metadata: Dict with title, author, and other fields
|
|
May also contain 'work' field as fallback title source.
|
|
validation_result: Result from validate_document() containing:
|
|
- corrections: Dict mapping field names to corrected values
|
|
If None, only cleans existing metadata annotations.
|
|
|
|
Returns:
|
|
The modified parsed_doc with:
|
|
- Corrected metadata fields applied
|
|
- Original values preserved in 'original_<field>' keys
|
|
- LLM annotations cleaned from all text fields
|
|
- 'validation' key added with the validation_result
|
|
|
|
Note:
|
|
- Modifies parsed_doc in-place AND returns it
|
|
- Empty correction values are ignored
|
|
- If title contains validation phrases and 'work' field exists,
|
|
the work field value is used as the corrected title
|
|
"""
|
|
corrections: Dict[str, str] = (
|
|
validation_result.get("corrections", {}) if validation_result else {}
|
|
)
|
|
|
|
metadata: Dict[str, Any] = parsed_doc.get("metadata", {})
|
|
|
|
# Appliquer les corrections de métadonnées
|
|
if "title" in corrections and corrections["title"]:
|
|
old_title: Optional[str] = metadata.get("title")
|
|
# Nettoyer les annotations de validation
|
|
clean_title: str = clean_validation_annotations(corrections["title"])
|
|
metadata["title"] = clean_title
|
|
metadata["original_title"] = old_title
|
|
logger.info(f"Titre corrigé: '{old_title}' -> '{clean_title}'")
|
|
|
|
if "author" in corrections and corrections["author"]:
|
|
old_author: Optional[str] = metadata.get("author")
|
|
# Nettoyer les annotations de validation
|
|
clean_author: str = clean_validation_annotations(corrections["author"])
|
|
metadata["author"] = clean_author
|
|
metadata["original_author"] = old_author
|
|
logger.info(f"Auteur corrigé: '{old_author}' -> '{clean_author}'")
|
|
|
|
# Nettoyer aussi les métadonnées existantes si pas de corrections
|
|
if "title" in metadata and metadata["title"]:
|
|
title: str = metadata["title"]
|
|
# Si le titre contient des phrases de validation, utiliser le champ "work" à la place
|
|
validation_phrases: List[str] = ["à confirmer", "confirmer avec", "vérifier"]
|
|
if title and any(phrase in title.lower() for phrase in validation_phrases):
|
|
if "work" in metadata and metadata["work"]:
|
|
logger.info(f"Titre remplacé par 'work': '{title}' -> '{metadata['work']}'")
|
|
metadata["original_title"] = title
|
|
metadata["title"] = metadata["work"]
|
|
else:
|
|
metadata["title"] = clean_validation_annotations(title)
|
|
|
|
if "author" in metadata and metadata["author"]:
|
|
metadata["author"] = clean_validation_annotations(metadata["author"])
|
|
|
|
parsed_doc["metadata"] = metadata
|
|
parsed_doc["validation"] = validation_result
|
|
|
|
return parsed_doc
|
|
|