fix: Correct Work titles and improve LLM metadata extraction
Fixes issue where LLM was copying placeholder instructions from the prompt template into actual metadata fields. Changes: 1. Created fix_work_titles.py script to correct existing bad titles - Detects patterns like "(si c'est bien...)", "Titre corrigé...", "Auteur à identifier" - Extracts correct metadata from chunks JSON files - Updates Work entries and associated chunks (44 chunks updated) - Fixed 3 Works with placeholder contamination 2. Improved llm_metadata.py prompt to prevent future issues - Added explicit INTERDIT/OBLIGATOIRE rules with ❌/✅ markers - Replaced placeholder examples with real concrete examples - Added two example responses (high confidence + low confidence) - Final empty JSON template guides structure without placeholders - Reinforced: use "confidence" field for uncertainty, not annotations Results: - "A Cartesian critique... (si c'est bien le titre)" → "A Cartesian critique of the artificial intelligence" - "Titre corrigé si nécessaire (ex: ...)" → "Computationalism and The Case When the Brain Is Not a Computer" - "Titre de l'article principal (à identifier)" → "Computationalism in the Philosophy of Mind" All future document uploads will now extract clean metadata without LLM commentary or placeholder instructions. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
225
fix_work_titles.py
Normal file
225
fix_work_titles.py
Normal file
@@ -0,0 +1,225 @@
|
||||
"""Fix Work titles that contain LLM placeholder instructions."""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, List, Tuple
|
||||
|
||||
# Fix Windows encoding
|
||||
if sys.platform == "win32":
|
||||
import io
|
||||
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
||||
|
||||
# Add parent directory to path
|
||||
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))
|
||||
|
||||
import weaviate
|
||||
import weaviate.classes.query as wvq
|
||||
|
||||
# Patterns indicating bad titles/authors (LLM placeholders)
BAD_PATTERNS = [
    "si c'est bien le titre",
    "à identifier",
    "à confirmer",
    "ex:",
    "Titre corrigé",
    "Auteur à identifier",
    "Nom de l'auteur",
    "(possiblement)",
    "(correct)",
]


def is_bad_metadata(text: str) -> bool:
    """Check if metadata contains LLM placeholder patterns.

    Args:
        text: Title or author string to inspect (may be empty or None-ish).

    Returns:
        True if any known placeholder pattern occurs (case-insensitive).
    """
    if not text:
        return False
    text_lower = text.lower()
    return any(pattern.lower() in text_lower for pattern in BAD_PATTERNS)


def clean_title(title: str) -> str:
    """Extract a clean title from placeholder-contaminated text.

    Args:
        title: Title possibly containing LLM placeholder annotations.

    Returns:
        The cleaned title, or the input unchanged when it is already clean
        or no reliable extraction is possible.
    """
    if not is_bad_metadata(title):
        return title

    import re

    # Try the quoted 'ex:' extraction FIRST. Titles like
    # 'Titre corrigé si nécessaire (ex: "Real Title")' must yield the quoted
    # real title; checking the "(" branch first (as the previous version did)
    # returned the placeholder prefix "Titre corrigé si nécessaire" instead.
    match = re.search(r'ex:\s*["\']([^"\']+)["\']', title, re.IGNORECASE)
    if match:
        return match.group(1)

    # Strip a trailing parenthesized annotation: 'Title (si c'est bien...)'.
    # Only accept the prefix when it is no longer placeholder-contaminated,
    # otherwise we would just swap one placeholder for another.
    if "(" in title:
        clean = title.split("(")[0].strip()
        if clean and not is_bad_metadata(clean):
            return clean

    return title
|
||||
def get_correct_metadata_from_chunks(
|
||||
output_dir: Path, source_id: str
|
||||
) -> Tuple[str | None, str | None]:
|
||||
"""Extract correct title/author from chunks JSON file.
|
||||
|
||||
Returns:
|
||||
Tuple of (title, author) or (None, None) if not found.
|
||||
"""
|
||||
chunks_file = output_dir / source_id / f"{source_id}_chunks.json"
|
||||
if not chunks_file.exists():
|
||||
return None, None
|
||||
|
||||
try:
|
||||
with open(chunks_file, 'r', encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
|
||||
metadata = data.get("metadata", {})
|
||||
|
||||
# Priority: work > original_title > title
|
||||
title = (
|
||||
metadata.get("work") or
|
||||
metadata.get("original_title") or
|
||||
metadata.get("title")
|
||||
)
|
||||
|
||||
author = (
|
||||
metadata.get("original_author") or
|
||||
metadata.get("author")
|
||||
)
|
||||
|
||||
return title, author
|
||||
except Exception as e:
|
||||
print(f" ⚠️ Error reading {chunks_file}: {e}")
|
||||
return None, None
|
||||
|
||||
def fix_works_and_chunks():
    """Fix Work titles and update associated chunks.

    Two-phase pass over a local Weaviate instance:
      1. Scan every object in the "Work" collection and collect those whose
         title or author matches a known LLM placeholder pattern.
      2. For each flagged Work, restore the correct metadata from its
         chunks JSON file, update the Work, then update every "Chunk_v2"
         object whose workTitle still carries the old (bad) title.

    Progress and failures are reported on stdout; nothing is returned.
    """
    # NOTE(review): assumes this script sits two levels above the output
    # directory — confirm against the repository layout.
    output_dir = Path(__file__).parent / "generations" / "library_rag" / "output"

    print("🔧 Fixing Work titles with LLM placeholders...\n")

    client = weaviate.connect_to_local()

    try:
        work_collection = client.collections.get("Work")
        chunk_collection = client.collections.get("Chunk_v2")

        # Find all Works with bad titles/authors
        works_to_fix: List[Dict[str, Any]] = []

        print("📊 Scanning Works for placeholder patterns...\n")

        # Vectors are irrelevant to this scan, so skip fetching them.
        for work in work_collection.iterator(include_vector=False):
            props = work.properties
            source_id = props.get("sourceId")
            title = props.get("title", "")
            author = props.get("author", "")

            # Without a sourceId there is no chunks file to recover
            # correct metadata from, so the Work cannot be fixed.
            if not source_id:
                continue

            needs_fix = is_bad_metadata(title) or is_bad_metadata(author)

            if needs_fix:
                works_to_fix.append({
                    "uuid": str(work.uuid),
                    "source_id": source_id,
                    "old_title": title,
                    "old_author": author,
                })
                print(f"❌ Found bad Work: {source_id}")
                print(f"   Title: {title[:80]}")
                print(f"   Author: {author[:80]}\n")

        if not works_to_fix:
            print("✅ No Works need fixing!")
            return

        print(f"\n🔍 Found {len(works_to_fix)} Works to fix\n")
        print("=" * 70)

        # Fix each Work
        fixed_count = 0
        failed_count = 0

        for work_data in works_to_fix:
            source_id = work_data["source_id"]
            work_uuid = work_data["uuid"]
            old_title = work_data["old_title"]
            old_author = work_data["old_author"]

            print(f"\n📝 Fixing: {source_id}")

            # Get correct metadata from chunks file
            correct_title, correct_author = get_correct_metadata_from_chunks(
                output_dir, source_id
            )

            # No recoverable title means we cannot safely rewrite anything.
            if not correct_title:
                print(f"   ⚠️ Could not find correct metadata, skipping")
                failed_count += 1
                continue

            # Clean title if still has placeholders (the chunks file itself
            # may have been written from contaminated LLM output).
            if is_bad_metadata(correct_title):
                correct_title = clean_title(correct_title)

            if is_bad_metadata(correct_author or ""):
                correct_author = None  # Better to leave empty than keep placeholder

            print(f"   Old title: {old_title[:60]}")
            print(f"   New title: {correct_title[:60]}")
            print(f"   Old author: {old_author[:60]}")
            print(f"   New author: {correct_author or 'None'}")

            # Update Work
            try:
                work_collection.data.update(
                    uuid=work_uuid,
                    properties={
                        "title": correct_title,
                        "author": correct_author,
                    }
                )
                print(f"   ✅ Updated Work")

                # Update associated chunks. Chunks are matched by the OLD
                # title because that is what they still denormalize.
                # NOTE(review): limit=1000 caps the fix-up — works with more
                # chunks would be partially updated; confirm the ceiling.
                chunks = chunk_collection.query.fetch_objects(
                    filters=wvq.Filter.by_property("workTitle").equal(old_title),
                    limit=1000
                )

                chunk_count = len(chunks.objects)
                if chunk_count > 0:
                    print(f"   🔄 Updating {chunk_count} chunks...")

                    for chunk in chunks.objects:
                        # Per-chunk try/except: one bad chunk must not
                        # abort the remaining updates for this Work.
                        try:
                            chunk_collection.data.update(
                                uuid=str(chunk.uuid),
                                properties={
                                    "workTitle": correct_title,
                                    "workAuthor": correct_author,
                                }
                            )
                        except Exception as e:
                            print(f"      ⚠️ Failed to update chunk {chunk.uuid}: {e}")

                    print(f"   ✅ Updated {chunk_count} chunks")

                fixed_count += 1

            except Exception as e:
                print(f"   ❌ Failed to update Work: {e}")
                failed_count += 1

        print("\n" + "=" * 70)
        print(f"\n✅ Fixed {fixed_count} Works")
        if failed_count > 0:
            print(f"⚠️ Failed to fix {failed_count} Works")

    finally:
        # Always release the Weaviate connection, even on scan failure.
        client.close()
|
||||
if __name__ == "__main__":
|
||||
fix_works_and_chunks()
|
||||
print("\n✓ Done")
|
||||
@@ -215,30 +215,49 @@ INDICES POUR TROUVER LE VRAI TITRE:
|
||||
- Répété sur la page de garde et la page de titre
|
||||
- Peut contenir un sous-titre après ":"
|
||||
|
||||
IMPORTANT - FORMAT DES DONNÉES:
|
||||
- N'ajoute JAMAIS d'annotations comme "(correct)", "(à confirmer)", "(possiblement)", etc.
|
||||
- Retourne uniquement les noms propres et titres sans commentaires
|
||||
- NE METS PAS de phrases comme "À confirmer avec...", "Vérifier si...", "Possiblement..."
|
||||
- Le champ "confidence" sert à exprimer ton niveau de certitude
|
||||
- Si tu n'es pas sûr du titre, mets le titre le plus probable ET un confidence faible
|
||||
- EXEMPLE CORRECT: "title": "La pensée-signe" avec "confidence": {{"title": 0.6}}
|
||||
- EXEMPLE INCORRECT: "title": "À confirmer avec le titre exact"
|
||||
RÈGLES CRITIQUES - FORMAT DES DONNÉES:
|
||||
❌ INTERDIT: N'ajoute JAMAIS d'annotations, commentaires ou instructions dans les valeurs
|
||||
❌ INTERDIT: "(correct)", "(à confirmer)", "(possiblement)", "(ex:)", "(si c'est bien...)"
|
||||
❌ INTERDIT: "À confirmer avec...", "Vérifier si...", "Possiblement...", "Titre corrigé..."
|
||||
❌ INTERDIT: "Auteur à identifier", "Nom de l'auteur si disponible"
|
||||
|
||||
RÉPONDS UNIQUEMENT avec un JSON entre balises <JSON></JSON>:
|
||||
✅ OBLIGATOIRE: Retourne UNIQUEMENT le titre exact tel qu'il apparaît dans le document
|
||||
✅ OBLIGATOIRE: Retourne UNIQUEMENT le nom de l'auteur tel qu'il apparaît
|
||||
✅ Si incertain: utilise le champ "confidence" avec un score bas (0.3-0.6)
|
||||
✅ Si vraiment introuvable: utilise null (pas de phrase descriptive)
|
||||
|
||||
EXEMPLE DE BONNE RÉPONSE (extrait d'un vrai livre):
|
||||
<JSON>
|
||||
{{
|
||||
"title": "Le vrai titre de l'œuvre (avec sous-titre si présent)",
|
||||
"author": "Prénom Nom de l'auteur principal",
|
||||
"collection": "Nom de la collection ou série (null si absent)",
|
||||
"publisher": "Nom de l'éditeur",
|
||||
"year": 2023,
|
||||
"doi": "10.xxxx/xxxxx (null si absent)",
|
||||
"isbn": "978-x-xxxx-xxxx-x (null si absent)",
|
||||
"title": "La technique et le temps: La faute d'Épiméthée",
|
||||
"author": "Bernard Stiegler",
|
||||
"collection": "Philosophie",
|
||||
"publisher": "Éditions Galilée",
|
||||
"year": 1994,
|
||||
"doi": null,
|
||||
"isbn": "978-2-7186-0489-8",
|
||||
"language": "fr",
|
||||
"confidence": {{
|
||||
"title": 0.95,
|
||||
"author": 0.90
|
||||
"author": 0.98
|
||||
}}
|
||||
}}
|
||||
</JSON>
|
||||
|
||||
EXEMPLE si métadonnées incertaines:
|
||||
<JSON>
|
||||
{{
|
||||
"title": "Between Past and Future",
|
||||
"author": "Hannah Arendt",
|
||||
"collection": null,
|
||||
"publisher": null,
|
||||
"year": null,
|
||||
"doi": null,
|
||||
"isbn": null,
|
||||
"language": "en",
|
||||
"confidence": {{
|
||||
"title": 0.7,
|
||||
"author": 0.85
|
||||
}}
|
||||
}}
|
||||
</JSON>
|
||||
@@ -246,7 +265,21 @@ RÉPONDS UNIQUEMENT avec un JSON entre balises <JSON></JSON>:
|
||||
DOCUMENT À ANALYSER:
|
||||
{content}
|
||||
|
||||
Réponds UNIQUEMENT avec le JSON."""
|
||||
RAPPEL: Retourne UNIQUEMENT le JSON avec les valeurs EXACTES du document, SANS commentaires ni annotations.
|
||||
|
||||
<JSON>
|
||||
{{
|
||||
"title": "",
|
||||
"author": "",
|
||||
"collection": null,
|
||||
"publisher": null,
|
||||
"year": null,
|
||||
"doi": null,
|
||||
"isbn": null,
|
||||
"language": "fr",
|
||||
"confidence": {{}}
|
||||
}}
|
||||
</JSON>"""
|
||||
|
||||
logger.info(f"Extraction métadonnées via {provider.upper()} ({model})")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user