fix: Correct Work titles and improve LLM metadata extraction
Fixes issue where LLM was copying placeholder instructions from the prompt template into actual metadata fields. Changes: 1. Created fix_work_titles.py script to correct existing bad titles - Detects patterns like "(si c'est bien...)", "Titre corrigé...", "Auteur à identifier" - Extracts correct metadata from chunks JSON files - Updates Work entries and associated chunks (44 chunks updated) - Fixed 3 Works with placeholder contamination 2. Improved llm_metadata.py prompt to prevent future issues - Added explicit INTERDIT/OBLIGATOIRE rules with ❌/✅ markers - Replaced placeholder examples with real concrete examples - Added two example responses (high confidence + low confidence) - Final empty JSON template guides structure without placeholders - Reinforced: use "confidence" field for uncertainty, not annotations Results: - "A Cartesian critique... (si c'est bien le titre)" → "A Cartesian critique of the artificial intelligence" - "Titre corrigé si nécessaire (ex: ...)" → "Computationalism and The Case When the Brain Is Not a Computer" - "Titre de l'article principal (à identifier)" → "Computationalism in the Philosophy of Mind" All future document uploads will now extract clean metadata without LLM commentary or placeholder instructions. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
225
fix_work_titles.py
Normal file
225
fix_work_titles.py
Normal file
@@ -0,0 +1,225 @@
|
||||
"""Fix Work titles that contain LLM placeholder instructions."""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, List, Tuple
|
||||
|
||||
# Fix Windows encoding
|
||||
if sys.platform == "win32":
|
||||
import io
|
||||
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
||||
|
||||
# Add parent directory to path
|
||||
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))
|
||||
|
||||
import weaviate
|
||||
import weaviate.classes.query as wvq
|
||||
|
||||
# Patterns indicating bad titles/authors (LLM placeholders)
BAD_PATTERNS = [
    "si c'est bien le titre",
    "à identifier",
    "à confirmer",
    "ex:",
    "Titre corrigé",
    "Auteur à identifier",
    "Nom de l'auteur",
    "(possiblement)",
    "(correct)",
]


def is_bad_metadata(text: str) -> bool:
    """Check if metadata contains LLM placeholder patterns.

    Args:
        text: Title or author string to inspect (may be empty or None-ish).

    Returns:
        True if any known placeholder pattern occurs (case-insensitive).
    """
    if not text:
        return False
    text_lower = text.lower()
    return any(pattern.lower() in text_lower for pattern in BAD_PATTERNS)


def clean_title(title: str) -> str:
    """Extract a clean title from placeholder-contaminated text.

    Args:
        title: Title possibly containing LLM placeholder annotations.

    Returns:
        The cleaned title, or the input unchanged when it is already clean
        or no reliable extraction is possible.
    """
    if not is_bad_metadata(title):
        return title

    import re

    # Try the quoted 'ex:' extraction FIRST. Titles like
    # 'Titre corrigé si nécessaire (ex: "Real Title")' must yield the quoted
    # real title; checking the "(" branch first (as the previous version did)
    # returned the placeholder prefix "Titre corrigé si nécessaire" instead.
    match = re.search(r'ex:\s*["\']([^"\']+)["\']', title, re.IGNORECASE)
    if match:
        return match.group(1)

    # Strip a trailing parenthesized annotation: 'Title (si c'est bien...)'.
    # Only accept the prefix when it is no longer placeholder-contaminated,
    # otherwise we would just swap one placeholder for another.
    if "(" in title:
        clean = title.split("(")[0].strip()
        if clean and not is_bad_metadata(clean):
            return clean

    return title
|
||||
def get_correct_metadata_from_chunks(
|
||||
output_dir: Path, source_id: str
|
||||
) -> Tuple[str | None, str | None]:
|
||||
"""Extract correct title/author from chunks JSON file.
|
||||
|
||||
Returns:
|
||||
Tuple of (title, author) or (None, None) if not found.
|
||||
"""
|
||||
chunks_file = output_dir / source_id / f"{source_id}_chunks.json"
|
||||
if not chunks_file.exists():
|
||||
return None, None
|
||||
|
||||
try:
|
||||
with open(chunks_file, 'r', encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
|
||||
metadata = data.get("metadata", {})
|
||||
|
||||
# Priority: work > original_title > title
|
||||
title = (
|
||||
metadata.get("work") or
|
||||
metadata.get("original_title") or
|
||||
metadata.get("title")
|
||||
)
|
||||
|
||||
author = (
|
||||
metadata.get("original_author") or
|
||||
metadata.get("author")
|
||||
)
|
||||
|
||||
return title, author
|
||||
except Exception as e:
|
||||
print(f" ⚠️ Error reading {chunks_file}: {e}")
|
||||
return None, None
|
||||
|
||||
def fix_works_and_chunks():
    """Fix Work titles and update associated chunks.

    Two-phase pass over a local Weaviate instance:
      1. Scan every object in the "Work" collection and collect those whose
         title or author matches a known LLM placeholder pattern.
      2. For each flagged Work, restore the correct metadata from its
         chunks JSON file, update the Work, then update every "Chunk_v2"
         object whose workTitle still carries the old (bad) title.

    Progress and failures are reported on stdout; nothing is returned.
    """
    # NOTE(review): assumes this script sits two levels above the output
    # directory — confirm against the repository layout.
    output_dir = Path(__file__).parent / "generations" / "library_rag" / "output"

    print("🔧 Fixing Work titles with LLM placeholders...\n")

    client = weaviate.connect_to_local()

    try:
        work_collection = client.collections.get("Work")
        chunk_collection = client.collections.get("Chunk_v2")

        # Find all Works with bad titles/authors
        works_to_fix: List[Dict[str, Any]] = []

        print("📊 Scanning Works for placeholder patterns...\n")

        # Vectors are irrelevant to this scan, so skip fetching them.
        for work in work_collection.iterator(include_vector=False):
            props = work.properties
            source_id = props.get("sourceId")
            title = props.get("title", "")
            author = props.get("author", "")

            # Without a sourceId there is no chunks file to recover
            # correct metadata from, so the Work cannot be fixed.
            if not source_id:
                continue

            needs_fix = is_bad_metadata(title) or is_bad_metadata(author)

            if needs_fix:
                works_to_fix.append({
                    "uuid": str(work.uuid),
                    "source_id": source_id,
                    "old_title": title,
                    "old_author": author,
                })
                print(f"❌ Found bad Work: {source_id}")
                print(f"   Title: {title[:80]}")
                print(f"   Author: {author[:80]}\n")

        if not works_to_fix:
            print("✅ No Works need fixing!")
            return

        print(f"\n🔍 Found {len(works_to_fix)} Works to fix\n")
        print("=" * 70)

        # Fix each Work
        fixed_count = 0
        failed_count = 0

        for work_data in works_to_fix:
            source_id = work_data["source_id"]
            work_uuid = work_data["uuid"]
            old_title = work_data["old_title"]
            old_author = work_data["old_author"]

            print(f"\n📝 Fixing: {source_id}")

            # Get correct metadata from chunks file
            correct_title, correct_author = get_correct_metadata_from_chunks(
                output_dir, source_id
            )

            # No recoverable title means we cannot safely rewrite anything.
            if not correct_title:
                print(f"   ⚠️ Could not find correct metadata, skipping")
                failed_count += 1
                continue

            # Clean title if still has placeholders (the chunks file itself
            # may have been written from contaminated LLM output).
            if is_bad_metadata(correct_title):
                correct_title = clean_title(correct_title)

            if is_bad_metadata(correct_author or ""):
                correct_author = None  # Better to leave empty than keep placeholder

            print(f"   Old title: {old_title[:60]}")
            print(f"   New title: {correct_title[:60]}")
            print(f"   Old author: {old_author[:60]}")
            print(f"   New author: {correct_author or 'None'}")

            # Update Work
            try:
                work_collection.data.update(
                    uuid=work_uuid,
                    properties={
                        "title": correct_title,
                        "author": correct_author,
                    }
                )
                print(f"   ✅ Updated Work")

                # Update associated chunks. Chunks are matched by the OLD
                # title because that is what they still denormalize.
                # NOTE(review): limit=1000 caps the fix-up — works with more
                # chunks would be partially updated; confirm the ceiling.
                chunks = chunk_collection.query.fetch_objects(
                    filters=wvq.Filter.by_property("workTitle").equal(old_title),
                    limit=1000
                )

                chunk_count = len(chunks.objects)
                if chunk_count > 0:
                    print(f"   🔄 Updating {chunk_count} chunks...")

                    for chunk in chunks.objects:
                        # Per-chunk try/except: one bad chunk must not
                        # abort the remaining updates for this Work.
                        try:
                            chunk_collection.data.update(
                                uuid=str(chunk.uuid),
                                properties={
                                    "workTitle": correct_title,
                                    "workAuthor": correct_author,
                                }
                            )
                        except Exception as e:
                            print(f"      ⚠️ Failed to update chunk {chunk.uuid}: {e}")

                    print(f"   ✅ Updated {chunk_count} chunks")

                fixed_count += 1

            except Exception as e:
                print(f"   ❌ Failed to update Work: {e}")
                failed_count += 1

        print("\n" + "=" * 70)
        print(f"\n✅ Fixed {fixed_count} Works")
        if failed_count > 0:
            print(f"⚠️ Failed to fix {failed_count} Works")

    finally:
        # Always release the Weaviate connection, even on scan failure.
        client.close()
|
||||
if __name__ == "__main__":
|
||||
fix_works_and_chunks()
|
||||
print("\n✓ Done")
|
||||
@@ -215,30 +215,49 @@ INDICES POUR TROUVER LE VRAI TITRE:
|
||||
- Répété sur la page de garde et la page de titre
|
||||
- Peut contenir un sous-titre après ":"
|
||||
|
||||
IMPORTANT - FORMAT DES DONNÉES:
|
||||
- N'ajoute JAMAIS d'annotations comme "(correct)", "(à confirmer)", "(possiblement)", etc.
|
||||
- Retourne uniquement les noms propres et titres sans commentaires
|
||||
- NE METS PAS de phrases comme "À confirmer avec...", "Vérifier si...", "Possiblement..."
|
||||
- Le champ "confidence" sert à exprimer ton niveau de certitude
|
||||
- Si tu n'es pas sûr du titre, mets le titre le plus probable ET un confidence faible
|
||||
- EXEMPLE CORRECT: "title": "La pensée-signe" avec "confidence": {{"title": 0.6}}
|
||||
- EXEMPLE INCORRECT: "title": "À confirmer avec le titre exact"
|
||||
RÈGLES CRITIQUES - FORMAT DES DONNÉES:
|
||||
❌ INTERDIT: N'ajoute JAMAIS d'annotations, commentaires ou instructions dans les valeurs
|
||||
❌ INTERDIT: "(correct)", "(à confirmer)", "(possiblement)", "(ex:)", "(si c'est bien...)"
|
||||
❌ INTERDIT: "À confirmer avec...", "Vérifier si...", "Possiblement...", "Titre corrigé..."
|
||||
❌ INTERDIT: "Auteur à identifier", "Nom de l'auteur si disponible"
|
||||
|
||||
RÉPONDS UNIQUEMENT avec un JSON entre balises <JSON></JSON>:
|
||||
✅ OBLIGATOIRE: Retourne UNIQUEMENT le titre exact tel qu'il apparaît dans le document
|
||||
✅ OBLIGATOIRE: Retourne UNIQUEMENT le nom de l'auteur tel qu'il apparaît
|
||||
✅ Si incertain: utilise le champ "confidence" avec un score bas (0.3-0.6)
|
||||
✅ Si vraiment introuvable: utilise null (pas de phrase descriptive)
|
||||
|
||||
EXEMPLE DE BONNE RÉPONSE (extrait d'un vrai livre):
|
||||
<JSON>
|
||||
{{
|
||||
"title": "Le vrai titre de l'œuvre (avec sous-titre si présent)",
|
||||
"author": "Prénom Nom de l'auteur principal",
|
||||
"collection": "Nom de la collection ou série (null si absent)",
|
||||
"publisher": "Nom de l'éditeur",
|
||||
"year": 2023,
|
||||
"doi": "10.xxxx/xxxxx (null si absent)",
|
||||
"isbn": "978-x-xxxx-xxxx-x (null si absent)",
|
||||
"title": "La technique et le temps: La faute d'Épiméthée",
|
||||
"author": "Bernard Stiegler",
|
||||
"collection": "Philosophie",
|
||||
"publisher": "Éditions Galilée",
|
||||
"year": 1994,
|
||||
"doi": null,
|
||||
"isbn": "978-2-7186-0489-8",
|
||||
"language": "fr",
|
||||
"confidence": {{
|
||||
"title": 0.95,
|
||||
"author": 0.90
|
||||
"author": 0.98
|
||||
}}
|
||||
}}
|
||||
</JSON>
|
||||
|
||||
EXEMPLE si métadonnées incertaines:
|
||||
<JSON>
|
||||
{{
|
||||
"title": "Between Past and Future",
|
||||
"author": "Hannah Arendt",
|
||||
"collection": null,
|
||||
"publisher": null,
|
||||
"year": null,
|
||||
"doi": null,
|
||||
"isbn": null,
|
||||
"language": "en",
|
||||
"confidence": {{
|
||||
"title": 0.7,
|
||||
"author": 0.85
|
||||
}}
|
||||
}}
|
||||
</JSON>
|
||||
@@ -246,7 +265,21 @@ RÉPONDS UNIQUEMENT avec un JSON entre balises <JSON></JSON>:
|
||||
DOCUMENT À ANALYSER:
|
||||
{content}
|
||||
|
||||
Réponds UNIQUEMENT avec le JSON."""
|
||||
RAPPEL: Retourne UNIQUEMENT le JSON avec les valeurs EXACTES du document, SANS commentaires ni annotations.
|
||||
|
||||
<JSON>
|
||||
{{
|
||||
"title": "",
|
||||
"author": "",
|
||||
"collection": null,
|
||||
"publisher": null,
|
||||
"year": null,
|
||||
"doi": null,
|
||||
"isbn": null,
|
||||
"language": "fr",
|
||||
"confidence": {{}}
|
||||
}}
|
||||
</JSON>"""
|
||||
|
||||
logger.info(f"Extraction métadonnées via {provider.upper()} ({model})")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user