From 0c8ea8fa482482c4e9078b45b797f486cd90717c Mon Sep 17 00:00:00 2001
From: David Blanc Brioir
Date: Thu, 8 Jan 2026 23:59:25 +0100
Subject: [PATCH] fix: Correct Work titles and improve LLM metadata extraction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes an issue where the LLM was copying placeholder instructions from the
prompt template into actual metadata fields.

Changes:

1. Created fix_work_titles.py script to correct existing bad titles
   - Detects patterns like "(si c'est bien...)", "Titre corrigé...",
     "Auteur à identifier"
   - Extracts correct metadata from chunks JSON files
   - Updates Work entries and associated chunks (44 chunks updated)
   - Fixed 3 Works with placeholder contamination

2. Improved llm_metadata.py prompt to prevent future issues
   - Added explicit INTERDIT/OBLIGATOIRE rules with ❌/✅ markers
   - Replaced placeholder examples with real concrete examples
   - Added two example responses (high confidence + low confidence)
   - Final empty JSON template guides structure without placeholders
   - Reinforced: use "confidence" field for uncertainty, not annotations

Results:
- "A Cartesian critique... (si c'est bien le titre)"
  → "A Cartesian critique of the artificial intelligence"
- "Titre corrigé si nécessaire (ex: ...)"
  → "Computationalism and The Case When the Brain Is Not a Computer"
- "Titre de l'article principal (à identifier)"
  → "Computationalism in the Philosophy of Mind"

Future document uploads are expected to yield clean metadata without LLM
commentary or placeholder instructions.

Co-Authored-By: Claude Sonnet 4.5
---
 fix_work_titles.py                            | 354 ++++++++++++++++++
 generations/library_rag/utils/llm_metadata.py |  69 ++++--
 2 files changed, 405 insertions(+), 18 deletions(-)
 create mode 100644 fix_work_titles.py

diff --git a/fix_work_titles.py b/fix_work_titles.py
new file mode 100644
index 0000000..982b3cb
--- /dev/null
+++ b/fix_work_titles.py
@@ -0,0 +1,354 @@
+"""Fix Work titles that contain LLM placeholder instructions.
+
+Maintenance script: scans the Weaviate ``Work`` collection for titles or
+authors containing prompt-template placeholders, reads the replacement
+values from each source's ``<source_id>_chunks.json`` file, then updates
+the matching ``Chunk_v2`` objects and the Work itself.
+"""
+
+import json
+import re
+import sys
+from pathlib import Path
+from typing import Dict, Any, List, Tuple
+
+# Fix Windows encoding so the emoji in the progress output can be printed.
+# reconfigure() switches the existing stream in place instead of wrapping
+# sys.stdout.buffer a second time.
+if sys.platform == "win32":
+    _reconfigure = getattr(sys.stdout, "reconfigure", None)
+    if _reconfigure is not None:
+        _reconfigure(encoding="utf-8")
+
+# Add parent directory to path
+sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))
+
+import weaviate
+import weaviate.classes.query as wvq
+
+# Patterns indicating bad titles/authors (LLM placeholders)
+BAD_PATTERNS = [
+    "si c'est bien le titre",
+    "à identifier",
+    "à confirmer",
+    "ex:",
+    "Titre corrigé",
+    "Auteur à identifier",
+    "Nom de l'auteur",
+    "(possiblement)",
+    "(correct)",
+]
+
+# Number of chunks requested per page when listing a Work's chunks
+CHUNK_PAGE_SIZE = 1000
+
+
+def _compile_bad_patterns(patterns: List[str]) -> re.Pattern:
+    """Build one case-insensitive regex matching any of *patterns*.
+
+    A pattern that starts with a letter or digit must not directly follow a
+    word character, so "ex:" does not fire inside "Index:" or "Complex:".
+    """
+    alternatives = []
+    for pattern in patterns:
+        escaped = re.escape(pattern)
+        if pattern[:1].isalnum():
+            escaped = r"(?<!\w)" + escaped
+        alternatives.append(escaped)
+    return re.compile("|".join(alternatives), re.IGNORECASE)
+
+
+_BAD_PATTERN_RE = _compile_bad_patterns(BAD_PATTERNS)
+
+# Quoted example inside a placeholder, e.g. ``ex: "Real Title"``. The
+# backreference makes the closing quote match the opening one, so an
+# apostrophe inside a double-quoted title does not cut the title short.
+_QUOTED_EXAMPLE_RE = re.compile(r'(?<!\w)ex:\s*(["\'])(.+?)\1', re.IGNORECASE)
+
+
+def is_bad_metadata(text: str) -> bool:
+    """Check if metadata contains LLM placeholder patterns.
+
+    Empty or missing text is reported as not bad.
+    """
+    if not text:
+        return False
+    return _BAD_PATTERN_RE.search(text) is not None
+
+
+def clean_title(title: str) -> str:
+    """Extract clean title from placeholder text.
+
+    Two candidates are tried in order: the text before the first "(" (for
+    ``Title (si c'est bien...)``), then a quoted example (for
+    ``... (ex: "Real Title")``). A candidate is accepted only when it is
+    non-empty and free of placeholder patterns.
+
+    Returns:
+        The cleaned title, or *title* unchanged when nothing clean could be
+        extracted; callers should re-check it with is_bad_metadata().
+    """
+    if not is_bad_metadata(title):
+        return title
+
+    # partition() returns the whole string when there is no "(", which is
+    # then rejected below because it still contains a placeholder.
+    candidates = [title.partition("(")[0].strip()]
+
+    quoted = _QUOTED_EXAMPLE_RE.search(title)
+    if quoted:
+        candidates.append(quoted.group(2).strip())
+
+    for candidate in candidates:
+        if candidate and not is_bad_metadata(candidate):
+            return candidate
+
+    return title
+
+
+def _first_text(metadata: Dict[str, Any], keys: Tuple[str, ...]) -> str | None:
+    """Return the first non-blank string stored in *metadata* under *keys*."""
+    for key in keys:
+        value = metadata.get(key)
+        if isinstance(value, str) and value.strip():
+            return value.strip()
+    return None
+
+
+def get_correct_metadata_from_chunks(
+    output_dir: Path, source_id: str
+) -> Tuple[str | None, str | None]:
+    """Extract correct title/author from chunks JSON file.
+
+    Reads ``<output_dir>/<source_id>/<source_id>_chunks.json`` and looks up
+    the values in its top-level ``metadata`` object.
+
+    Args:
+        output_dir: Directory containing one sub-directory per source.
+        source_id: Identifier of the source whose chunks file is read.
+
+    Returns:
+        Tuple of (title, author). An element is None when the file is
+        missing or unreadable, or when no non-blank string is stored for it.
+    """
+    chunks_file = output_dir / source_id / f"{source_id}_chunks.json"
+
+    try:
+        with open(chunks_file, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+    except FileNotFoundError:
+        return None, None
+    except (OSError, ValueError) as e:
+        # ValueError covers both invalid JSON and undecodable bytes
+        print(f" ⚠️ Error reading {chunks_file}: {e}")
+        return None, None
+
+    metadata = data.get("metadata") if isinstance(data, dict) else None
+    if not isinstance(metadata, dict):
+        return None, None
+
+    # Priority: work > original_title > title
+    title = _first_text(metadata, ("work", "original_title", "title"))
+    author = _first_text(metadata, ("original_author", "author"))
+
+    return title, author
+
+
+def _find_works_to_fix(work_collection: Any) -> List[Dict[str, Any]]:
+    """Return uuid/sourceId/title/author of every Work with bad metadata."""
+    works_to_fix: List[Dict[str, Any]] = []
+
+    for work in work_collection.iterator(include_vector=False):
+        props = work.properties
+        source_id = props.get("sourceId")
+        # .get(key, "") still yields None when the key holds a null value,
+        # which would break the slicing below; normalise with "or".
+        title = props.get("title") or ""
+        author = props.get("author") or ""
+
+        if not source_id:
+            continue
+
+        if is_bad_metadata(title) or is_bad_metadata(author):
+            works_to_fix.append({
+                "uuid": str(work.uuid),
+                "source_id": source_id,
+                "old_title": title,
+                "old_author": author,
+            })
+            print(f"❌ Found bad Work: {source_id}")
+            print(f" Title: {title[:80]}")
+            print(f" Author: {author[:80]}\n")
+
+    return works_to_fix
+
+
+def _update_chunks(
+    chunk_collection: Any,
+    old_title: str,
+    new_title: str,
+    new_author: str | None,
+) -> Tuple[int, int]:
+    """Retitle every chunk whose workTitle equals *old_title*.
+
+    All matching UUIDs are listed page by page before the first update is
+    sent, so the result set is not modified while it is being paged.
+
+    Returns:
+        Tuple of (matched, updated) chunk counts.
+    """
+    if not old_title:
+        # No title to match on; an empty filter value could hit unrelated chunks
+        return 0, 0
+
+    title_filter = wvq.Filter.by_property("workTitle").equal(old_title)
+
+    # NOTE(review): Weaviate limits how deep offset paging may go; a Work
+    # with more chunks than that limit would raise here - confirm if relevant.
+    chunk_uuids: List[str] = []
+    while True:
+        page = chunk_collection.query.fetch_objects(
+            filters=title_filter,
+            limit=CHUNK_PAGE_SIZE,
+            offset=len(chunk_uuids),
+        )
+        chunk_uuids.extend(str(chunk.uuid) for chunk in page.objects)
+        if len(page.objects) < CHUNK_PAGE_SIZE:
+            break
+
+    if not chunk_uuids:
+        return 0, 0
+
+    print(f" 🔄 Updating {len(chunk_uuids)} chunks...")
+
+    updated = 0
+    for chunk_uuid in chunk_uuids:
+        try:
+            chunk_collection.data.update(
+                uuid=chunk_uuid,
+                properties={
+                    "workTitle": new_title,
+                    "workAuthor": new_author,
+                }
+            )
+        except Exception as e:
+            print(f" ⚠️ Failed to update chunk {chunk_uuid}: {e}")
+        else:
+            updated += 1
+
+    print(f" ✅ Updated {updated} chunks")
+    return len(chunk_uuids), updated
+
+
+def _fix_single_work(
+    work_collection: Any,
+    chunk_collection: Any,
+    output_dir: Path,
+    work_data: Dict[str, Any],
+) -> bool:
+    """Repair one Work and its chunks; return True once the Work is updated."""
+    source_id = work_data["source_id"]
+    old_title = work_data["old_title"]
+    old_author = work_data["old_author"]
+
+    print(f"\n📝 Fixing: {source_id}")
+
+    # Get correct metadata from chunks file
+    correct_title, correct_author = get_correct_metadata_from_chunks(
+        output_dir, source_id
+    )
+
+    # The chunks file can carry the same placeholders; clean the title and
+    # refuse to write it back if it still looks like one.
+    if correct_title:
+        correct_title = clean_title(correct_title)
+
+    if not correct_title or is_bad_metadata(correct_title):
+        print(" ⚠️ Could not find correct metadata, skipping")
+        return False
+
+    if is_bad_metadata(correct_author or ""):
+        correct_author = None  # Better to leave empty than keep placeholder
+
+    print(f" Old title: {old_title[:60]}")
+    print(f" New title: {correct_title[:60]}")
+    print(f" Old author: {old_author[:60]}")
+    print(f" New author: {correct_author or 'None'}")
+
+    try:
+        # Chunks first: if any of them fails the Work keeps its bad title,
+        # so the next run finds it again and retries the remaining chunks.
+        matched, updated = _update_chunks(
+            chunk_collection, old_title, correct_title, correct_author
+        )
+        if updated < matched:
+            failed = matched - updated
+            print(f" ⚠️ {failed} chunks not updated, leaving Work unchanged")
+            return False
+
+        work_collection.data.update(
+            uuid=work_data["uuid"],
+            properties={
+                "title": correct_title,
+                "author": correct_author,
+            }
+        )
+    except Exception as e:
+        print(f" ❌ Failed to update Work: {e}")
+        return False
+
+    print(" ✅ Updated Work")
+    return True
+
+
+def fix_works_and_chunks() -> None:
+    """Fix Work titles and update associated chunks.
+
+    Connects to the local Weaviate instance, lists the Works whose title or
+    author matches BAD_PATTERNS, repairs each of them, and prints a summary.
+    The connection is closed even when an error escapes.
+    """
+    output_dir = Path(__file__).parent / "generations" / "library_rag" / "output"
+
+    print("🔧 Fixing Work titles with LLM placeholders...\n")
+
+    client = weaviate.connect_to_local()
+
+    try:
+        work_collection = client.collections.get("Work")
+        chunk_collection = client.collections.get("Chunk_v2")
+
+        print("📊 Scanning Works for placeholder patterns...\n")
+        works_to_fix = _find_works_to_fix(work_collection)
+
+        if not works_to_fix:
+            print("✅ No Works need fixing!")
+            return
+
+        print(f"\n🔍 Found {len(works_to_fix)} Works to fix\n")
+        print("=" * 70)
+
+        # Fix each Work
+        fixed_count = 0
+        failed_count = 0
+
+        for work_data in works_to_fix:
+            if _fix_single_work(
+                work_collection, chunk_collection, output_dir, work_data
+            ):
+                fixed_count += 1
+            else:
+                failed_count += 1
+
+        print("\n" + "=" * 70)
+        print(f"\n✅ Fixed {fixed_count} Works")
+        if failed_count > 0:
+            print(f"⚠️ Failed to fix {failed_count} Works")
+
+    finally:
+        client.close()
+
+
+if __name__ == "__main__":
+    fix_works_and_chunks()
+    print("\n✓ Done")
diff
--git a/generations/library_rag/utils/llm_metadata.py b/generations/library_rag/utils/llm_metadata.py index 4d84d6a..c7d1b64 100644 --- a/generations/library_rag/utils/llm_metadata.py +++ b/generations/library_rag/utils/llm_metadata.py @@ -215,30 +215,49 @@ INDICES POUR TROUVER LE VRAI TITRE: - Répété sur la page de garde et la page de titre - Peut contenir un sous-titre après ":" -IMPORTANT - FORMAT DES DONNÉES: -- N'ajoute JAMAIS d'annotations comme "(correct)", "(à confirmer)", "(possiblement)", etc. -- Retourne uniquement les noms propres et titres sans commentaires -- NE METS PAS de phrases comme "À confirmer avec...", "Vérifier si...", "Possiblement..." -- Le champ "confidence" sert à exprimer ton niveau de certitude -- Si tu n'es pas sûr du titre, mets le titre le plus probable ET un confidence faible -- EXEMPLE CORRECT: "title": "La pensée-signe" avec "confidence": {{"title": 0.6}} -- EXEMPLE INCORRECT: "title": "À confirmer avec le titre exact" +RÈGLES CRITIQUES - FORMAT DES DONNÉES: +❌ INTERDIT: N'ajoute JAMAIS d'annotations, commentaires ou instructions dans les valeurs +❌ INTERDIT: "(correct)", "(à confirmer)", "(possiblement)", "(ex:)", "(si c'est bien...)" +❌ INTERDIT: "À confirmer avec...", "Vérifier si...", "Possiblement...", "Titre corrigé..." 
+❌ INTERDIT: "Auteur à identifier", "Nom de l'auteur si disponible" -RÉPONDS UNIQUEMENT avec un JSON entre balises : +✅ OBLIGATOIRE: Retourne UNIQUEMENT le titre exact tel qu'il apparaît dans le document +✅ OBLIGATOIRE: Retourne UNIQUEMENT le nom de l'auteur tel qu'il apparaît +✅ Si incertain: utilise le champ "confidence" avec un score bas (0.3-0.6) +✅ Si vraiment introuvable: utilise null (pas de phrase descriptive) +EXEMPLE DE BONNE RÉPONSE (extrait d'un vrai livre): {{ - "title": "Le vrai titre de l'œuvre (avec sous-titre si présent)", - "author": "Prénom Nom de l'auteur principal", - "collection": "Nom de la collection ou série (null si absent)", - "publisher": "Nom de l'éditeur", - "year": 2023, - "doi": "10.xxxx/xxxxx (null si absent)", - "isbn": "978-x-xxxx-xxxx-x (null si absent)", + "title": "La technique et le temps: La faute d'Épiméthée", + "author": "Bernard Stiegler", + "collection": "Philosophie", + "publisher": "Éditions Galilée", + "year": 1994, + "doi": null, + "isbn": "978-2-7186-0489-8", "language": "fr", "confidence": {{ "title": 0.95, - "author": 0.90 + "author": 0.98 + }} +}} + + +EXEMPLE si métadonnées incertaines: + +{{ + "title": "Between Past and Future", + "author": "Hannah Arendt", + "collection": null, + "publisher": null, + "year": null, + "doi": null, + "isbn": null, + "language": "en", + "confidence": {{ + "title": 0.7, + "author": 0.85 }} }} @@ -246,7 +265,21 @@ RÉPONDS UNIQUEMENT avec un JSON entre balises : DOCUMENT À ANALYSER: {content} -Réponds UNIQUEMENT avec le JSON.""" +RAPPEL: Retourne UNIQUEMENT le JSON avec les valeurs EXACTES du document, SANS commentaires ni annotations. + + +{{ + "title": "", + "author": "", + "collection": null, + "publisher": null, + "year": null, + "doi": null, + "isbn": null, + "language": "fr", + "confidence": {{}} +}} +""" logger.info(f"Extraction métadonnées via {provider.upper()} ({model})")