diff --git a/fix_work_titles.py b/fix_work_titles.py
new file mode 100644
index 0000000..982b3cb
--- /dev/null
+++ b/fix_work_titles.py
@@ -0,0 +1,225 @@
+"""Fix Work titles that contain LLM placeholder instructions."""
+
+import json
+import re
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+
+# Force UTF-8 console output on Windows so the emoji used in this script's
+# status messages can be printed.
+if sys.platform == "win32":
+    for _stream in (sys.stdout, sys.stderr):
+        # reconfigure() changes the encoding of the existing stream in place.
+        # Streams replaced by a capture/redirect object may not offer it (nor a
+        # ``.buffer`` to re-wrap), so those are left untouched.
+        if hasattr(_stream, "reconfigure"):
+            _stream.reconfigure(encoding="utf-8")
+
+# Put <script dir>/generations/library_rag at the front of the import path.
+sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))
+
+import weaviate
+import weaviate.classes.query as wvq
+
+# Substrings that reveal an LLM placeholder or annotation left inside a
+# title/author value (matched case-insensitively).
+BAD_PATTERNS = [
+    "si c'est bien le titre",
+    "à identifier",
+    "à confirmer",
+    "ex:",
+    "Titre corrigé",
+    "Auteur à identifier",
+    "Nom de l'auteur",
+    "(possiblement)",
+    "(correct)",
+]
+
+
+def _compile_bad_patterns(patterns: List[str]) -> "re.Pattern[str]":
+    """Build one case-insensitive regex that matches any of *patterns*."""
+    alternatives = []
+    for pattern in patterns:
+        escaped = re.escape(pattern)
+        # A pattern that begins with a letter must not start in the middle of
+        # a word; otherwise "ex:" flags legitimate titles such as
+        # "Index: ..." or "Codex: ...".
+        if pattern[:1].isalnum():
+            escaped = r"(?<!\w)" + escaped
+        alternatives.append(escaped)
+    return re.compile("|".join(alternatives), re.IGNORECASE)
+
+
+# Compiled once at import time; later mutations of BAD_PATTERNS are not
+# picked up.
+_BAD_PATTERN_RE = _compile_bad_patterns(BAD_PATTERNS)
+
+
+def is_bad_metadata(text: str) -> bool:
+    """Return True when *text* contains an LLM placeholder pattern.
+
+    Args:
+        text: A title or author value. Empty or ``None`` values are treated
+            as "not bad".
+
+    Returns:
+        True if any entry of ``BAD_PATTERNS`` occurs in *text*, ignoring case
+        and, for patterns starting with a letter, only at a word start.
+    """
+    if not text:
+        return False
+    return _BAD_PATTERN_RE.search(text) is not None
+
+def clean_title(title: str) -> str:
+    """Extract a clean title from text polluted by LLM placeholder notes.
+
+    Several extraction strategies are tried in order and the first result
+    that is non-empty and no longer flagged by ``is_bad_metadata`` wins.
+    If none of them yields a clean value, the best partial result is
+    returned, and as a last resort *title* itself.
+
+    Args:
+        title: The raw title value.
+
+    Returns:
+        *title* unchanged when it contains no placeholder; otherwise the
+        cleaned (or best-effort) title. The result may still contain a
+        placeholder, so callers that must not store one should re-check it.
+    """
+    if not is_bad_metadata(title):
+        return title
+
+    # Strategy 1: remove only the parenthesised groups that are themselves
+    # placeholders, so "Being and Time (1927) (à confirmer)" keeps "(1927)".
+    without_notes = re.sub(
+        r"\s*\([^()]*\)",
+        lambda m: "" if is_bad_metadata(m.group(0)) else m.group(0),
+        title,
+    ).strip()
+
+    # Strategy 2: everything before the first "(" - this also covers an
+    # unclosed parenthesis such as "Title (si c'est bien".
+    before_paren = title.split("(")[0].strip() if "(" in title else ""
+
+    # Strategy 3: a quoted example, e.g. ex: "Real Title". The closing quote
+    # must be the same character as the opening one and must not be followed
+    # by a letter, so apostrophes inside French titles ("L'être") do not end
+    # the match early.
+    quoted_match = re.search(
+        r'''ex:\s*(["'])(.+?)\1(?!\w)''', title, re.IGNORECASE
+    )
+    quoted = quoted_match.group(2).strip() if quoted_match else ""
+
+    for candidate in (without_notes, before_paren, quoted):
+        if candidate and not is_bad_metadata(candidate):
+            return candidate
+
+    # Nothing came out fully clean: fall back to the best partial result.
+    return before_paren or quoted or title
+
+def _first_text(metadata: Dict[str, Any], keys: Tuple[str, ...]) -> str | None:
+    """Return the first non-blank string found under *keys*, stripped."""
+    for key in keys:
+        value = metadata.get(key)
+        # Skip nulls, blanks and non-string values (numbers, lists, ...):
+        # callers apply string operations to the result.
+        if isinstance(value, str) and value.strip():
+            return value.strip()
+    return None
+
+
+def get_correct_metadata_from_chunks(
+    output_dir: Path, source_id: str
+) -> Tuple[str | None, str | None]:
+    """Extract the title and author from a source's chunks JSON file.
+
+    The file read is ``<output_dir>/<source_id>/<source_id>_chunks.json`` and
+    the values come from its top-level ``"metadata"`` object.
+
+    Args:
+        output_dir: Directory that contains one sub-directory per source.
+        source_id: Identifier of the source; names both the sub-directory
+            and the file.
+
+    Returns:
+        Tuple of (title, author). Each element is a stripped, non-empty
+        string or None. (None, None) is returned when the file is missing,
+        unreadable, not valid JSON, or has no ``"metadata"`` object.
+    """
+    chunks_file = output_dir / source_id / f"{source_id}_chunks.json"
+
+    try:
+        with open(chunks_file, encoding="utf-8") as f:
+            data = json.load(f)
+    except (FileNotFoundError, NotADirectoryError):
+        # A missing file is an expected situation, not an error to report.
+        return None, None
+    except (OSError, ValueError) as e:
+        # ValueError covers both malformed JSON and undecodable bytes.
+        print(f" ⚠️ Error reading {chunks_file}: {e}")
+        return None, None
+
+    metadata = data.get("metadata") if isinstance(data, dict) else None
+    if not isinstance(metadata, dict):
+        return None, None
+
+    # Priority: work > original_title > title
+    title = _first_text(metadata, ("work", "original_title", "title"))
+    # Priority: original_author > author
+    author = _first_text(metadata, ("original_author", "author"))
+
+    return title, author
+
+def _find_works_to_fix(work_collection: Any) -> List[Dict[str, Any]]:
+    """Scan all Works and return those whose title or author is a placeholder."""
+    works_to_fix: List[Dict[str, Any]] = []
+
+    for work in work_collection.iterator(include_vector=False):
+        props = work.properties
+        source_id = props.get("sourceId")
+        # Normalise a null/missing value to "" so the slicing below is safe.
+        title = props.get("title") or ""
+        author = props.get("author") or ""
+
+        if not source_id:
+            continue
+        if not (is_bad_metadata(title) or is_bad_metadata(author)):
+            continue
+
+        works_to_fix.append({
+            "uuid": str(work.uuid),
+            "source_id": source_id,
+            "old_title": title,
+            "old_author": author,
+        })
+        print(f"❌ Found bad Work: {source_id}")
+        print(f" Title: {title[:80]}")
+        print(f" Author: {author[:80]}\n")
+
+    return works_to_fix
+
+
+def _collect_chunk_uuids(
+    chunk_collection: Any, work_title: str, page_size: int = 1000
+) -> List[str]:
+    """Return the UUIDs of every chunk whose ``workTitle`` equals *work_title*.
+
+    All pages are collected before anything is modified, so updating
+    ``workTitle`` afterwards cannot shift the offset-based pagination.
+    """
+    uuids: List[str] = []
+    while True:
+        try:
+            page = chunk_collection.query.fetch_objects(
+                filters=wvq.Filter.by_property("workTitle").equal(work_title),
+                limit=page_size,
+                offset=len(uuids),
+            )
+        except Exception as e:
+            # NOTE(review): the server presumably caps offset + limit
+            # (QUERY_MAXIMUM_RESULTS); keep what was listed so far - confirm.
+            print(f" ⚠️ Failed to list chunks after {len(uuids)}: {e}")
+            break
+        uuids.extend(str(obj.uuid) for obj in page.objects)
+        if len(page.objects) < page_size:
+            break
+    return uuids
+
+
+def _update_chunks(
+    chunk_collection: Any,
+    old_title: str,
+    new_title: str,
+    new_author: str | None,
+) -> Tuple[int, int]:
+    """Rewrite title/author on the chunks of one Work.
+
+    Returns:
+        Tuple of (chunks successfully updated, chunks found).
+    """
+    # NOTE(review): chunks are matched by title only. If several Works share
+    # the same placeholder title, their chunks are indistinguishable here and
+    # all receive the first Work's metadata - verify whether Chunk_v2 offers a
+    # per-source property to filter on instead.
+    chunk_uuids = _collect_chunk_uuids(chunk_collection, old_title)
+    updated = 0
+    for chunk_uuid in chunk_uuids:
+        try:
+            chunk_collection.data.update(
+                uuid=chunk_uuid,
+                properties={
+                    "workTitle": new_title,
+                    "workAuthor": new_author,
+                },
+            )
+            updated += 1
+        except Exception as e:
+            print(f" ⚠️ Failed to update chunk {chunk_uuid}: {e}")
+    return updated, len(chunk_uuids)
+
+
+def fix_works_and_chunks() -> None:
+    """Fix Work titles/authors that contain LLM placeholders, plus their chunks.
+
+    Connects to the local Weaviate instance, finds every ``Work`` whose title
+    or author matches a placeholder pattern, looks up the correct values in
+    the source's chunks JSON file under ``generations/library_rag/output``,
+    then updates the Work and every ``Chunk_v2`` object carrying the old
+    title. A Work is skipped (and counted as failed) when no usable title is
+    found or when the title is still a placeholder after cleaning. Progress
+    is reported on stdout; the connection is always closed.
+    """
+    output_dir = Path(__file__).parent / "generations" / "library_rag" / "output"
+
+    print("🔧 Fixing Work titles with LLM placeholders...\n")
+
+    client = weaviate.connect_to_local()
+
+    try:
+        work_collection = client.collections.get("Work")
+        chunk_collection = client.collections.get("Chunk_v2")
+
+        print("📊 Scanning Works for placeholder patterns...\n")
+        works_to_fix = _find_works_to_fix(work_collection)
+
+        if not works_to_fix:
+            print("✅ No Works need fixing!")
+            return
+
+        print(f"\n🔍 Found {len(works_to_fix)} Works to fix\n")
+        print("=" * 70)
+
+        fixed_count = 0
+        failed_count = 0
+
+        for work_data in works_to_fix:
+            source_id = work_data["source_id"]
+            work_uuid = work_data["uuid"]
+            old_title = work_data["old_title"]
+            old_author = work_data["old_author"]
+
+            print(f"\n📝 Fixing: {source_id}")
+
+            correct_title, correct_author = get_correct_metadata_from_chunks(
+                output_dir, source_id
+            )
+
+            if not isinstance(correct_title, str) or not correct_title.strip():
+                print(" ⚠️ Could not find correct metadata, skipping")
+                failed_count += 1
+                continue
+
+            # The chunks file can carry the same placeholder; try to clean it
+            # and refuse to write a value that is still a placeholder.
+            if is_bad_metadata(correct_title):
+                correct_title = clean_title(correct_title)
+                if is_bad_metadata(correct_title):
+                    print(" ⚠️ Title is still a placeholder after cleaning, skipping")
+                    failed_count += 1
+                    continue
+
+            if not isinstance(correct_author, str) or is_bad_metadata(correct_author):
+                correct_author = None  # Better to leave empty than keep placeholder
+
+            print(f" Old title: {old_title[:60]}")
+            print(f" New title: {correct_title[:60]}")
+            print(f" Old author: {old_author[:60]}")
+            print(f" New author: {correct_author or 'None'}")
+
+            try:
+                work_collection.data.update(
+                    uuid=work_uuid,
+                    properties={
+                        "title": correct_title,
+                        "author": correct_author,
+                    },
+                )
+            except Exception as e:
+                print(f" ❌ Failed to update Work: {e}")
+                failed_count += 1
+                continue
+
+            print(" ✅ Updated Work")
+            fixed_count += 1
+
+            # With no old title there is nothing reliable to match chunks on.
+            if old_title:
+                updated, total = _update_chunks(
+                    chunk_collection, old_title, correct_title, correct_author
+                )
+                if total > 0:
+                    print(f" ✅ Updated {updated}/{total} chunks")
+
+        print("\n" + "=" * 70)
+        print(f"\n✅ Fixed {fixed_count} Works")
+        if failed_count > 0:
+            print(f"⚠️ Failed to fix {failed_count} Works")
+
+    finally:
+        client.close()
+
+def _main() -> None:
+    """Run the repair pass, then print the completion marker."""
+    fix_works_and_chunks()
+    print("\n✓ Done")
+
+
+# Only run the repair when executed as a script, not when imported.
+if __name__ == "__main__":
+    _main()
diff --git a/generations/library_rag/utils/llm_metadata.py b/generations/library_rag/utils/llm_metadata.py
index 4d84d6a..c7d1b64 100644
--- a/generations/library_rag/utils/llm_metadata.py
+++ b/generations/library_rag/utils/llm_metadata.py
@@ -215,30 +215,49 @@ INDICES POUR TROUVER LE VRAI TITRE:
- Répété sur la page de garde et la page de titre
- Peut contenir un sous-titre après ":"
-IMPORTANT - FORMAT DES DONNÉES:
-- N'ajoute JAMAIS d'annotations comme "(correct)", "(à confirmer)", "(possiblement)", etc.
-- Retourne uniquement les noms propres et titres sans commentaires
-- NE METS PAS de phrases comme "À confirmer avec...", "Vérifier si...", "Possiblement..."
-- Le champ "confidence" sert à exprimer ton niveau de certitude
-- Si tu n'es pas sûr du titre, mets le titre le plus probable ET un confidence faible
-- EXEMPLE CORRECT: "title": "La pensée-signe" avec "confidence": {{"title": 0.6}}
-- EXEMPLE INCORRECT: "title": "À confirmer avec le titre exact"
+RÈGLES CRITIQUES - FORMAT DES DONNÉES:
+❌ INTERDIT: N'ajoute JAMAIS d'annotations, commentaires ou instructions dans les valeurs
+❌ INTERDIT: "(correct)", "(à confirmer)", "(possiblement)", "(ex:)", "(si c'est bien...)"
+❌ INTERDIT: "À confirmer avec...", "Vérifier si...", "Possiblement...", "Titre corrigé..."
+❌ INTERDIT: "Auteur à identifier", "Nom de l'auteur si disponible"
-RÉPONDS UNIQUEMENT avec un JSON entre balises :
+✅ OBLIGATOIRE: Retourne UNIQUEMENT le titre exact tel qu'il apparaît dans le document
+✅ OBLIGATOIRE: Retourne UNIQUEMENT le nom de l'auteur tel qu'il apparaît
+✅ Si incertain: utilise le champ "confidence" avec un score bas (0.3-0.6)
+✅ Si vraiment introuvable: utilise null (pas de phrase descriptive)
+EXEMPLE DE BONNE RÉPONSE (extrait d'un vrai livre):
{{
- "title": "Le vrai titre de l'œuvre (avec sous-titre si présent)",
- "author": "Prénom Nom de l'auteur principal",
- "collection": "Nom de la collection ou série (null si absent)",
- "publisher": "Nom de l'éditeur",
- "year": 2023,
- "doi": "10.xxxx/xxxxx (null si absent)",
- "isbn": "978-x-xxxx-xxxx-x (null si absent)",
+ "title": "La technique et le temps: La faute d'Épiméthée",
+ "author": "Bernard Stiegler",
+ "collection": "Philosophie",
+ "publisher": "Éditions Galilée",
+ "year": 1994,
+ "doi": null,
+ "isbn": "978-2-7186-0489-8",
"language": "fr",
"confidence": {{
"title": 0.95,
- "author": 0.90
+ "author": 0.98
+ }}
+}}
+
+
+EXEMPLE si métadonnées incertaines:
+
+{{
+ "title": "Between Past and Future",
+ "author": "Hannah Arendt",
+ "collection": null,
+ "publisher": null,
+ "year": null,
+ "doi": null,
+ "isbn": null,
+ "language": "en",
+ "confidence": {{
+ "title": 0.7,
+ "author": 0.85
}}
}}
@@ -246,7 +265,21 @@ RÉPONDS UNIQUEMENT avec un JSON entre balises :
DOCUMENT À ANALYSER:
{content}
-Réponds UNIQUEMENT avec le JSON."""
+RAPPEL: Retourne UNIQUEMENT le JSON avec les valeurs EXACTES du document, SANS commentaires ni annotations.
+
+
+{{
+ "title": "",
+ "author": "",
+ "collection": null,
+ "publisher": null,
+ "year": null,
+ "doi": null,
+ "isbn": null,
+ "language": "fr",
+ "confidence": {{}}
+}}
+"""
logger.info(f"Extraction métadonnées via {provider.upper()} ({model})")