fix: Correct Work titles and improve LLM metadata extraction

Fixes issue where LLM was copying placeholder instructions from the
prompt template into actual metadata fields.

Changes:
1. Created fix_work_titles.py script to correct existing bad titles
   - Detects patterns like "(si c'est bien...)", "Titre corrigé...", "Auteur à identifier"
   - Extracts correct metadata from chunks JSON files
   - Updates Work entries and associated chunks (44 chunks updated)
   - Fixed 3 Works with placeholder contamination

2. Improved llm_metadata.py prompt to prevent future issues
   - Added explicit INTERDIT/OBLIGATOIRE (forbidden/required) rules with distinct visual markers
   - Replaced placeholder examples with real concrete examples
   - Added two example responses (high confidence + low confidence)
   - Final empty JSON template guides structure without placeholders
   - Reinforced: use "confidence" field for uncertainty, not annotations

Results:
- "A Cartesian critique... (si c'est bien le titre)" → "A Cartesian critique of the artificial intelligence"
- "Titre corrigé si nécessaire (ex: ...)" → "Computationalism and The Case When the Brain Is Not a Computer"
- "Titre de l'article principal (à identifier)" → "Computationalism in the Philosophy of Mind"

All future document uploads will now extract clean metadata without
LLM commentary or placeholder instructions.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-08 23:59:25 +01:00
parent 0c3b6c5fea
commit 0c8ea8fa48
2 changed files with 276 additions and 18 deletions

225
fix_work_titles.py Normal file
View File

@@ -0,0 +1,225 @@
"""Fix Work titles that contain LLM placeholder instructions."""
import json
import sys
from pathlib import Path
from typing import Dict, Any, List, Tuple
# Fix Windows encoding
if sys.platform == "win32":
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))
import weaviate
import weaviate.classes.query as wvq
# Patterns indicating bad titles/authors (LLM placeholders)
BAD_PATTERNS = [
    "si c'est bien le titre",
    "à identifier",
    "à confirmer",
    "ex:",
    "Titre corrigé",
    "Auteur à identifier",
    "Nom de l'auteur",
    "(possiblement)",
    "(correct)",
]


def is_bad_metadata(text: str) -> bool:
    """Return True if *text* contains any known LLM placeholder pattern.

    Matching is case-insensitive; empty/None-ish text is considered clean.
    """
    if not text:
        return False
    text_lower = text.lower()
    return any(pattern.lower() in text_lower for pattern in BAD_PATTERNS)


def clean_title(title: str) -> str:
    """Extract a clean title from placeholder-contaminated text.

    Strategies, in order:
    1. If the text holds a quoted example — 'Titre corrigé (ex: "Real Title")'
       — extract the quoted string.
    2. Otherwise strip a trailing parenthetical annotation:
       'Title (si c'est bien le titre)' -> 'Title'.

    Returns the title unchanged when it is already clean or no strategy
    applies.

    BUGFIX: the quoted-example check must run BEFORE the parenthesis strip.
    The previous order split on "(" first, so 'Titre corrigé si nécessaire
    (ex: "Real Title")' returned the prefix 'Titre corrigé si nécessaire' —
    itself still a placeholder — and the ex: branch was unreachable.
    """
    import re

    if not is_bad_metadata(title):
        return title
    # Strategy 1: quoted example — 'ex: "Real Title"' or "ex: 'Real Title'"
    if "ex:" in title.lower():
        match = re.search(r'ex:\s*["\']([^"\']+)["\']', title, re.IGNORECASE)
        if match:
            return match.group(1)
    # Strategy 2: drop trailing annotation — "Title (si c'est bien...)"
    if "(" in title:
        clean = title.split("(")[0].strip()
        if clean:
            return clean
    return title
def get_correct_metadata_from_chunks(
output_dir: Path, source_id: str
) -> Tuple[str | None, str | None]:
"""Extract correct title/author from chunks JSON file.
Returns:
Tuple of (title, author) or (None, None) if not found.
"""
chunks_file = output_dir / source_id / f"{source_id}_chunks.json"
if not chunks_file.exists():
return None, None
try:
with open(chunks_file, 'r', encoding='utf-8') as f:
data = json.load(f)
metadata = data.get("metadata", {})
# Priority: work > original_title > title
title = (
metadata.get("work") or
metadata.get("original_title") or
metadata.get("title")
)
author = (
metadata.get("original_author") or
metadata.get("author")
)
return title, author
except Exception as e:
print(f" ⚠️ Error reading {chunks_file}: {e}")
return None, None
def fix_works_and_chunks() -> None:
    """Fix Work titles containing LLM placeholders, then sync their chunks.

    Pass 1 scans every object in the Weaviate "Work" collection and records
    those whose title or author matches a placeholder pattern. Pass 2
    replaces each bad entry with metadata recovered from the source's
    chunks JSON file, then rewrites workTitle/workAuthor on all "Chunk_v2"
    objects that still carry the old title.

    Side effects: mutates Weaviate data and prints a progress report.
    Requires a local Weaviate instance (connect_to_local).
    """
    # NOTE(review): assumes this script sits next to generations/library_rag —
    # same assumption as the sys.path insert at the top of the file.
    output_dir = Path(__file__).parent / "generations" / "library_rag" / "output"
    print("🔧 Fixing Work titles with LLM placeholders...\n")
    client = weaviate.connect_to_local()
    try:
        work_collection = client.collections.get("Work")
        chunk_collection = client.collections.get("Chunk_v2")
        # Find all Works with bad titles/authors
        works_to_fix: List[Dict[str, Any]] = []
        print("📊 Scanning Works for placeholder patterns...\n")
        for work in work_collection.iterator(include_vector=False):
            props = work.properties
            source_id = props.get("sourceId")
            title = props.get("title", "")
            author = props.get("author", "")
            # Without a sourceId there is no chunks file to recover from.
            if not source_id:
                continue
            needs_fix = is_bad_metadata(title) or is_bad_metadata(author)
            if needs_fix:
                works_to_fix.append({
                    "uuid": str(work.uuid),
                    "source_id": source_id,
                    "old_title": title,
                    "old_author": author,
                })
                print(f"❌ Found bad Work: {source_id}")
                print(f" Title: {title[:80]}")
                print(f" Author: {author[:80]}\n")
        if not works_to_fix:
            print("✅ No Works need fixing!")
            return
        print(f"\n🔍 Found {len(works_to_fix)} Works to fix\n")
        print("=" * 70)
        # Fix each Work
        fixed_count = 0
        failed_count = 0
        for work_data in works_to_fix:
            source_id = work_data["source_id"]
            work_uuid = work_data["uuid"]
            old_title = work_data["old_title"]
            old_author = work_data["old_author"]
            print(f"\n📝 Fixing: {source_id}")
            # Get correct metadata from chunks file
            correct_title, correct_author = get_correct_metadata_from_chunks(
                output_dir, source_id
            )
            # No recovered title means nothing safe to write; count as failed.
            if not correct_title:
                print(f" ⚠️ Could not find correct metadata, skipping")
                failed_count += 1
                continue
            # Clean title if still has placeholders
            if is_bad_metadata(correct_title):
                correct_title = clean_title(correct_title)
            if is_bad_metadata(correct_author or ""):
                correct_author = None  # Better to leave empty than keep placeholder
            print(f" Old title: {old_title[:60]}")
            print(f" New title: {correct_title[:60]}")
            print(f" Old author: {old_author[:60]}")
            print(f" New author: {correct_author or 'None'}")
            # Update Work
            try:
                work_collection.data.update(
                    uuid=work_uuid,
                    properties={
                        "title": correct_title,
                        "author": correct_author,
                    }
                )
                print(f" ✅ Updated Work")
                # Update associated chunks: chunks denormalize the Work title,
                # so find them by the OLD title before it disappears.
                # NOTE(review): limit=1000 caps the fetch — works with more
                # chunks than that would be partially updated; confirm the cap.
                chunks = chunk_collection.query.fetch_objects(
                    filters=wvq.Filter.by_property("workTitle").equal(old_title),
                    limit=1000
                )
                chunk_count = len(chunks.objects)
                if chunk_count > 0:
                    print(f" 🔄 Updating {chunk_count} chunks...")
                    for chunk in chunks.objects:
                        try:
                            # Per-chunk try/except: one bad chunk must not
                            # abort the rest of the batch.
                            chunk_collection.data.update(
                                uuid=str(chunk.uuid),
                                properties={
                                    "workTitle": correct_title,
                                    "workAuthor": correct_author,
                                }
                            )
                        except Exception as e:
                            print(f" ⚠️ Failed to update chunk {chunk.uuid}: {e}")
                    print(f" ✅ Updated {chunk_count} chunks")
                fixed_count += 1
            except Exception as e:
                print(f" ❌ Failed to update Work: {e}")
                failed_count += 1
        print("\n" + "=" * 70)
        print(f"\n✅ Fixed {fixed_count} Works")
        if failed_count > 0:
            print(f"⚠️ Failed to fix {failed_count} Works")
    finally:
        # Always release the Weaviate connection, even on scan/update errors.
        client.close()
if __name__ == "__main__":
fix_works_and_chunks()
print("\n✓ Done")