Fixes issue where LLM was copying placeholder instructions from the prompt template into actual metadata fields. Changes: 1. Created fix_work_titles.py script to correct existing bad titles - Detects patterns like "(si c'est bien...)", "Titre corrigé...", "Auteur à identifier" - Extracts correct metadata from chunks JSON files - Updates Work entries and associated chunks (44 chunks updated) - Fixed 3 Works with placeholder contamination 2. Improved llm_metadata.py prompt to prevent future issues - Added explicit INTERDIT/OBLIGATOIRE rules with ❌/✅ markers - Replaced placeholder examples with real concrete examples - Added two example responses (high confidence + low confidence) - Final empty JSON template guides structure without placeholders - Reinforced: use "confidence" field for uncertainty, not annotations Results: - "A Cartesian critique... (si c'est bien le titre)" → "A Cartesian critique of the artificial intelligence" - "Titre corrigé si nécessaire (ex: ...)" → "Computationalism and The Case When the Brain Is Not a Computer" - "Titre de l'article principal (à identifier)" → "Computationalism in the Philosophy of Mind" All future document uploads will now extract clean metadata without LLM commentary or placeholder instructions. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
226 lines
7.0 KiB
Python
226 lines
7.0 KiB
Python
"""Fix Work titles that contain LLM placeholder instructions."""
|
|
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Dict, Any, List, Tuple
|
|
|
|
# Fix Windows encoding: the default console codec (typically cp1252) cannot
# encode the emoji/accented characters this script prints, which would raise
# UnicodeEncodeError. Re-wrap stdout's raw buffer as UTF-8 instead.
if sys.platform == "win32":
    import io

    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Add the project package directory to sys.path so the weaviate imports below
# resolve against the bundled library_rag environment.
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))
|
|
|
|
import weaviate
|
|
import weaviate.classes.query as wvq
|
|
|
|
# Patterns indicating bad titles/authors (LLM placeholders).
# These are fragments of the (French) extraction prompt template that the LLM
# copied verbatim into metadata fields. Matched case-insensitively by
# is_bad_metadata().
BAD_PATTERNS = [
    "si c'est bien le titre",   # "if this really is the title"
    "à identifier",             # "to be identified"
    "à confirmer",              # "to be confirmed"
    "ex:",                      # example marker copied from the prompt
    "Titre corrigé",            # "corrected title"
    "Auteur à identifier",      # "author to be identified"
    "Nom de l'auteur",          # "the author's name"
    "(possiblement)",           # "(possibly)"
    "(correct)",
]
|
|
|
|
def is_bad_metadata(text: str) -> bool:
    """Return True when *text* contains a known LLM placeholder fragment.

    Matching is case-insensitive against BAD_PATTERNS; empty or falsy
    values are never flagged.
    """
    if not text:
        return False
    haystack = text.lower()
    for marker in BAD_PATTERNS:
        if marker.lower() in haystack:
            return True
    return False
|
|
|
|
def clean_title(title: str) -> str:
    """Extract a clean title from placeholder-contaminated text.

    Heuristics, in priority order:
      1. If the text is not contaminated at all, return it unchanged.
      2. A quoted title after "ex:" (e.g. ``ex: "Real Title"``) is the most
         reliable signal, so it is tried first — in strings like
         ``Titre corrigé si nécessaire (ex: "Real Title")`` everything before
         the "(" is itself a placeholder, and splitting there first would
         return the wrong part.
      3. Strip a trailing parenthesized annotation such as
         ``Title (si c'est bien le titre)`` — but only keep the prefix when
         it is no longer a placeholder itself.

    Returns the original text untouched when no heuristic produces a clean
    result; callers should treat that as "still contaminated".
    """
    import re

    if not is_bad_metadata(title):
        return title

    # Priority 2: quoted real title after "ex:".
    if "ex:" in title.lower():
        match = re.search(r'ex:\s*["\']([^"\']+)["\']', title, re.IGNORECASE)
        if match:
            return match.group(1)

    # Priority 3: drop a trailing "(...)" annotation, keeping the prefix
    # only if the prefix itself passes the placeholder check.
    if "(" in title:
        clean = title.split("(")[0].strip()
        if clean and not is_bad_metadata(clean):
            return clean

    return title
|
|
|
|
def get_correct_metadata_from_chunks(
|
|
output_dir: Path, source_id: str
|
|
) -> Tuple[str | None, str | None]:
|
|
"""Extract correct title/author from chunks JSON file.
|
|
|
|
Returns:
|
|
Tuple of (title, author) or (None, None) if not found.
|
|
"""
|
|
chunks_file = output_dir / source_id / f"{source_id}_chunks.json"
|
|
if not chunks_file.exists():
|
|
return None, None
|
|
|
|
try:
|
|
with open(chunks_file, 'r', encoding='utf-8') as f:
|
|
data = json.load(f)
|
|
|
|
metadata = data.get("metadata", {})
|
|
|
|
# Priority: work > original_title > title
|
|
title = (
|
|
metadata.get("work") or
|
|
metadata.get("original_title") or
|
|
metadata.get("title")
|
|
)
|
|
|
|
author = (
|
|
metadata.get("original_author") or
|
|
metadata.get("author")
|
|
)
|
|
|
|
return title, author
|
|
except Exception as e:
|
|
print(f" ⚠️ Error reading {chunks_file}: {e}")
|
|
return None, None
|
|
|
|
def fix_works_and_chunks():
    """Fix Work titles/authors that contain LLM placeholder text.

    Workflow (against a local Weaviate instance):
      1. Scan every object in the "Work" collection for titles or authors
         matching BAD_PATTERNS.
      2. For each contaminated Work, look up the correct metadata in the
         on-disk chunks JSON produced at ingestion time; clean it further
         with clean_title() if it is still contaminated.
      3. Update the Work object, then update every "Chunk_v2" object whose
         workTitle still carries the old (bad) title.

    Side effects: mutates Weaviate objects in place and prints a progress
    report; always closes the client connection.
    """
    # Chunks JSON files live next to the ingestion pipeline's output.
    output_dir = Path(__file__).parent / "generations" / "library_rag" / "output"

    print("🔧 Fixing Work titles with LLM placeholders...\n")

    client = weaviate.connect_to_local()

    try:
        work_collection = client.collections.get("Work")
        chunk_collection = client.collections.get("Chunk_v2")

        # Pass 1: collect all Works with bad titles/authors before mutating
        # anything, so iteration is not affected by updates.
        works_to_fix: List[Dict[str, Any]] = []

        print("📊 Scanning Works for placeholder patterns...\n")

        for work in work_collection.iterator(include_vector=False):
            props = work.properties
            source_id = props.get("sourceId")
            title = props.get("title", "")
            author = props.get("author", "")

            # Without a sourceId there is no chunks file to recover
            # metadata from, so such Works cannot be fixed here.
            if not source_id:
                continue

            needs_fix = is_bad_metadata(title) or is_bad_metadata(author)

            if needs_fix:
                works_to_fix.append({
                    "uuid": str(work.uuid),
                    "source_id": source_id,
                    "old_title": title,
                    "old_author": author,
                })
                print(f"❌ Found bad Work: {source_id}")
                print(f" Title: {title[:80]}")
                print(f" Author: {author[:80]}\n")

        if not works_to_fix:
            print("✅ No Works need fixing!")
            return

        print(f"\n🔍 Found {len(works_to_fix)} Works to fix\n")
        print("=" * 70)

        # Pass 2: fix each collected Work and its chunks.
        fixed_count = 0
        failed_count = 0

        for work_data in works_to_fix:
            source_id = work_data["source_id"]
            work_uuid = work_data["uuid"]
            old_title = work_data["old_title"]
            old_author = work_data["old_author"]

            print(f"\n📝 Fixing: {source_id}")

            # Authoritative metadata comes from the on-disk chunks JSON.
            correct_title, correct_author = get_correct_metadata_from_chunks(
                output_dir, source_id
            )

            if not correct_title:
                print(f" ⚠️ Could not find correct metadata, skipping")
                failed_count += 1
                continue

            # The chunks file itself may carry placeholder text; clean it.
            if is_bad_metadata(correct_title):
                correct_title = clean_title(correct_title)

            if is_bad_metadata(correct_author or ""):
                correct_author = None  # Better to leave empty than keep placeholder

            print(f" Old title: {old_title[:60]}")
            print(f" New title: {correct_title[:60]}")
            print(f" Old author: {old_author[:60]}")
            print(f" New author: {correct_author or 'None'}")

            # Update the Work object first; only on success do we touch
            # its chunks, so counts stay consistent.
            try:
                work_collection.data.update(
                    uuid=work_uuid,
                    properties={
                        "title": correct_title,
                        "author": correct_author,
                    }
                )
                print(f" ✅ Updated Work")

                # Chunks are linked by denormalized workTitle, so match on
                # the OLD title to find them. NOTE(review): limit=1000 caps
                # how many chunks one Work can have fixed per run.
                chunks = chunk_collection.query.fetch_objects(
                    filters=wvq.Filter.by_property("workTitle").equal(old_title),
                    limit=1000
                )

                chunk_count = len(chunks.objects)
                if chunk_count > 0:
                    print(f" 🔄 Updating {chunk_count} chunks...")

                    for chunk in chunks.objects:
                        # Per-chunk try/except: one failed chunk update
                        # should not abort the rest of this Work's chunks.
                        try:
                            chunk_collection.data.update(
                                uuid=str(chunk.uuid),
                                properties={
                                    "workTitle": correct_title,
                                    "workAuthor": correct_author,
                                }
                            )
                        except Exception as e:
                            print(f" ⚠️ Failed to update chunk {chunk.uuid}: {e}")

                    print(f" ✅ Updated {chunk_count} chunks")

                fixed_count += 1

            except Exception as e:
                print(f" ❌ Failed to update Work: {e}")
                failed_count += 1

        # Summary report.
        print("\n" + "=" * 70)
        print(f"\n✅ Fixed {fixed_count} Works")
        if failed_count > 0:
            print(f"⚠️ Failed to fix {failed_count} Works")

    finally:
        # Always release the Weaviate connection, even on error.
        client.close()
|
|
|
|
# Script entry point: run the one-shot fix against the local Weaviate.
if __name__ == "__main__":
    fix_works_and_chunks()
    print("\n✓ Done")
|