# NOTE(review): the text below is a commit message (plus file-viewer metadata)
# that was accidentally pasted at the top of this script; it is preserved here
# as comments so the module remains valid, importable Python.
#
# BREAKING CHANGE: Document collection removed from Weaviate schema Architecture simplification: - Removed Document collection (unused by Flask app) - All metadata now in Work collection or file-based (chunks.json) - Simplified from 4 collections to 3 (Work, Chunk_v2, Summary_v2) Schema changes (schema.py): - Removed create_document_collection() function - Updated verify_schema() to expect 3 collections - Updated display_schema() and print_summary() - Updated documentation to reflect Chunk_v2/Summary_v2 Ingestion changes (weaviate_ingest.py): - Removed ingest_document_metadata() function - Removed ingest_document_collection parameter - Updated IngestResult to use work_uuid instead of document_uuid - Removed Document deletion from delete_document_chunks() - Updated DeleteResult TypedDict Type changes (types.py): - WeaviateIngestResult: document_uuid → work_uuid Documentation updates (.claude/CLAUDE.md): - Updated schema diagram (4 → 3 collections) - Removed Document references - Updated to reflect manual GPU vectorization Database changes: - Deleted Document collection (13 objects) - Deleted Chunk collection (0 objects, old schema) Benefits: - Simpler architecture (3 collections vs 4) - No redundant data storage - All metadata available via Work or file-based storage - Reduced Weaviate memory footprint Migration: - See DOCUMENT_COLLECTION_ANALYSIS.md for detailed analysis - See migrate_chunk_v2_to_none_vectorizer.py for vectorizer migration Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
# 123 lines / 3.9 KiB / Python
"""
Fix Turings_Machines ingestion with corrected metadata.

The LLM returned prompt instructions instead of actual metadata.

This script:
1. Loads chunks from Turings_Machines_chunks.json
2. Corrects workTitle and workAuthor
3. Re-ingests to Weaviate with GPU embedder
"""
|
|
|
|
import json
import sys
from pathlib import Path

# Put this script's own directory at the front of the module search path so
# the local ``utils`` package resolves when the file is run directly.
script_dir = Path(__file__).parent.absolute()
sys.path.insert(0, str(script_dir))

# Deliberately imported *after* the sys.path tweak above.
import utils.weaviate_ingest as weaviate_ingest
|
|
|
|
def prepare_chunks_for_ingestion(chunks):
    """Map raw chunk dicts onto the flat schema expected by ingest_document.

    Missing optional fields fall back to the same defaults the original
    inline loop used (empty section path, level 1, "main_content", no
    keywords). ``orderIndex`` preserves the original chunk order.

    Args:
        chunks: List of chunk dicts; each must have at least a "text" key.

    Returns:
        A new list of dicts in the ingestion schema.
    """
    prepared = []
    for i, chunk in enumerate(chunks):
        prepared.append({
            "text": chunk["text"],
            "sectionPath": chunk.get("section", ""),
            "sectionLevel": chunk.get("section_level", 1),
            "chapterTitle": "",
            "canonicalReference": "",
            "unitType": chunk.get("type", "main_content"),
            "keywords": chunk.get("concepts", []),
            "language": "en",
            "orderIndex": i,
        })
    return prepared


def fix_turings_machines():
    """Fix and re-ingest Turings_Machines with corrected metadata.

    Steps:
      1. Load output/Turings_Machines/Turings_Machines_chunks.json.
      2. Replace the bad LLM-produced title/author/year on the document
         metadata and on every chunk with known-correct values.
      3. Re-ingest all chunks into Weaviate via weaviate_ingest.
      4. Save a ``*_corrected.json`` copy next to the original file
         (regardless of ingestion outcome, so the fix is not lost).

    Returns:
        The result dict from ``weaviate_ingest.ingest_document``, or
        ``None`` when the chunks file is missing.
    """
    chunks_file = Path("output/Turings_Machines/Turings_Machines_chunks.json")

    if not chunks_file.exists():
        print(f"ERROR: File not found: {chunks_file}")
        # Explicit None: the __main__ caller checks `result and result.get(...)`.
        return None

    with open(chunks_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    print("Loaded chunks JSON")
    print(f" - Chunks: {len(data.get('chunks', []))}")
    print(f" - Current title: {data.get('metadata', {}).get('title', 'N/A')[:80]}")
    print(f" - Current author: {data.get('metadata', {}).get('author', 'N/A')[:80]}")

    # Known-correct metadata (the LLM had echoed prompt instructions instead).
    corrected_metadata = {
        "title": "Turing's Machines",
        "author": "Dorian Wiszniewski, Richard Coyne, Christopher Pierce",
        "year": 2000,  # Approximate - from references (Coyne 1999, etc.)
        "language": "en"
    }
    data["metadata"] = corrected_metadata

    # Propagate the corrections into every chunk record as well.
    for chunk in data.get("chunks", []):
        chunk["workTitle"] = corrected_metadata["title"]
        chunk["workAuthor"] = corrected_metadata["author"]
        chunk["year"] = corrected_metadata["year"]

    print("\nCorrected metadata:")
    print(f" - Title: {corrected_metadata['title']}")
    print(f" - Author: {corrected_metadata['author']}")
    print(f" - Year: {corrected_metadata['year']}")

    # Reshape chunks into the format expected by ingest_document.
    chunks_for_ingestion = prepare_chunks_for_ingestion(data.get("chunks", []))
    print(f"\nPrepared {len(chunks_for_ingestion)} chunks for ingestion")

    # Re-ingest to Weaviate.
    print("\nStarting re-ingestion with GPU embedder...")
    result = weaviate_ingest.ingest_document(
        doc_name="Turings_Machines",
        chunks=chunks_for_ingestion,
        metadata=corrected_metadata,
        language="en"
    )

    if result.get("success"):
        print("\nRe-ingestion successful!")
        print(f" - Chunks inserted: {result.get('count', 0)}")
        print(f" - Work UUID: {result.get('work_uuid', 'N/A')}")
    else:
        print("\nRe-ingestion failed!")
        print(f" - Error: {result.get('error', 'Unknown')}")

    # Save the corrected chunks JSON even on ingestion failure so the
    # metadata fix can be re-used without repeating the correction step.
    corrected_file = chunks_file.parent / f"{chunks_file.stem}_corrected.json"
    with open(corrected_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"\nSaved corrected chunks to: {corrected_file}")

    return result
|
|
|
|
if __name__ == "__main__":
    # Header banner.
    banner = "=" * 70
    print(banner)
    print("Fix Turings_Machines Ingestion")
    print(banner)

    outcome = fix_turings_machines()
    succeeded = bool(outcome) and outcome.get("success")

    # Footer banner + process exit code reflecting the result.
    print("\n" + banner)
    print("FIX COMPLETED SUCCESSFULLY" if succeeded else "FIX FAILED")
    print(banner)
    sys.exit(0 if succeeded else 1)
|