feat: Remove Document collection from schema
BREAKING CHANGE: Document collection removed from Weaviate schema

Architecture simplification:
- Removed Document collection (unused by Flask app)
- All metadata now in Work collection or file-based (chunks.json)
- Simplified from 4 collections to 3 (Work, Chunk_v2, Summary_v2)

Schema changes (schema.py):
- Removed create_document_collection() function
- Updated verify_schema() to expect 3 collections
- Updated display_schema() and print_summary()
- Updated documentation to reflect Chunk_v2/Summary_v2

Ingestion changes (weaviate_ingest.py):
- Removed ingest_document_metadata() function
- Removed ingest_document_collection parameter
- Updated IngestResult to use work_uuid instead of document_uuid
- Removed Document deletion from delete_document_chunks()
- Updated DeleteResult TypedDict

Type changes (types.py):
- WeaviateIngestResult: document_uuid → work_uuid

Documentation updates (.claude/CLAUDE.md):
- Updated schema diagram (4 → 3 collections)
- Removed Document references
- Updated to reflect manual GPU vectorization

Database changes:
- Deleted Document collection (13 objects)
- Deleted Chunk collection (0 objects, old schema)

Benefits:
- Simpler architecture (3 collections vs 4)
- No redundant data storage
- All metadata available via Work or file-based storage
- Reduced Weaviate memory footprint

Migration:
- See DOCUMENT_COLLECTION_ANALYSIS.md for detailed analysis
- See migrate_chunk_v2_to_none_vectorizer.py for vectorizer migration

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
122
generations/library_rag/fix_turings_machines.py
Normal file
122
generations/library_rag/fix_turings_machines.py
Normal file
@@ -0,0 +1,122 @@
"""Fix Turings_Machines ingestion with corrected metadata.

The LLM returned prompt instructions instead of actual metadata.

This script:
1. Loads chunks from Turings_Machines_chunks.json
2. Corrects workTitle and workAuthor
3. Re-ingests to Weaviate with GPU embedder
"""

import json
import sys
from pathlib import Path

# Add this script's directory to sys.path so the local `utils` package
# resolves regardless of the caller's working directory.
current_dir = Path(__file__).parent.absolute()
sys.path.insert(0, str(current_dir))

# Now import can work (only after the sys.path tweak above).
import utils.weaviate_ingest as weaviate_ingest
||||
def fix_turings_machines():
    """Fix and re-ingest Turings_Machines with corrected metadata.

    Loads the chunked JSON, overwrites the bad LLM-produced metadata with
    known-good values, re-ingests all chunks to Weaviate, and writes a
    corrected copy of the JSON next to the original.

    Returns:
        The result dict from weaviate_ingest.ingest_document, or None when
        the source chunks file does not exist.
    """
    chunks_file = Path("output/Turings_Machines/Turings_Machines_chunks.json")

    if not chunks_file.exists():
        print(f"ERROR: File not found: {chunks_file}")
        return None

    with open(chunks_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    print("Loaded chunks JSON")
    print(f" - Chunks: {len(data.get('chunks', []))}")
    print(f" - Current title: {data.get('metadata', {}).get('title', 'N/A')[:80]}")
    print(f" - Current author: {data.get('metadata', {}).get('author', 'N/A')[:80]}")

    # Known-good metadata; the original LLM pass returned prompt text instead.
    corrected_metadata = {
        "title": "Turing's Machines",
        "author": "Dorian Wiszniewski, Richard Coyne, Christopher Pierce",
        "year": 2000,  # Approximate - from references (Coyne 1999, etc.)
        "language": "en",
    }

    # Replace document-level metadata and propagate it into every chunk.
    data["metadata"] = corrected_metadata
    for chunk in data.get("chunks", []):
        chunk["workTitle"] = corrected_metadata["title"]
        chunk["workAuthor"] = corrected_metadata["author"]
        chunk["year"] = corrected_metadata["year"]

    print("\nCorrected metadata:")
    print(f" - Title: {corrected_metadata['title']}")
    print(f" - Author: {corrected_metadata['author']}")
    print(f" - Year: {corrected_metadata['year']}")

    # Prepare chunks for ingestion (format expected by ingest_document).
    chunks_for_ingestion = _prepare_chunks(data)
    print(f"\nPrepared {len(chunks_for_ingestion)} chunks for ingestion")

    print("\nStarting re-ingestion with GPU embedder...")
    result = weaviate_ingest.ingest_document(
        doc_name="Turings_Machines",
        chunks=chunks_for_ingestion,
        metadata=corrected_metadata,
        language="en"
    )

    if result.get("success"):
        print("\nRe-ingestion successful!")
        print(f" - Chunks inserted: {result.get('count', 0)}")
        print(f" - Work UUID: {result.get('work_uuid', 'N/A')}")
    else:
        print("\nRe-ingestion failed!")
        print(f" - Error: {result.get('error', 'Unknown')}")

    # Keep a corrected copy so the fix is reproducible without re-running
    # the metadata correction step.
    corrected_file = chunks_file.parent / f"{chunks_file.stem}_corrected.json"
    with open(corrected_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"\nSaved corrected chunks to: {corrected_file}")

    return result


def _prepare_chunks(data):
    """Map raw chunk dicts into the flat format expected by ingest_document."""
    return [
        {
            "text": chunk["text"],
            "sectionPath": chunk.get("section", ""),
            "sectionLevel": chunk.get("section_level", 1),
            "chapterTitle": "",
            "canonicalReference": "",
            "unitType": chunk.get("type", "main_content"),
            "keywords": chunk.get("concepts", []),
            "language": "en",
            "orderIndex": i,
        }
        for i, chunk in enumerate(data.get("chunks", []))
    ]
||||
if __name__ == "__main__":
    # Script entry point: run the fix and exit with a shell-friendly status.
    banner = "=" * 70
    print(banner)
    print("Fix Turings_Machines Ingestion")
    print(banner)

    result = fix_turings_machines()

    print("\n" + banner)
    if result and result.get("success"):
        print("FIX COMPLETED SUCCESSFULLY")
        print(banner)
        sys.exit(0)
    else:
        print("FIX FAILED")
        print(banner)
        sys.exit(1)
Reference in New Issue
Block a user