# NOTE(review): the text below is a commit message (plus file-viewer metadata)
# that was accidentally pasted at the top of this script; it is preserved here
# as comments so the module remains valid, importable Python.
#
# BREAKING CHANGE: Document collection removed from Weaviate schema Architecture simplification: - Removed Document collection (unused by Flask app) - All metadata now in Work collection or file-based (chunks.json) - Simplified from 4 collections to 3 (Work, Chunk_v2, Summary_v2) Schema changes (schema.py): - Removed create_document_collection() function - Updated verify_schema() to expect 3 collections - Updated display_schema() and print_summary() - Updated documentation to reflect Chunk_v2/Summary_v2 Ingestion changes (weaviate_ingest.py): - Removed ingest_document_metadata() function - Removed ingest_document_collection parameter - Updated IngestResult to use work_uuid instead of document_uuid - Removed Document deletion from delete_document_chunks() - Updated DeleteResult TypedDict Type changes (types.py): - WeaviateIngestResult: document_uuid → work_uuid Documentation updates (.claude/CLAUDE.md): - Updated schema diagram (4 → 3 collections) - Removed Document references - Updated to reflect manual GPU vectorization Database changes: - Deleted Document collection (13 objects) - Deleted Chunk collection (0 objects, old schema) Benefits: - Simpler architecture (3 collections vs 4) - No redundant data storage - All metadata available via Work or file-based storage - Reduced Weaviate memory footprint Migration: - See DOCUMENT_COLLECTION_ANALYSIS.md for detailed analysis - See migrate_chunk_v2_to_none_vectorizer.py for vectorizer migration Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
# 123 lines / 3.9 KiB / Python
"""
Fix Turings_Machines ingestion with corrected metadata.

The LLM returned prompt instructions instead of actual metadata.

This script:
1. Loads chunks from Turings_Machines_chunks.json
2. Corrects workTitle and workAuthor
3. Re-ingests to Weaviate with GPU embedder
"""
|
|
|
|
import json
import sys
from pathlib import Path

# Put this script's own directory at the front of the module search path so
# the local ``utils`` package resolves when the file is run directly.
script_dir = Path(__file__).parent.absolute()
sys.path.insert(0, str(script_dir))

# Deliberately imported *after* the sys.path tweak above.
import utils.weaviate_ingest as weaviate_ingest
|
|
|
|
def prepare_chunks_for_ingestion(chunks):
    """Map raw chunk dicts onto the flat schema expected by ingest_document.

    Missing optional fields fall back to the same defaults the original
    inline loop used (empty section path, level 1, "main_content", no
    keywords). ``orderIndex`` preserves the original chunk order.

    Args:
        chunks: List of chunk dicts; each must have at least a "text" key.

    Returns:
        A new list of dicts in the ingestion schema.
    """
    prepared = []
    for i, chunk in enumerate(chunks):
        prepared.append({
            "text": chunk["text"],
            "sectionPath": chunk.get("section", ""),
            "sectionLevel": chunk.get("section_level", 1),
            "chapterTitle": "",
            "canonicalReference": "",
            "unitType": chunk.get("type", "main_content"),
            "keywords": chunk.get("concepts", []),
            "language": "en",
            "orderIndex": i,
        })
    return prepared


def fix_turings_machines():
    """Fix and re-ingest Turings_Machines with corrected metadata.

    Steps:
      1. Load output/Turings_Machines/Turings_Machines_chunks.json.
      2. Replace the bad LLM-produced title/author/year on the document
         metadata and on every chunk with known-correct values.
      3. Re-ingest all chunks into Weaviate via weaviate_ingest.
      4. Save a ``*_corrected.json`` copy next to the original file
         (regardless of ingestion outcome, so the fix is not lost).

    Returns:
        The result dict from ``weaviate_ingest.ingest_document``, or
        ``None`` when the chunks file is missing.
    """
    chunks_file = Path("output/Turings_Machines/Turings_Machines_chunks.json")

    if not chunks_file.exists():
        print(f"ERROR: File not found: {chunks_file}")
        # Explicit None: the __main__ caller checks `result and result.get(...)`.
        return None

    with open(chunks_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    print("Loaded chunks JSON")
    print(f" - Chunks: {len(data.get('chunks', []))}")
    print(f" - Current title: {data.get('metadata', {}).get('title', 'N/A')[:80]}")
    print(f" - Current author: {data.get('metadata', {}).get('author', 'N/A')[:80]}")

    # Known-correct metadata (the LLM had echoed prompt instructions instead).
    corrected_metadata = {
        "title": "Turing's Machines",
        "author": "Dorian Wiszniewski, Richard Coyne, Christopher Pierce",
        "year": 2000,  # Approximate - from references (Coyne 1999, etc.)
        "language": "en"
    }
    data["metadata"] = corrected_metadata

    # Propagate the corrections into every chunk record as well.
    for chunk in data.get("chunks", []):
        chunk["workTitle"] = corrected_metadata["title"]
        chunk["workAuthor"] = corrected_metadata["author"]
        chunk["year"] = corrected_metadata["year"]

    print("\nCorrected metadata:")
    print(f" - Title: {corrected_metadata['title']}")
    print(f" - Author: {corrected_metadata['author']}")
    print(f" - Year: {corrected_metadata['year']}")

    # Reshape chunks into the format expected by ingest_document.
    chunks_for_ingestion = prepare_chunks_for_ingestion(data.get("chunks", []))
    print(f"\nPrepared {len(chunks_for_ingestion)} chunks for ingestion")

    # Re-ingest to Weaviate.
    print("\nStarting re-ingestion with GPU embedder...")
    result = weaviate_ingest.ingest_document(
        doc_name="Turings_Machines",
        chunks=chunks_for_ingestion,
        metadata=corrected_metadata,
        language="en"
    )

    if result.get("success"):
        print("\nRe-ingestion successful!")
        print(f" - Chunks inserted: {result.get('count', 0)}")
        print(f" - Work UUID: {result.get('work_uuid', 'N/A')}")
    else:
        print("\nRe-ingestion failed!")
        print(f" - Error: {result.get('error', 'Unknown')}")

    # Save the corrected chunks JSON even on ingestion failure so the
    # metadata fix can be re-used without repeating the correction step.
    corrected_file = chunks_file.parent / f"{chunks_file.stem}_corrected.json"
    with open(corrected_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"\nSaved corrected chunks to: {corrected_file}")

    return result
|
|
|
|
if __name__ == "__main__":
    # Header banner.
    banner = "=" * 70
    print(banner)
    print("Fix Turings_Machines Ingestion")
    print(banner)

    outcome = fix_turings_machines()
    succeeded = bool(outcome) and outcome.get("success")

    # Footer banner + process exit code reflecting the result.
    print("\n" + banner)
    print("FIX COMPLETED SUCCESSFULLY" if succeeded else "FIX FAILED")
    print(banner)
    sys.exit(0 if succeeded else 1)
|