feat: Remove Document collection from schema
BREAKING CHANGE: Document collection removed from Weaviate schema

Architecture simplification:
- Removed Document collection (unused by Flask app)
- All metadata now in Work collection or file-based (chunks.json)
- Simplified from 4 collections to 3 (Work, Chunk_v2, Summary_v2)

Schema changes (schema.py):
- Removed create_document_collection() function
- Updated verify_schema() to expect 3 collections
- Updated display_schema() and print_summary()
- Updated documentation to reflect Chunk_v2/Summary_v2

Ingestion changes (weaviate_ingest.py):
- Removed ingest_document_metadata() function
- Removed ingest_document_collection parameter
- Updated IngestResult to use work_uuid instead of document_uuid
- Removed Document deletion from delete_document_chunks()
- Updated DeleteResult TypedDict

Type changes (types.py):
- WeaviateIngestResult: document_uuid → work_uuid

Documentation updates (.claude/CLAUDE.md):
- Updated schema diagram (4 → 3 collections)
- Removed Document references
- Updated to reflect manual GPU vectorization

Database changes:
- Deleted Document collection (13 objects)
- Deleted Chunk collection (0 objects, old schema)

Benefits:
- Simpler architecture (3 collections vs 4)
- No redundant data storage
- All metadata available via Work or file-based storage
- Reduced Weaviate memory footprint

Migration:
- See DOCUMENT_COLLECTION_ANALYSIS.md for detailed analysis
- See migrate_chunk_v2_to_none_vectorizer.py for vectorizer migration

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
122
generations/library_rag/fix_turings_machines.py
Normal file
122
generations/library_rag/fix_turings_machines.py
Normal file
@@ -0,0 +1,122 @@
"""Fix Turings_Machines ingestion with corrected metadata.

The LLM returned prompt instructions instead of actual metadata.

This script:
1. Loads chunks from Turings_Machines_chunks.json
2. Corrects workTitle and workAuthor
3. Re-ingests to Weaviate with GPU embedder
"""

import json
import sys
from pathlib import Path

# Add this script's directory to sys.path so the local `utils` package
# resolves regardless of the caller's working directory.
current_dir = Path(__file__).parent.absolute()
sys.path.insert(0, str(current_dir))

# Now import can work (only after the sys.path tweak above).
import utils.weaviate_ingest as weaviate_ingest
||||
def fix_turings_machines():
    """Fix and re-ingest Turings_Machines with corrected metadata.

    Loads the chunked JSON, overwrites the bad LLM-produced metadata with
    known-good values, re-ingests all chunks to Weaviate, and writes a
    corrected copy of the JSON next to the original.

    Returns:
        The result dict from weaviate_ingest.ingest_document, or None when
        the source chunks file does not exist.
    """
    chunks_file = Path("output/Turings_Machines/Turings_Machines_chunks.json")

    if not chunks_file.exists():
        print(f"ERROR: File not found: {chunks_file}")
        return None

    with open(chunks_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    print("Loaded chunks JSON")
    print(f" - Chunks: {len(data.get('chunks', []))}")
    print(f" - Current title: {data.get('metadata', {}).get('title', 'N/A')[:80]}")
    print(f" - Current author: {data.get('metadata', {}).get('author', 'N/A')[:80]}")

    # Known-good metadata; the original LLM pass returned prompt text instead.
    corrected_metadata = {
        "title": "Turing's Machines",
        "author": "Dorian Wiszniewski, Richard Coyne, Christopher Pierce",
        "year": 2000,  # Approximate - from references (Coyne 1999, etc.)
        "language": "en",
    }

    # Replace document-level metadata and propagate it into every chunk.
    data["metadata"] = corrected_metadata
    for chunk in data.get("chunks", []):
        chunk["workTitle"] = corrected_metadata["title"]
        chunk["workAuthor"] = corrected_metadata["author"]
        chunk["year"] = corrected_metadata["year"]

    print("\nCorrected metadata:")
    print(f" - Title: {corrected_metadata['title']}")
    print(f" - Author: {corrected_metadata['author']}")
    print(f" - Year: {corrected_metadata['year']}")

    # Prepare chunks for ingestion (format expected by ingest_document).
    chunks_for_ingestion = _prepare_chunks(data)
    print(f"\nPrepared {len(chunks_for_ingestion)} chunks for ingestion")

    print("\nStarting re-ingestion with GPU embedder...")
    result = weaviate_ingest.ingest_document(
        doc_name="Turings_Machines",
        chunks=chunks_for_ingestion,
        metadata=corrected_metadata,
        language="en"
    )

    if result.get("success"):
        print("\nRe-ingestion successful!")
        print(f" - Chunks inserted: {result.get('count', 0)}")
        print(f" - Work UUID: {result.get('work_uuid', 'N/A')}")
    else:
        print("\nRe-ingestion failed!")
        print(f" - Error: {result.get('error', 'Unknown')}")

    # Keep a corrected copy so the fix is reproducible without re-running
    # the metadata correction step.
    corrected_file = chunks_file.parent / f"{chunks_file.stem}_corrected.json"
    with open(corrected_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"\nSaved corrected chunks to: {corrected_file}")

    return result


def _prepare_chunks(data):
    """Map raw chunk dicts into the flat format expected by ingest_document."""
    return [
        {
            "text": chunk["text"],
            "sectionPath": chunk.get("section", ""),
            "sectionLevel": chunk.get("section_level", 1),
            "chapterTitle": "",
            "canonicalReference": "",
            "unitType": chunk.get("type", "main_content"),
            "keywords": chunk.get("concepts", []),
            "language": "en",
            "orderIndex": i,
        }
        for i, chunk in enumerate(data.get("chunks", []))
    ]
||||
if __name__ == "__main__":
    # Script entry point: run the fix and exit with a shell-friendly status.
    banner = "=" * 70
    print(banner)
    print("Fix Turings_Machines Ingestion")
    print(banner)

    result = fix_turings_machines()

    print("\n" + banner)
    if result and result.get("success"):
        print("FIX COMPLETED SUCCESSFULLY")
        print(banner)
        sys.exit(0)
    else:
        print("FIX FAILED")
        print(banner)
        sys.exit(1)
Reference in New Issue
Block a user