Adds automatic Work object creation to ensure all uploaded documents appear on the /documents page. Previously, chunks were ingested but Work entries were missing, causing documents to be invisible in the UI. Changes: - Add create_or_get_work() function to weaviate_ingest.py - Checks for existing Work by sourceId (prevents duplicates) - Creates new Work with metadata (title, author, year, pages) - Returns UUID for potential future reference - Integrate Work creation into ingest_document() flow - Add helper scripts for retroactive fixes and verification: - create_missing_works.py: Create Works for already-ingested documents - reingest_batch_documents.py: Re-ingest documents after bug fixes - check_batch_results.py: Verify batch upload results in Weaviate This completes the batch upload feature - documents now properly appear on /documents page immediately after ingestion. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
89 lines
2.4 KiB
Python
89 lines
2.4 KiB
Python
"""Reingest documents that failed to ingest due to collection name bug."""
|
|
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Fix Windows console encoding: the default Windows code page often cannot
# encode the emoji used in the status messages below, which would raise
# UnicodeEncodeError on print().
if sys.platform == "win32":
    # reconfigure() (Python 3.7+; this file already relies on f-strings)
    # swaps the encoding in place instead of rebinding sys.stdout to a new
    # io.TextIOWrapper, which would detach the underlying buffer and break
    # any previously captured references to the original stream.
    sys.stdout.reconfigure(encoding="utf-8")

# Make the project package (utils.weaviate_ingest) importable when this
# script is run directly from the repository root.
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))
|
|
|
|
from utils.weaviate_ingest import ingest_document
|
|
|
|
# Documents to reingest, identified by their output directory names.
documents = [
    "A_Cartesian_critique_of_the_artificial_i",
    "Alan_Turing_and_John_von_Neumann_Their_B",
]

# Chunker output lives under <script dir>/generations/library_rag/output/<doc>/.
output_dir = Path(__file__).parent / "generations" / "library_rag" / "output"

print("🔄 Reingesting batch upload documents with fixed collection names...\n")

for doc_name in documents:
    print(f"📄 Processing: {doc_name}")

    # Load the chunks JSON produced by the chunking pipeline for this doc.
    chunks_file = output_dir / doc_name / f"{doc_name}_chunks.json"
    if not chunks_file.exists():
        print(f" ✗ Chunks file not found: {chunks_file}")
        continue

    # Path.read_text + json.loads: no manual file-handle management needed.
    data = json.loads(chunks_file.read_text(encoding="utf-8"))

    # Pull out the pieces ingest_document() needs, with safe defaults so a
    # partially populated JSON file does not raise KeyError.
    chunks = data.get("chunks", [])
    metadata = data.get("metadata", {})
    toc = data.get("toc", [])
    pages = data.get("pages", 0)
    language = metadata.get("language", "en")

    if not chunks:
        print(" ⚠️ No chunks found in file")
        continue

    print(f" • Chunks: {len(chunks)}")
    print(f" • Title: {metadata.get('title', 'N/A')}")
    print(f" • Author: {metadata.get('author', 'N/A')}")
    print(f" • Language: {language}")

    # Ingest to Weaviate; the summary collection is intentionally skipped here.
    print(" 🚀 Ingesting to Weaviate...")
    result = ingest_document(
        doc_name=doc_name,
        chunks=chunks,
        metadata=metadata,
        language=language,
        toc=toc,
        pages=pages,
        ingest_document_collection=True,
        ingest_summary_collection=False,
    )

    # .get() on both keys so a malformed result dict is reported as a
    # failure instead of raising KeyError and aborting the batch mid-loop.
    if result.get("success"):
        print(f" ✅ Success! Inserted {result.get('count', 0)} chunks")
    else:
        print(f" ✗ Failed: {result.get('error', 'Unknown error')}")

    print()

print("=" * 70)
print("✓ Reingestion complete!")
print()
|
|
|
|
# Verify the total chunk count directly against the local Weaviate instance.
import weaviate

print("🔍 Verifying total chunks in Weaviate...")
weaviate_client = weaviate.connect_to_local()
try:
    # Aggregate over the entire Chunk_v2 collection to get the object count.
    chunk_collection = weaviate_client.collections.get("Chunk_v2")
    total_chunks = chunk_collection.aggregate.over_all().total_count
    print(f" Total chunks: {total_chunks}")
    # Presumably 5,304 pre-existing chunks plus the two reingested documents'
    # chunks (7 + 11) — confirm against the batch upload records.
    print(f" Expected: {5304 + 7 + 11} = 5,322")
finally:
    # Always release the client connection, even if the aggregation fails.
    weaviate_client.close()
|