feat: Auto-create Work entries during document ingestion
Adds automatic Work object creation to ensure all uploaded documents appear on the /documents page. Previously, chunks were ingested but Work entries were missing, causing documents to be invisible in the UI.

Changes:
- Add create_or_get_work() function to weaviate_ingest.py
  - Checks for existing Work by sourceId (prevents duplicates)
  - Creates new Work with metadata (title, author, year, pages)
  - Returns UUID for potential future reference
- Integrate Work creation into ingest_document() flow
- Add helper scripts for retroactive fixes and verification:
  - create_missing_works.py: Create Works for already-ingested documents
  - reingest_batch_documents.py: Re-ingest documents after bug fixes
  - check_batch_results.py: Verify batch upload results in Weaviate

This completes the batch upload feature: documents now appear on the /documents page immediately after ingestion.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
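The diff below covers only reingest_batch_documents.py. For context on the main change, here is a minimal sketch of what create_or_get_work() in weaviate_ingest.py plausibly looks like, assuming the weaviate v4 Python client (the same client the script below uses via connect_to_local) and a Work collection whose property names match the metadata listed above (sourceId, title, author, year, pages). The committed implementation may differ.

# Hypothetical sketch, not the committed implementation.
import weaviate
from weaviate.classes.query import Filter

def create_or_get_work(client: weaviate.WeaviateClient, source_id: str,
                       metadata: dict) -> str:
    """Return the UUID of the Work for source_id, creating it if absent."""
    works = client.collections.get("Work")

    # Check for an existing Work by sourceId so reruns never create duplicates.
    existing = works.query.fetch_objects(
        filters=Filter.by_property("sourceId").equal(source_id),
        limit=1,
    )
    if existing.objects:
        return str(existing.objects[0].uuid)

    # No match: create a new Work carrying the document-level metadata.
    props = {
        "sourceId": source_id,
        "title": metadata.get("title", source_id),
        "author": metadata.get("author", "Unknown"),
        "year": metadata.get("year"),
        "pages": metadata.get("pages", 0),
    }
    # Drop unset fields rather than writing nulls.
    props = {k: v for k, v in props.items() if v is not None}
    return str(works.data.insert(props))

Returning the UUID in both branches keeps the call idempotent: re-running ingestion for the same sourceId reuses the existing Work instead of duplicating it.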
reingest_batch_documents.py (new file, 88 additions)
@@ -0,0 +1,88 @@
"""Reingest documents that failed to ingest due to collection name bug."""
import json
import sys
from pathlib import Path

# Fix Windows encoding
if sys.platform == "win32":
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))

from utils.weaviate_ingest import ingest_document

# Documents to reingest
documents = [
    "A_Cartesian_critique_of_the_artificial_i",
    "Alan_Turing_and_John_von_Neumann_Their_B"
]

output_dir = Path(__file__).parent / "generations" / "library_rag" / "output"

print("🔄 Reingesting batch upload documents with fixed collection names...\n")

for doc_name in documents:
    print(f"📄 Processing: {doc_name}")

    # Load chunks JSON
    chunks_file = output_dir / doc_name / f"{doc_name}_chunks.json"
    if not chunks_file.exists():
        print(f"  ✗ Chunks file not found: {chunks_file}")
        continue

    with open(chunks_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    chunks = data.get("chunks", [])
    metadata = data.get("metadata", {})
    toc = data.get("toc", [])
    pages = data.get("pages", 0)
    language = metadata.get("language", "en")

    if not chunks:
        print("  ⚠️ No chunks found in file")
        continue

    print(f"  • Chunks: {len(chunks)}")
    print(f"  • Title: {metadata.get('title', 'N/A')}")
    print(f"  • Author: {metadata.get('author', 'N/A')}")
    print(f"  • Language: {language}")

    # Ingest to Weaviate
    print("  🚀 Ingesting to Weaviate...")
    result = ingest_document(
        doc_name=doc_name,
        chunks=chunks,
        metadata=metadata,
        language=language,
        toc=toc,
        pages=pages,
        ingest_document_collection=True,
        ingest_summary_collection=False,
    )

    if result["success"]:
        print(f"  ✅ Success! Inserted {result['count']} chunks")
    else:
        print(f"  ✗ Failed: {result.get('error', 'Unknown error')}")

    print()

print("=" * 70)
print("✓ Reingestion complete!")
print()

# Verify total count
import weaviate

print("🔍 Verifying total chunks in Weaviate...")
client = weaviate.connect_to_local()
try:
    chunk_coll = client.collections.get("Chunk_v2")
    total = chunk_coll.aggregate.over_all().total_count
    print(f"  Total chunks: {total}")
    print(f"  Expected: {5304 + 7 + 11} = 5,322")
finally:
    client.close()
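For reference, the loop above expects each per-document chunks file to look roughly like this. The top-level field names mirror the .get() calls in the script; the contents of each chunk entry are assumptions for illustration, not a schema from this commit.

# Illustrative shape of <doc_name>_chunks.json (chunk fields are assumptions).
example = {
    "chunks": [
        {"text": "First chunk of body text...", "index": 0},
    ],
    "metadata": {
        "title": "Document Title",
        "author": "Author Name",
        "language": "en",
    },
    "toc": [],    # optional table-of-contents entries
    "pages": 42,  # page count recorded at extraction time
}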