Files
linear-coding-agent/check_batch_results.py
David Blanc Brioir 0c3b6c5fea feat: Auto-create Work entries during document ingestion
Adds automatic Work object creation to ensure all uploaded documents
appear on the /documents page. Previously, chunks were ingested but
Work entries were missing, causing documents to be invisible in the UI.

Changes:
- Add create_or_get_work() function to weaviate_ingest.py
  - Checks for existing Work by sourceId (prevents duplicates)
  - Creates new Work with metadata (title, author, year, pages)
  - Returns UUID for potential future reference
- Integrate Work creation into ingest_document() flow
- Add helper scripts for retroactive fixes and verification:
  - create_missing_works.py: Create Works for already-ingested documents
  - reingest_batch_documents.py: Re-ingest documents after bug fixes
  - check_batch_results.py: Verify batch upload results in Weaviate

This completes the batch upload feature: documents now properly appear
on the /documents page immediately after ingestion.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-08 23:34:06 +01:00

73 lines
2.4 KiB
Python

"""Check batch upload results in Weaviate."""
import sys
from pathlib import Path

# Extra import root that this script puts at the front of sys.path.
LIBRARY_RAG_DIR = Path(__file__).parent / "generations" / "library_rag"

CHUNK_COLLECTION = "Chunk_v2"
RECENT_LIMIT = 50  # chunks inspected when looking for the test documents
PREVIEW_LIMIT = 5  # chunks listed in the final "recent works" section
MISSING = "N/A"  # placeholder for an absent or empty title/author


def work_of(chunk) -> tuple:
    """Return ``(title, author)`` read from ``chunk.properties['work']``.

    A missing ``work`` entry, a ``work`` entry that is ``None``, and a missing
    or empty title/author all yield the ``"N/A"`` placeholder instead of
    raising, so one incomplete chunk cannot abort the whole report.
    """
    work_info = chunk.properties.get('work') or {}
    return (
        work_info.get('title') or MISSING,
        work_info.get('author') or MISSING,
    )


def group_by_work(chunks) -> dict:
    """Count chunks per work title.

    Args:
        chunks: iterable of objects exposing a ``properties`` mapping.

    Returns:
        ``{title: {'author': author, 'count': n}}``. The author recorded for a
        title is the one carried by the first chunk seen with that title.
    """
    works = {}
    for chunk in chunks:
        title, author = work_of(chunk)
        entry = works.setdefault(title, {'author': author, 'count': 0})
        entry['count'] += 1
    return works


def is_cartesian_title(title: str) -> bool:
    """Return True if *title* looks like the "Cartesian" test document."""
    return 'Cartesian' in title or 'artificial intelligence' in title.lower()


def is_turing_title(title: str) -> bool:
    """Return True if *title* looks like the "Turing" test document."""
    return 'Turing' in title or 'von Neumann' in title


def _print_match(title: str, info: dict) -> None:
    """Print the three-line summary for one matched work."""
    print(f" ✓ Found: {title[:70]}")
    print(f" Author: {info['author']}")
    print(f" Chunks: {info['count']}")


def report_test_documents(works: dict) -> tuple:
    """Print which test documents appear in *works*.

    Args:
        works: mapping as returned by :func:`group_by_work`.

    Returns:
        ``(cartesian_found, turing_found)``.
    """
    cartesian_found = False
    turing_found = False
    for title, info in works.items():
        # The two checks are independent: a title matching both patterns is
        # reported once for each.
        if is_cartesian_title(title):
            _print_match(title, info)
            cartesian_found = True
        if is_turing_title(title):
            _print_match(title, info)
            turing_found = True
    if not cartesian_found:
        print(" ✗ Cartesian document not found in recent chunks")
    if not turing_found:
        print(" ✗ Turing document not found in recent chunks")
    return cartesian_found, turing_found


def main() -> None:
    """Connect to the local Weaviate instance and print the batch report."""
    # Fix Windows encoding. reconfigure() switches the existing stream to
    # UTF-8 in place rather than rebinding sys.stdout to a new wrapper.
    if sys.platform == "win32" and hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")

    # Path setup happens before the client import, as in the original script.
    sys.path.insert(0, str(LIBRARY_RAG_DIR))
    import weaviate
    from weaviate.classes.query import Sort

    print("Connecting to Weaviate...")
    client = weaviate.connect_to_local(host="localhost", port=8080, grpc_port=50051)
    try:
        chunk_collection = client.collections.get(CHUNK_COLLECTION)

        # Without an explicit sort the returned objects are not ordered by
        # creation time, so "recent" must be requested explicitly.
        print("\n1. Fetching recent chunks (last 50)...")
        recent = chunk_collection.query.fetch_objects(
            limit=RECENT_LIMIT,
            sort=Sort.by_creation_time(ascending=False),
        )
        works = group_by_work(recent.objects)

        print("\n2. Looking for test documents in recent chunks...")
        report_test_documents(works)

        print("\n3. Total chunks in database:")
        result = chunk_collection.aggregate.over_all()
        print(f" Total: {result.total_count}")

        # The newest PREVIEW_LIMIT chunks are the head of the sorted result
        # already fetched, so no second query is needed.
        print("\n4. Recent works (showing first 5 chunks by creation time):")
        for i, chunk in enumerate(recent.objects[:PREVIEW_LIMIT], 1):
            work_title, _ = work_of(chunk)
            print(f" {i}. {work_title[:60]}...")
    finally:
        client.close()
    print("\n✓ Done")


if __name__ == "__main__":
    main()