Adds automatic Work object creation to ensure all uploaded documents appear on the /documents page.

Previously, chunks were ingested but Work entries were missing, causing documents to be invisible in the UI.

Changes:
- Add create_or_get_work() function to weaviate_ingest.py
  - Checks for existing Work by sourceId (prevents duplicates)
  - Creates new Work with metadata (title, author, year, pages)
  - Returns UUID for potential future reference
- Integrate Work creation into ingest_document() flow
- Add helper scripts for retroactive fixes and verification:
  - create_missing_works.py: Create Works for already-ingested documents
  - reingest_batch_documents.py: Re-ingest documents after bug fixes
  - check_batch_results.py: Verify batch upload results in Weaviate

This completes the batch upload feature - documents now properly appear on the /documents page immediately after ingestion.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
73 lines
2.4 KiB
Python
73 lines
2.4 KiB
Python
"""Check batch upload results in Weaviate."""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Fix Windows encoding
|
|
if sys.platform == "win32":
|
|
import io
|
|
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
|
|
|
# Add parent directory to path
|
|
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))
|
|
|
|
import weaviate
|
|
|
|
print("Connecting to Weaviate...")
|
|
client = weaviate.connect_to_local(host="localhost", port=8080, grpc_port=50051)
|
|
|
|
try:
    chunk_collection = client.collections.get("Chunk_v2")

    # 1. Fetch a window of recent chunks to scan for the newly uploaded docs.
    print("\n1. Fetching recent chunks (last 50)...")
    all_chunks = chunk_collection.query.fetch_objects(limit=50)

    # Group chunks by work title so we can report a per-document chunk count.
    works = {}
    for chunk in all_chunks.objects:
        # 'work' may be missing OR present-but-None; `.get('work', {})` only
        # covers the missing case, so add `or {}` to avoid AttributeError.
        work_info = chunk.properties.get('work') or {}
        title = work_info.get('title', 'N/A')
        author = work_info.get('author', 'N/A')
        if title not in works:
            works[title] = {'author': author, 'count': 0}
        works[title]['count'] += 1

    def _report(title, info):
        """Print a one-document summary (title truncated to 70 chars)."""
        print(f" ✓ Found: {title[:70]}")
        print(f" Author: {info['author']}")
        print(f" Chunks: {info['count']}")

    # 2. Look for the two test documents among the grouped works. A title
    # matching both patterns is (intentionally) reported under both checks.
    cartesian_found = False
    turing_found = False

    print("\n2. Looking for test documents in recent chunks...")
    for title, info in works.items():
        if 'Cartesian' in title or 'artificial intelligence' in title.lower():
            _report(title, info)
            cartesian_found = True
        if 'Turing' in title or 'von Neumann' in title:
            _report(title, info)
            turing_found = True

    if not cartesian_found:
        print(" ✗ Cartesian document not found in recent chunks")
    if not turing_found:
        print(" ✗ Turing document not found in recent chunks")

    # 3. Total chunk count across the whole collection.
    print("\n3. Total chunks in database:")
    result = chunk_collection.aggregate.over_all()
    print(f" Total: {result.total_count}")

    # 4. Show a small sample of works. NOTE(review): fetch_objects without an
    # explicit sort returns objects in arbitrary order, not creation time —
    # the label below overstates what this query guarantees.
    print("\n4. Recent works (showing first 5 chunks by creation time):")
    sample = chunk_collection.query.fetch_objects(limit=5)
    for i, chunk in enumerate(sample.objects, 1):
        work_title = (chunk.properties.get('work') or {}).get('title', 'N/A')
        print(f" {i}. {work_title[:60]}...")

finally:
    # Always release the Weaviate connection, even if a query above failed.
    client.close()
    print("\n✓ Done")
|