feat: Auto-create Work entries during document ingestion

Adds automatic Work object creation to ensure all uploaded documents
appear on the /documents page. Previously, chunks were ingested but
Work entries were missing, causing documents to be invisible in the UI.

Changes:
- Add create_or_get_work() function to weaviate_ingest.py
  - Checks for existing Work by sourceId (prevents duplicates)
  - Creates new Work with metadata (title, author, year, pages)
  - Returns UUID for potential future reference
- Integrate Work creation into ingest_document() flow
- Add helper scripts for retroactive fixes and verification:
  - create_missing_works.py: Create Works for already-ingested documents
  - reingest_batch_documents.py: Re-ingest documents after bug fixes
  - check_batch_results.py: Verify batch upload results in Weaviate

This completes the batch upload feature: documents now appear on the
/documents page immediately after ingestion.
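
For reference, create_or_get_work() assumes a "Work" collection already
exists in Weaviate. The sketch below shows an assumed shape for that
collection, inferred only from the properties the function inserts (title,
author, year, language, genre, sourceId, pages); the real schema definition
lives elsewhere in the repo and may differ (vectorizer config, etc.). It uses
the weaviate-client v4 API:

# Hypothetical sketch: a Work collection with the properties that
# create_or_get_work() writes. The actual schema is defined elsewhere.
import weaviate
from weaviate.classes.config import Property, DataType

client = weaviate.connect_to_local()
try:
    client.collections.create(
        "Work",
        properties=[
            Property(name="title", data_type=DataType.TEXT),
            Property(name="author", data_type=DataType.TEXT),
            Property(name="year", data_type=DataType.INT),
            Property(name="language", data_type=DataType.TEXT),
            Property(name="genre", data_type=DataType.TEXT),
            Property(name="sourceId", data_type=DataType.TEXT),
            Property(name="pages", data_type=DataType.INT),
        ],
    )
finally:
    client.close()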

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
Date:   2026-01-08 23:34:06 +01:00
Parent: b8d94576de
Commit: 0c3b6c5fea

4 changed files with 286 additions and 0 deletions

check_batch_results.py (new file, 72 lines)

@@ -0,0 +1,72 @@
"""Check batch upload results in Weaviate."""
import sys
from pathlib import Path
# Fix Windows encoding
if sys.platform == "win32":
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))
import weaviate
print("Connecting to Weaviate...")
client = weaviate.connect_to_local(host="localhost", port=8080, grpc_port=50051)
try:
chunk_collection = client.collections.get("Chunk_v2")
# Fetch recent chunks and look for the new documents
print("\n1. Fetching recent chunks (last 50)...")
all_chunks = chunk_collection.query.fetch_objects(limit=50)
# Group by work title
works = {}
for chunk in all_chunks.objects:
work_info = chunk.properties.get('work', {})
title = work_info.get('title', 'N/A')
author = work_info.get('author', 'N/A')
if title not in works:
works[title] = {'author': author, 'count': 0}
works[title]['count'] += 1
# Check for our test documents
cartesian_found = False
turing_found = False
print("\n2. Looking for test documents in recent chunks...")
for title, info in works.items():
if 'Cartesian' in title or 'artificial intelligence' in title.lower():
print(f" ✓ Found: {title[:70]}")
print(f" Author: {info['author']}")
print(f" Chunks: {info['count']}")
cartesian_found = True
if 'Turing' in title or 'von Neumann' in title:
print(f" ✓ Found: {title[:70]}")
print(f" Author: {info['author']}")
print(f" Chunks: {info['count']}")
turing_found = True
if not cartesian_found:
print(" ✗ Cartesian document not found in recent chunks")
if not turing_found:
print(" ✗ Turing document not found in recent chunks")
# Count all chunks
print("\n3. Total chunks in database:")
result = chunk_collection.aggregate.over_all()
print(f" Total: {result.total_count}")
# List recent works (last 5)
print("\n4. Recent works (showing first 5 chunks by creation time):")
all_chunks = chunk_collection.query.fetch_objects(limit=5)
for i, chunk in enumerate(all_chunks.objects, 1):
work_title = chunk.properties.get('work', {}).get('title', 'N/A')
print(f" {i}. {work_title[:60]}...")
finally:
client.close()
print("\n✓ Done")

create_missing_works.py (new file, 58 lines)

@@ -0,0 +1,58 @@
"""Create missing Work entries for test documents."""
import json
import sys
from pathlib import Path
# Fix Windows encoding
if sys.platform == "win32":
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))
from utils.weaviate_ingest import create_or_get_work, get_weaviate_client
# Documents to create Works for
documents = [
"A_Cartesian_critique_of_the_artificial_i",
"Alan_Turing_and_John_von_Neumann_Their_B"
]
output_dir = Path(__file__).parent / "generations" / "library_rag" / "output"
print("Creating missing Work entries...\n")
with get_weaviate_client() as client:
if client is None:
print("Error: Could not connect to Weaviate")
sys.exit(1)
for doc_name in documents:
print(f"Processing: {doc_name}")
# Load metadata from chunks JSON
chunks_file = output_dir / doc_name / f"{doc_name}_chunks.json"
if not chunks_file.exists():
print(f" ✗ Chunks file not found")
continue
with open(chunks_file, 'r', encoding='utf-8') as f:
data = json.load(f)
metadata = data.get("metadata", {})
pages = data.get("pages", 0)
# Create or get Work
work_uuid = create_or_get_work(client, doc_name, metadata, pages)
if work_uuid:
print(f" ✓ Work created/retrieved: {work_uuid[:8]}...")
else:
print(f" ✗ Failed to create Work")
print()
print("=" * 70)
print("Done! Checking /documents page now should show the new works.")

generations/library_rag/utils/weaviate_ingest.py (68 additions)

@@ -559,6 +559,69 @@ def get_weaviate_client() -> Generator[Optional[WeaviateClient], None, None]:
        client.close()


def create_or_get_work(
    client: WeaviateClient,
    doc_name: str,
    metadata: Dict[str, Any],
    pages: int = 0,
) -> Optional[str]:
    """Create or retrieve a Work entry for a document.

    Creates a Work object representing the philosophical work/document.
    If a Work with the same sourceId already exists, returns its UUID.

    Args:
        client: Active Weaviate client connection.
        doc_name: Unique document identifier (sourceId).
        metadata: Extracted metadata dict with keys: title, author, year, etc.
        pages: Number of pages in the source document.

    Returns:
        UUID string of the Work object, or None if creation failed.
    """
    try:
        work_collection: Collection[Any, Any] = client.collections.get("Work")
    except Exception as e:
        logger.warning(f"Work collection not found: {e}")
        return None

    title = metadata.get("title") or doc_name
    author = metadata.get("author") or "Unknown"
    year = metadata.get("year", 0) if metadata.get("year") else 0

    try:
        # Check whether a Work already exists with this sourceId
        existing = work_collection.query.fetch_objects(
            filters=wvq.Filter.by_property("sourceId").equal(doc_name),
            limit=1
        )
        if existing.objects:
            work_uuid = str(existing.objects[0].uuid)
            logger.info(f"Work already exists: {title} (UUID: {work_uuid[:8]}...)")
            return work_uuid

        # Create a new Work
        work_obj: Dict[str, Any] = {
            "title": title,
            "author": author,
            "year": year,
            "language": metadata.get("language", "en"),
            "genre": metadata.get("genre", "philosophy"),
            "sourceId": doc_name,
            "pages": pages,
        }
        result = work_collection.data.insert(work_obj)
        work_uuid = str(result)
        logger.info(f"Work created: {title} by {author} (UUID: {work_uuid[:8]}...)")
        return work_uuid
    except Exception as e:
        logger.warning(f"Error creating Work: {e}")
        return None
def ingest_document_metadata(
    client: WeaviateClient,
    doc_name: str,
@@ -832,6 +895,11 @@ def ingest_document(
            inserted=[],
        )

    # Create or retrieve the Work (always, so it appears on the /documents page)
    work_uuid: Optional[str] = create_or_get_work(
        client, doc_name, metadata, pages
    )

    # Insert the document metadata (optional)
    doc_uuid: Optional[str] = None
    if ingest_document_collection:
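
A quick way to sanity-check the dedupe-by-sourceId behavior (a minimal
sketch, assuming a running local Weaviate and the library_rag utils on
sys.path; "test_doc" is a made-up identifier):

# Sketch: the second call should hit the existing-Work branch and
# return the same UUID instead of inserting a duplicate.
from utils.weaviate_ingest import create_or_get_work, get_weaviate_client

meta = {"title": "Test Work", "author": "Test Author", "year": 2024}
with get_weaviate_client() as client:
    first = create_or_get_work(client, "test_doc", meta, pages=3)
    second = create_or_get_work(client, "test_doc", meta, pages=3)
    assert first is not None and first == second  # no duplicate Work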

reingest_batch_documents.py (new file, 88 lines)

@@ -0,0 +1,88 @@
"""Reingest documents that failed to ingest due to collection name bug."""
import json
import sys
from pathlib import Path
# Fix Windows encoding
if sys.platform == "win32":
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))
from utils.weaviate_ingest import ingest_document
# Documents to reingest
documents = [
"A_Cartesian_critique_of_the_artificial_i",
"Alan_Turing_and_John_von_Neumann_Their_B"
]
output_dir = Path(__file__).parent / "generations" / "library_rag" / "output"
print("🔄 Reingesting batch upload documents with fixed collection names...\n")
for doc_name in documents:
print(f"📄 Processing: {doc_name}")
# Load chunks JSON
chunks_file = output_dir / doc_name / f"{doc_name}_chunks.json"
if not chunks_file.exists():
print(f" ✗ Chunks file not found: {chunks_file}")
continue
with open(chunks_file, 'r', encoding='utf-8') as f:
data = json.load(f)
chunks = data.get("chunks", [])
metadata = data.get("metadata", {})
toc = data.get("toc", [])
pages = data.get("pages", 0)
language = metadata.get("language", "en")
if not chunks:
print(f" ⚠️ No chunks found in file")
continue
print(f" • Chunks: {len(chunks)}")
print(f" • Title: {metadata.get('title', 'N/A')}")
print(f" • Author: {metadata.get('author', 'N/A')}")
print(f" • Language: {language}")
# Ingest to Weaviate
print(f" 🚀 Ingesting to Weaviate...")
result = ingest_document(
doc_name=doc_name,
chunks=chunks,
metadata=metadata,
language=language,
toc=toc,
pages=pages,
ingest_document_collection=True,
ingest_summary_collection=False,
)
if result["success"]:
print(f" ✅ Success! Inserted {result['count']} chunks")
else:
print(f" ✗ Failed: {result.get('error', 'Unknown error')}")
print()
print("=" * 70)
print("✓ Reingestion complete!")
print()
# Verify total count
import weaviate
print("🔍 Verifying total chunks in Weaviate...")
client = weaviate.connect_to_local()
try:
chunk_coll = client.collections.get("Chunk_v2")
total = chunk_coll.aggregate.over_all().total_count
print(f" Total chunks: {total}")
print(f" Expected: {5304 + 7 + 11} = 5,322")
finally:
client.close()
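
To confirm the Work entries themselves (not just the chunks) landed, a
filtered query against the Work collection works too; a minimal sketch using
the same weaviate-client v4 calls as the ingest code:

# Sketch: look up a Work by sourceId, mirroring the dedupe check
# inside create_or_get_work().
import weaviate
from weaviate.classes.query import Filter

client = weaviate.connect_to_local()
try:
    works = client.collections.get("Work").query.fetch_objects(
        filters=Filter.by_property("sourceId").equal(
            "A_Cartesian_critique_of_the_artificial_i"
        ),
        limit=1,
    )
    print("✓ Work found" if works.objects else "✗ Work missing")
finally:
    client.close()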