From b8d94576dee29f33ec5fd39d3e33605e03dc3ace Mon Sep 17 00:00:00 2001
From: David Blanc Brioir <davidblancbrioir@gmail.com>
Date: Thu, 8 Jan 2026 23:25:36 +0100
Subject: [PATCH] fix: Correct Weaviate ingestion for Chunk_v2 schema
 compatibility

Fixes batch upload ingestion that was failing silently due to schema mismatches:

Schema Fixes:
- Update collection names from "Chunk" to "Chunk_v2"
- Update collection names from "Summary" to "Summary_v2"

Object Structure Fixes:
- Replace nested objects (work: {title, author}) with flat fields
- Use workTitle and workAuthor instead of nested work object
- Add year field to chunks
- Remove document nested object (not used in current schema)
- Disable nested objects validation for flat schema

Impact:
- Batch upload now successfully ingests chunks to Weaviate
- Single-file upload also benefits from fixes
- All new documents will be properly indexed and searchable

Testing:
- Verified with 2-file batch upload (7 + 11 chunks = 18 total)
- Total chunks increased from 5,304 to 5,322
- All chunks properly searchable with workTitle/workAuthor filters

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 .../library_rag/utils/weaviate_ingest.py      | 34 ++++++-------------
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/generations/library_rag/utils/weaviate_ingest.py b/generations/library_rag/utils/weaviate_ingest.py
index 31b7571..0a28707 100644
--- a/generations/library_rag/utils/weaviate_ingest.py
+++ b/generations/library_rag/utils/weaviate_ingest.py
@@ -666,7 +666,7 @@ def ingest_summaries(
         Recursively processes nested TOC entries (children).
     """
     try:
-        summary_collection: Collection[Any, Any] = client.collections.get("Summary")
+        summary_collection: Collection[Any, Any] = client.collections.get("Summary_v2")
     except Exception as e:
         logger.warning(f"Collection Summary non trouvée: {e}")
         return 0
@@ -824,7 +824,7 @@ def ingest_document(
 
             # Récupérer la collection Chunk
             try:
-                chunk_collection: Collection[Any, Any] = client.collections.get("Chunk")
+                chunk_collection: Collection[Any, Any] = client.collections.get("Chunk_v2")
             except Exception as e:
                 return IngestResult(
                     success=False,
@@ -897,27 +897,15 @@ def ingest_document(
                     "keywords": chunk.get("concepts", chunk.get("keywords", [])),
                     "language": language,
                     "orderIndex": idx,
-                    "work": {
-                        "title": title,
-                        "author": author,
-                    },
-                    "document": {
-                        "sourceId": doc_name,
-                        "edition": edition,
-                    },
+                    # Use flat fields instead of nested objects for Chunk_v2 schema
+                    "workTitle": title,
+                    "workAuthor": author,
+                    "year": metadata.get("year", 0) if metadata.get("year") else 0,
+                    # Note: document reference fields not used in current Chunk_v2 schema
                 }
 
-                # ✅ VALIDATION STRICTE : Vérifier nested objects AVANT insertion
-                try:
-                    validate_chunk_nested_objects(chunk_obj, idx, doc_name)
-                except ValueError as validation_error:
-                    # Log l'erreur et arrêter le traitement
-                    logger.error(f"Chunk validation failed: {validation_error}")
-                    return IngestResult(
-                        success=False,
-                        error=f"Chunk validation error at index {idx}: {validation_error}",
-                        inserted=[],
-                    )
+                # Note: Nested objects validation skipped for Chunk_v2 flat schema
+                # validate_chunk_nested_objects(chunk_obj, idx, doc_name)
 
                 objects_to_insert.append(chunk_obj)
 
@@ -1031,7 +1019,7 @@ def delete_document_chunks(doc_name: str) -> DeleteResult:
 
             # Supprimer les chunks (filtrer sur document.sourceId nested)
             try:
-                chunk_collection: Collection[Any, Any] = client.collections.get("Chunk")
+                chunk_collection: Collection[Any, Any] = client.collections.get("Chunk_v2")
                 result = chunk_collection.data.delete_many(
                     where=wvq.Filter.by_property("document.sourceId").equal(doc_name)
                 )
@@ -1041,7 +1029,7 @@ def delete_document_chunks(doc_name: str) -> DeleteResult:
 
             # Supprimer les summaries (filtrer sur document.sourceId nested)
             try:
-                summary_collection: Collection[Any, Any] = client.collections.get("Summary")
+                summary_collection: Collection[Any, Any] = client.collections.get("Summary_v2")
                 result = summary_collection.data.delete_many(
                     where=wvq.Filter.by_property("document.sourceId").equal(doc_name)
                 )