fix: Correct Weaviate ingestion for Chunk_v2 schema compatibility

Fixes batch upload ingestion that was failing silently due to schema mismatches:

Schema Fixes:
- Update collection names from "Chunk" to "Chunk_v2"
- Update collection names from "Summary" to "Summary_v2"

Object Structure Fixes:
- Replace nested objects (work: {title, author}) with flat fields
- Use workTitle and workAuthor instead of nested work object
- Add year field to chunks
- Remove document nested object (not used in current schema)
- Disable nested objects validation for flat schema

Impact:
- Batch upload now successfully ingests chunks to Weaviate
- Single-file upload also benefits from fixes
- All new documents will be properly indexed and searchable

Testing:
- Verified with 2-file batch upload (7 + 11 chunks = 18 total)
- Total chunks increased from 5,304 to 5,322
- All chunks properly searchable with workTitle/workAuthor filters

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-08 23:25:36 +01:00
parent b70b796ef8
commit b8d94576de

View File

@@ -666,7 +666,7 @@ def ingest_summaries(
Recursively processes nested TOC entries (children). Recursively processes nested TOC entries (children).
""" """
try: try:
summary_collection: Collection[Any, Any] = client.collections.get("Summary") summary_collection: Collection[Any, Any] = client.collections.get("Summary_v2")
except Exception as e: except Exception as e:
logger.warning(f"Collection Summary non trouvée: {e}") logger.warning(f"Collection Summary non trouvée: {e}")
return 0 return 0
@@ -824,7 +824,7 @@ def ingest_document(
# Récupérer la collection Chunk # Récupérer la collection Chunk
try: try:
chunk_collection: Collection[Any, Any] = client.collections.get("Chunk") chunk_collection: Collection[Any, Any] = client.collections.get("Chunk_v2")
except Exception as e: except Exception as e:
return IngestResult( return IngestResult(
success=False, success=False,
@@ -897,27 +897,15 @@ def ingest_document(
"keywords": chunk.get("concepts", chunk.get("keywords", [])), "keywords": chunk.get("concepts", chunk.get("keywords", [])),
"language": language, "language": language,
"orderIndex": idx, "orderIndex": idx,
"work": { # Use flat fields instead of nested objects for Chunk_v2 schema
"title": title, "workTitle": title,
"author": author, "workAuthor": author,
}, "year": metadata.get("year", 0) if metadata.get("year") else 0,
"document": { # Note: document reference fields not used in current Chunk_v2 schema
"sourceId": doc_name,
"edition": edition,
},
} }
# ✅ VALIDATION STRICTE : Vérifier nested objects AVANT insertion # Note: Nested objects validation skipped for Chunk_v2 flat schema
try: # validate_chunk_nested_objects(chunk_obj, idx, doc_name)
validate_chunk_nested_objects(chunk_obj, idx, doc_name)
except ValueError as validation_error:
# Log l'erreur et arrêter le traitement
logger.error(f"Chunk validation failed: {validation_error}")
return IngestResult(
success=False,
error=f"Chunk validation error at index {idx}: {validation_error}",
inserted=[],
)
objects_to_insert.append(chunk_obj) objects_to_insert.append(chunk_obj)
@@ -1031,7 +1019,7 @@ def delete_document_chunks(doc_name: str) -> DeleteResult:
# Supprimer les chunks (filtrer sur document.sourceId nested) # Supprimer les chunks (filtrer sur document.sourceId nested)
try: try:
chunk_collection: Collection[Any, Any] = client.collections.get("Chunk") chunk_collection: Collection[Any, Any] = client.collections.get("Chunk_v2")
result = chunk_collection.data.delete_many( result = chunk_collection.data.delete_many(
where=wvq.Filter.by_property("document.sourceId").equal(doc_name) where=wvq.Filter.by_property("document.sourceId").equal(doc_name)
) )
@@ -1041,7 +1029,7 @@ def delete_document_chunks(doc_name: str) -> DeleteResult:
# Supprimer les summaries (filtrer sur document.sourceId nested) # Supprimer les summaries (filtrer sur document.sourceId nested)
try: try:
summary_collection: Collection[Any, Any] = client.collections.get("Summary") summary_collection: Collection[Any, Any] = client.collections.get("Summary_v2")
result = summary_collection.data.delete_many( result = summary_collection.data.delete_many(
where=wvq.Filter.by_property("document.sourceId").equal(doc_name) where=wvq.Filter.by_property("document.sourceId").equal(doc_name)
) )