feat: Auto-create Work entries during document ingestion

Adds automatic Work object creation to ensure all uploaded documents
appear on the /documents page. Previously, chunks were ingested but
Work entries were missing, causing documents to be invisible in the UI.

Changes:
- Add create_or_get_work() function to weaviate_ingest.py
  - Checks for existing Work by sourceId (prevents duplicates)
  - Creates new Work with metadata (title, author, year, pages)
  - Returns UUID for potential future reference
- Integrate Work creation into ingest_document() flow
- Add helper scripts for retroactive fixes and verification:
  - create_missing_works.py: Create Works for already-ingested documents
  - reingest_batch_documents.py: Re-ingest documents after bug fixes
  - check_batch_results.py: Verify batch upload results in Weaviate

This completes the batch upload feature: documents now properly appear
on the /documents page immediately after ingestion.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-08 23:34:06 +01:00
parent b8d94576de
commit 0c3b6c5fea
4 changed files with 286 additions and 0 deletions

View File

@@ -559,6 +559,69 @@ def get_weaviate_client() -> Generator[Optional[WeaviateClient], None, None]:
client.close()
def create_or_get_work(
    client: WeaviateClient,
    doc_name: str,
    metadata: Dict[str, Any],
    pages: int = 0,
) -> Optional[str]:
    """Create or retrieve the Work entry for a document.

    Looks up a Work object whose ``sourceId`` equals ``doc_name``. If one
    exists, its UUID is returned and the stored object is left untouched
    (its properties are not refreshed from ``metadata``). Otherwise a new
    Work is inserted with the properties derived below.

    This function is best-effort: every failure (missing collection, query
    error, insert error) is logged as a warning and reported as ``None``
    instead of being raised to the caller.

    Args:
        client: Active Weaviate client connection.
        doc_name: Unique document identifier, stored as ``sourceId``.
        metadata: Extracted metadata dict. The keys read here are ``title``,
            ``author``, ``year``, ``language`` and ``genre``; each one falls
            back to a default when missing or falsy (None, "", 0).
        pages: Number of pages in the source document.

    Returns:
        UUID string of the existing or newly created Work, or None if the
        collection could not be obtained or the lookup/insert failed.
    """
    try:
        work_collection: Collection[Any, Any] = client.collections.get("Work")
    except Exception as e:
        logger.warning(f"Collection Work non trouvée: {e}")
        return None

    # Use `or` rather than a .get() default: a .get() default only applies
    # when the key is absent, so a key present with None/"" would otherwise
    # be written to the Work as-is.
    title = metadata.get("title") or doc_name
    author = metadata.get("author") or "Inconnu"
    year = metadata.get("year") or 0
    language = metadata.get("language") or "en"
    genre = metadata.get("genre") or "philosophy"

    try:
        # Check if a Work already exists with this sourceId (avoids duplicates
        # when the same document is ingested more than once).
        existing = work_collection.query.fetch_objects(
            filters=wvq.Filter.by_property("sourceId").equal(doc_name),
            limit=1,
        )
        if existing.objects:
            work_uuid = str(existing.objects[0].uuid)
            logger.info(f"Work déjà existant: {title} (UUID: {work_uuid[:8]}...)")
            return work_uuid

        # No match: create a new Work.
        work_obj: Dict[str, Any] = {
            "title": title,
            "author": author,
            "year": year,
            "language": language,
            "genre": genre,
            "sourceId": doc_name,
            "pages": pages,
        }
        # The insert result is converted to str to obtain the UUID string.
        work_uuid = str(work_collection.data.insert(work_obj))
        logger.info(f"Work créé: {title} par {author} (UUID: {work_uuid[:8]}...)")
        return work_uuid
    except Exception as e:
        # Deliberately broad: Work creation must not abort the ingestion flow.
        logger.warning(f"Erreur création Work: {e}")
        return None
def ingest_document_metadata(
client: WeaviateClient,
doc_name: str,
@@ -832,6 +895,11 @@ def ingest_document(
inserted=[],
)
# Créer ou récupérer le Work (toujours, pour la page /documents)
work_uuid: Optional[str] = create_or_get_work(
client, doc_name, metadata, pages
)
# Insérer les métadonnées du document (optionnel)
doc_uuid: Optional[str] = None
if ingest_document_collection: