feat: Auto-create Work entries during document ingestion
Adds automatic Work object creation to ensure all uploaded documents appear on the /documents page. Previously, chunks were ingested but Work entries were missing, causing documents to be invisible in the UI.

Changes:
- Add create_or_get_work() function to weaviate_ingest.py
  - Checks for existing Work by sourceId (prevents duplicates)
  - Creates new Work with metadata (title, author, year, pages)
  - Returns UUID for potential future reference
- Integrate Work creation into ingest_document() flow
- Add helper scripts for retroactive fixes and verification:
  - create_missing_works.py: Create Works for already-ingested documents
  - reingest_batch_documents.py: Re-ingest documents after bug fixes
  - check_batch_results.py: Verify batch upload results in Weaviate

This completes the batch upload feature: documents now properly appear on the /documents page immediately after ingestion.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
72
check_batch_results.py
Normal file
72
check_batch_results.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""Check batch upload results in Weaviate."""
|
||||
|
||||
import sys
from pathlib import Path

# Re-wrap stdout as UTF-8 on Windows so the non-ASCII status glyphs below
# can be printed.
if sys.platform == "win32":
    import io

    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Make <script dir>/generations/library_rag importable.
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))

import weaviate
from weaviate.classes.query import Sort


def _print_match(title, info):
    """Print the title, author and chunk count of a matched work."""
    print(f" ✓ Found: {title[:70]}")
    print(f" Author: {info['author']}")
    print(f" Chunks: {info['count']}")


print("Connecting to Weaviate...")
client = weaviate.connect_to_local(host="localhost", port=8080, grpc_port=50051)

try:
    chunk_collection = client.collections.get("Chunk_v2")

    # Without an explicit sort, fetch_objects does not return the newest
    # objects first, so "recent" would be an arbitrary sample. Sort by
    # creation time, newest first, so freshly ingested documents show up.
    print("\n1. Fetching recent chunks (last 50)...")
    all_chunks = chunk_collection.query.fetch_objects(
        limit=50,
        sort=Sort.by_creation_time(ascending=False),
    )

    # Group by work title: title -> {'author': ..., 'count': ...}
    works = {}
    for chunk in all_chunks.objects:
        # `or` (not a .get default) so a null `work`/`title`/`author`
        # value falls back as well instead of raising later.
        work_info = chunk.properties.get('work') or {}
        title = work_info.get('title') or 'N/A'
        author = work_info.get('author') or 'N/A'
        if title not in works:
            works[title] = {'author': author, 'count': 0}
        works[title]['count'] += 1

    # Check for our test documents
    cartesian_found = False
    turing_found = False

    print("\n2. Looking for test documents in recent chunks...")
    for title, info in works.items():
        if 'Cartesian' in title or 'artificial intelligence' in title.lower():
            _print_match(title, info)
            cartesian_found = True
        if 'Turing' in title or 'von Neumann' in title:
            _print_match(title, info)
            turing_found = True

    if not cartesian_found:
        print(" ✗ Cartesian document not found in recent chunks")
    if not turing_found:
        print(" ✗ Turing document not found in recent chunks")

    # Count all chunks
    print("\n3. Total chunks in database:")
    result = chunk_collection.aggregate.over_all()
    print(f" Total: {result.total_count}")

    # The 50 chunks fetched above are already newest-first, so the five
    # most recent ones are simply the head of that list.
    print("\n4. Recent works (showing first 5 chunks by creation time):")
    for i, chunk in enumerate(all_chunks.objects[:5], 1):
        work_title = (chunk.properties.get('work') or {}).get('title') or 'N/A'
        print(f" {i}. {work_title[:60]}...")

finally:
    # Always release the connection, even if a query above failed.
    client.close()

print("\n✓ Done")
|
||||
58
create_missing_works.py
Normal file
58
create_missing_works.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""Create missing Work entries for test documents."""
|
||||
|
||||
import json
import sys
from pathlib import Path

# Re-wrap stdout as UTF-8 on Windows before anything is printed.
if sys.platform == "win32":
    import io

    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Make <script dir>/generations/library_rag importable so `utils` resolves.
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))

from utils.weaviate_ingest import create_or_get_work, get_weaviate_client

# Source identifiers of the documents that need a Work entry.
documents = [
    "A_Cartesian_critique_of_the_artificial_i",
    "Alan_Turing_and_John_von_Neumann_Their_B",
]

output_dir = Path(__file__).parent.joinpath("generations", "library_rag", "output")

print("Creating missing Work entries...\n")

with get_weaviate_client() as client:
    # The context manager yields None when no connection could be made.
    if client is None:
        print("Error: Could not connect to Weaviate")
        sys.exit(1)

    for source_id in documents:
        print(f"Processing: {source_id}")

        # Metadata and page count live in <output>/<id>/<id>_chunks.json.
        chunks_path = output_dir / source_id / f"{source_id}_chunks.json"
        if not chunks_path.exists():
            print(" ✗ Chunks file not found")
            continue

        payload = json.loads(chunks_path.read_text(encoding='utf-8'))

        # Create the Work, or fetch the UUID of the existing one.
        work_uuid = create_or_get_work(
            client,
            source_id,
            payload.get("metadata", {}),
            payload.get("pages", 0),
        )

        if not work_uuid:
            print(" ✗ Failed to create Work")
        else:
            print(f" ✓ Work created/retrieved: {work_uuid[:8]}...")

        print()

print("=" * 70)
print("Done! Checking /documents page now should show the new works.")
|
||||
@@ -559,6 +559,69 @@ def get_weaviate_client() -> Generator[Optional[WeaviateClient], None, None]:
|
||||
client.close()
|
||||
|
||||
|
||||
def create_or_get_work(
    client: WeaviateClient,
    doc_name: str,
    metadata: Dict[str, Any],
    pages: int = 0,
) -> Optional[str]:
    """Create or retrieve the Work entry for a document.

    Looks in the ``Work`` collection for an object whose ``sourceId`` equals
    ``doc_name``. If one exists its UUID is returned; otherwise a new Work
    is inserted from ``metadata`` and its UUID is returned.

    Missing, ``None`` or empty metadata values fall back to defaults:
    ``title`` -> ``doc_name``, ``author`` -> ``"Inconnu"``, ``year`` -> ``0``,
    ``language`` -> ``"en"``, ``genre`` -> ``"philosophy"``.

    Args:
        client: Active Weaviate client connection.
        doc_name: Unique document identifier, stored as ``sourceId``.
        metadata: Extracted metadata dict; the keys read here are ``title``,
            ``author``, ``year``, ``language`` and ``genre``.
        pages: Number of pages in the source document.

    Returns:
        UUID string of the Work object, or None if the collection could not
        be obtained or the lookup/insert raised.
    """
    try:
        work_collection: Collection[Any, Any] = client.collections.get("Work")
    except Exception as e:
        logger.warning(f"Collection Work non trouvée: {e}")
        return None

    # Use `or` rather than a .get() default: a key that is present but set
    # to None/"" must fall back too, otherwise None would be inserted.
    title = metadata.get("title") or doc_name
    author = metadata.get("author") or "Inconnu"
    year = metadata.get("year") or 0
    language = metadata.get("language") or "en"
    genre = metadata.get("genre") or "philosophy"

    try:
        # Check whether a Work already exists with this sourceId.
        # NOTE(review): the lookup and the insert are two separate calls, so
        # two concurrent ingestions of the same doc_name could presumably
        # both pass this check — confirm whether that can happen in practice.
        existing = work_collection.query.fetch_objects(
            filters=wvq.Filter.by_property("sourceId").equal(doc_name),
            limit=1,
        )

        if existing.objects:
            work_uuid = str(existing.objects[0].uuid)
            logger.info(f"Work déjà existant: {title} (UUID: {work_uuid[:8]}...)")
            return work_uuid

        # No match: create a new Work.
        work_obj: Dict[str, Any] = {
            "title": title,
            "author": author,
            "year": year,
            "language": language,
            "genre": genre,
            "sourceId": doc_name,
            "pages": pages,
        }

        result = work_collection.data.insert(work_obj)
        work_uuid = str(result)
        logger.info(f"Work créé: {title} par {author} (UUID: {work_uuid[:8]}...)")
        return work_uuid

    except Exception as e:
        # Best effort: report the failure and let the caller decide.
        logger.warning(f"Erreur création Work: {e}")
        return None
|
||||
|
||||
|
||||
def ingest_document_metadata(
|
||||
client: WeaviateClient,
|
||||
doc_name: str,
|
||||
@@ -832,6 +895,11 @@ def ingest_document(
|
||||
inserted=[],
|
||||
)
|
||||
|
||||
# Créer ou récupérer le Work (toujours, pour la page /documents)
|
||||
work_uuid: Optional[str] = create_or_get_work(
|
||||
client, doc_name, metadata, pages
|
||||
)
|
||||
|
||||
# Insérer les métadonnées du document (optionnel)
|
||||
doc_uuid: Optional[str] = None
|
||||
if ingest_document_collection:
|
||||
|
||||
88
reingest_batch_documents.py
Normal file
88
reingest_batch_documents.py
Normal file
@@ -0,0 +1,88 @@
|
||||
"""Reingest documents that failed to ingest due to collection name bug."""
|
||||
|
||||
import json
import sys
from pathlib import Path

# Re-wrap stdout as UTF-8 on Windows so the emoji/status glyphs below can
# be printed.
if sys.platform == "win32":
    import io

    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Make <script dir>/generations/library_rag importable so `utils` resolves.
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))

import weaviate

from utils.weaviate_ingest import ingest_document

# Documents to reingest
documents = [
    "A_Cartesian_critique_of_the_artificial_i",
    "Alan_Turing_and_John_von_Neumann_Their_B",
]

output_dir = Path(__file__).parent / "generations" / "library_rag" / "output"

print("🔄 Reingesting batch upload documents with fixed collection names...\n")

for doc_name in documents:
    print(f"📄 Processing: {doc_name}")

    # Load chunks JSON
    chunks_file = output_dir / doc_name / f"{doc_name}_chunks.json"
    if not chunks_file.exists():
        print(f" ✗ Chunks file not found: {chunks_file}")
        continue

    # An unreadable or malformed file should only skip this document,
    # not abort the rest of the batch.
    try:
        with open(chunks_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError) as exc:
        print(f" ✗ Could not read chunks file: {exc}")
        continue

    chunks = data.get("chunks", [])
    metadata = data.get("metadata", {})
    toc = data.get("toc", [])
    pages = data.get("pages", 0)
    # `or` so that an explicit null/empty language also falls back to "en"
    # instead of being passed on as None.
    language = metadata.get("language") or "en"

    if not chunks:
        print(" ⚠️ No chunks found in file")
        continue

    print(f" • Chunks: {len(chunks)}")
    print(f" • Title: {metadata.get('title', 'N/A')}")
    print(f" • Author: {metadata.get('author', 'N/A')}")
    print(f" • Language: {language}")

    # Ingest to Weaviate
    print(" 🚀 Ingesting to Weaviate...")
    result = ingest_document(
        doc_name=doc_name,
        chunks=chunks,
        metadata=metadata,
        language=language,
        toc=toc,
        pages=pages,
        ingest_document_collection=True,
        ingest_summary_collection=False,
    )

    if result["success"]:
        print(f" ✅ Success! Inserted {result['count']} chunks")
    else:
        print(f" ✗ Failed: {result.get('error', 'Unknown error')}")

    print()

print("=" * 70)
print("✓ Reingestion complete!")
print()

# Verify total count
print("🔍 Verifying total chunks in Weaviate...")
client = weaviate.connect_to_local()
try:
    chunk_coll = client.collections.get("Chunk_v2")
    total = chunk_coll.aggregate.over_all().total_count
    print(f" Total chunks: {total}")
    # NOTE(review): hard-coded figures — presumably the pre-existing chunk
    # count plus the chunk counts of the two documents above; confirm
    # before reusing this script on other data.
    print(f" Expected: {5304 + 7 + 11} = 5,322")
finally:
    # Always release the connection, even if the count query failed.
    client.close()
|
||||
Reference in New Issue
Block a user