"""Check batch upload results in Weaviate.

Connects to a local Weaviate instance, groups the 50 most recently created
``Chunk_v2`` objects by the title stored in their nested ``work`` property,
reports whether the two batch-uploaded test documents appear among them, and
prints the total chunk count.
"""

import sys
from pathlib import Path

# Windows consoles often default to a legacy code page that cannot encode the
# check/cross glyphs printed below; switch stdout to UTF-8 in place.
if sys.platform == "win32":
    sys.stdout.reconfigure(encoding="utf-8")

# Make the library_rag directory importable.
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))

import weaviate
from weaviate.classes.query import Sort


def _work_of(chunk):
    """Return ``(title, author)`` from a chunk's nested ``work`` property.

    A missing or null ``work`` object, title or author yields ``'N/A'``
    instead of raising, so one incomplete chunk cannot abort the report.
    """
    work = chunk.properties.get("work") or {}
    return work.get("title") or "N/A", work.get("author") or "N/A"


def _print_match(title, info):
    """Print the title, author and chunk count of a matched work."""
    print(f" ✓ Found: {title[:70]}")
    print(f" Author: {info['author']}")
    print(f" Chunks: {info['count']}")


def main() -> None:
    """Run the four report steps against the local Weaviate instance."""
    print("Connecting to Weaviate...")
    client = weaviate.connect_to_local(host="localhost", port=8080, grpc_port=50051)

    try:
        chunk_collection = client.collections.get("Chunk_v2")

        # Without an explicit sort the objects come back in no useful order,
        # so "recent" must be requested: newest creation time first.
        print("\n1. Fetching recent chunks (last 50)...")
        recent = chunk_collection.query.fetch_objects(
            limit=50,
            sort=Sort.by_creation_time(ascending=False),
        )

        # Group by work title: title -> {'author': ..., 'count': ...}
        works = {}
        for chunk in recent.objects:
            title, author = _work_of(chunk)
            entry = works.setdefault(title, {"author": author, "count": 0})
            entry["count"] += 1

        # Check for our test documents
        cartesian_found = False
        turing_found = False

        print("\n2. Looking for test documents in recent chunks...")
        for title, info in works.items():
            if "Cartesian" in title or "artificial intelligence" in title.lower():
                _print_match(title, info)
                cartesian_found = True
            if "Turing" in title or "von Neumann" in title:
                _print_match(title, info)
                turing_found = True

        if not cartesian_found:
            print(" ✗ Cartesian document not found in recent chunks")
        if not turing_found:
            print(" ✗ Turing document not found in recent chunks")

        # Count all chunks
        print("\n3. Total chunks in database:")
        result = chunk_collection.aggregate.over_all()
        print(f" Total: {result.total_count}")

        # The result is already ordered newest-first, so its first five
        # objects are the five most recent chunks; no second query needed.
        print("\n4. Recent works (showing first 5 chunks by creation time):")
        for i, chunk in enumerate(recent.objects[:5], 1):
            work_title, _ = _work_of(chunk)
            print(f" {i}. {work_title[:60]}...")

    finally:
        # Always release the connection, even if a query above failed.
        client.close()
        print("\n✓ Done")


if __name__ == "__main__":
    main()
"""Create missing Work entries for test documents.

For each listed document, reads ``<output>/<doc>/<doc>_chunks.json``, takes
its ``metadata`` and ``pages`` entries, and passes them to
``create_or_get_work``. Exits with status 1 if the connection fails or any
document could not be processed.
"""

import json
import sys
from pathlib import Path

# Windows consoles often default to a legacy code page that cannot encode the
# check/cross glyphs printed below; switch stdout to UTF-8 in place.
if sys.platform == "win32":
    sys.stdout.reconfigure(encoding="utf-8")

# Make the library_rag directory importable (it provides the `utils` package).
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))

from utils.weaviate_ingest import create_or_get_work, get_weaviate_client

# Documents to create Works for
documents = [
    "A_Cartesian_critique_of_the_artificial_i",
    "Alan_Turing_and_John_von_Neumann_Their_B",
]

output_dir = Path(__file__).parent / "generations" / "library_rag" / "output"


def main() -> int:
    """Create or fetch a Work for every entry in ``documents``.

    Returns:
        0 when every document got a Work UUID, 1 otherwise.
    """
    print("Creating missing Work entries...\n")
    failures = 0

    with get_weaviate_client() as client:
        if client is None:
            print("Error: Could not connect to Weaviate")
            return 1

        for doc_name in documents:
            print(f"Processing: {doc_name}")

            # Load metadata from chunks JSON
            chunks_file = output_dir / doc_name / f"{doc_name}_chunks.json"
            if not chunks_file.exists():
                print(" ✗ Chunks file not found")
                failures += 1
                continue

            # An unreadable or malformed file only fails this document; the
            # remaining documents are still processed.
            try:
                with open(chunks_file, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except (OSError, json.JSONDecodeError) as exc:
                print(f" ✗ Could not read chunks file: {exc}")
                failures += 1
                continue

            metadata = data.get("metadata", {})
            pages = data.get("pages", 0)

            # Create or get Work
            work_uuid = create_or_get_work(client, doc_name, metadata, pages)

            if work_uuid:
                print(f" ✓ Work created/retrieved: {work_uuid[:8]}...")
            else:
                print(" ✗ Failed to create Work")
                failures += 1

            print()

    print("=" * 70)
    if failures:
        # Do not claim success when some Works are still missing.
        print(f"Done with {failures} failure(s); some works are still missing.")
        return 1
    print("Done! Checking /documents page now should show the new works.")
    return 0


if __name__ == "__main__":
    sys.exit(main())
def create_or_get_work(
    client: WeaviateClient,
    doc_name: str,
    metadata: Dict[str, Any],
    pages: int = 0,
) -> Optional[str]:
    """Create or retrieve the Work entry for a document.

    Looks up the ``Work`` collection for an object whose ``sourceId`` equals
    ``doc_name``. If one exists its UUID is returned; otherwise a new Work is
    inserted from ``metadata`` and the new UUID is returned.

    Missing, null or empty metadata values fall back to defaults: ``title``
    to ``doc_name``, ``author`` to ``"Inconnu"``, ``year`` to ``0``,
    ``language`` to ``"en"`` and ``genre`` to ``"philosophy"``.

    Args:
        client: Active Weaviate client connection.
        doc_name: Unique document identifier, stored as ``sourceId``.
        metadata: Extracted metadata dict; the keys read are ``title``,
            ``author``, ``year``, ``language`` and ``genre``.
        pages: Number of pages in the source document.

    Returns:
        UUID string of the Work object, or None if the collection could not
        be obtained or the lookup/insert raised. Failures are logged as
        warnings and never propagated (best-effort by design).
    """
    try:
        work_collection: Collection[Any, Any] = client.collections.get("Work")
    except Exception as e:
        logger.warning(f"Collection Work non trouvée: {e}")
        return None

    # Use `or` for every optional field so that a key present with a
    # None/empty value gets the same default as a missing key.
    title = metadata.get("title") or doc_name
    author = metadata.get("author") or "Inconnu"
    year = metadata.get("year") or 0
    language = metadata.get("language") or "en"
    genre = metadata.get("genre") or "philosophy"

    try:
        # Check whether a Work already exists with this sourceId.
        existing = work_collection.query.fetch_objects(
            filters=wvq.Filter.by_property("sourceId").equal(doc_name),
            limit=1,
        )

        if existing.objects:
            work_uuid = str(existing.objects[0].uuid)
            logger.info(f"Work déjà existant: {title} (UUID: {work_uuid[:8]}...)")
            return work_uuid

        # No match: create a new Work.
        work_obj: Dict[str, Any] = {
            "title": title,
            "author": author,
            "year": year,
            "language": language,
            "genre": genre,
            "sourceId": doc_name,
            "pages": pages,
        }

        # The value returned by insert() is stringified to obtain the UUID.
        work_uuid = str(work_collection.data.insert(work_obj))
        logger.info(f"Work créé: {title} par {author} (UUID: {work_uuid[:8]}...)")
        return work_uuid

    except Exception as e:
        logger.warning(f"Erreur création Work: {e}")
        return None
"""Reingest documents that failed to ingest due to collection name bug."""

import json
import sys
from pathlib import Path

# On Windows, re-wrap stdout as UTF-8 so the status glyphs below can be printed.
if sys.platform == "win32":
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Make the library_rag directory importable (it provides the `utils` package).
sys.path.insert(0, str(Path(__file__).parent / "generations" / "library_rag"))

from utils.weaviate_ingest import ingest_document

# Documents to reingest
documents = [
    "A_Cartesian_critique_of_the_artificial_i",
    "Alan_Turing_and_John_von_Neumann_Their_B"
]

output_dir = Path(__file__).parent / "generations" / "library_rag" / "output"


def _reingest(doc_name):
    """Load one document's chunks JSON and pass its contents to ingest_document.

    Prints progress as it goes. Returns early, without the trailing blank
    line, when the chunks file is missing or contains no chunks.
    """
    print(f"📄 Processing: {doc_name}")

    chunks_path = output_dir / doc_name / f"{doc_name}_chunks.json"
    if not chunks_path.exists():
        print(f" ✗ Chunks file not found: {chunks_path}")
        return

    with open(chunks_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    chunks = payload.get("chunks", [])
    metadata = payload.get("metadata", {})
    toc = payload.get("toc", [])
    pages = payload.get("pages", 0)
    language = metadata.get("language", "en")

    if not chunks:
        print(" ⚠️ No chunks found in file")
        return

    for summary_line in (
        f" • Chunks: {len(chunks)}",
        f" • Title: {metadata.get('title', 'N/A')}",
        f" • Author: {metadata.get('author', 'N/A')}",
        f" • Language: {language}",
    ):
        print(summary_line)

    # Ingest to Weaviate
    print(" 🚀 Ingesting to Weaviate...")
    outcome = ingest_document(
        doc_name=doc_name,
        chunks=chunks,
        metadata=metadata,
        language=language,
        toc=toc,
        pages=pages,
        ingest_document_collection=True,
        ingest_summary_collection=False,
    )

    if outcome["success"]:
        status = f" ✅ Success! Inserted {outcome['count']} chunks"
    else:
        status = f" ✗ Failed: {outcome.get('error', 'Unknown error')}"
    print(status)

    print()


print("🔄 Reingesting batch upload documents with fixed collection names...\n")

for name in documents:
    _reingest(name)

print("=" * 70)
print("✓ Reingestion complete!")
print()

# Verify total count
import weaviate
print("🔍 Verifying total chunks in Weaviate...")
client = weaviate.connect_to_local()
try:
    total = client.collections.get("Chunk_v2").aggregate.over_all().total_count
    print(f" Total chunks: {total}")
    # Hard-coded sum of three counts; the code does not say what each term is.
    expected_total = 5304 + 7 + 11
    print(f" Expected: {expected_total} = 5,322")
finally:
    client.close()