diff --git a/.gitignore b/.gitignore index f5a343a..d7bdd32 100644 --- a/.gitignore +++ b/.gitignore @@ -45,4 +45,4 @@ vectorize_remaining.py migrate_chunk_*.py # Archives (migration scripts moved here) -archive/ \ No newline at end of file +archive/chunk_v2_backup.json diff --git a/DOCUMENT_COLLECTION_ANALYSIS.md b/DOCUMENT_COLLECTION_ANALYSIS.md new file mode 100644 index 0000000..42b3af4 --- /dev/null +++ b/DOCUMENT_COLLECTION_ANALYSIS.md @@ -0,0 +1,156 @@ +# Analyse: Collection Document - À supprimer + +**Date**: 2026-01-09 +**Statut**: ✅ CONFIRMATION - La collection Document n'est PAS utilisée et DOIT être supprimée + +## Problème identifié + +La collection `Document` est toujours définie dans le schéma et contient actuellement **13 objets**, alors que l'architecture devrait utiliser uniquement: +- `Work` - Métadonnées des œuvres +- `Chunk_v2` - Fragments vectorisés (5,372 chunks) +- `Summary_v2` - Résumés de sections (114 summaries) + +## État actuel + +### Collections existantes (Weaviate): +``` +Work: 19 objets ✓ UTILISÉ +Document: 13 objets ✗ NON UTILISÉ (à supprimer) +Chunk_v2: 5,372 objets ✓ UTILISÉ +Summary_v2: 114 objets ✓ UTILISÉ +Chunk: 0 objets (ancienne collection, peut être supprimée) +Conversation, Message, Thought: Collections chat (séparées) +``` + +### Données dans Document: +```json +{ + "sourceId": "Alan_Turing_and_John_von_Neumann_Their_B", + "edition": null, + "pages": 0, + "chunksCount": 11, + "work": null +} +``` + +**Observation**: La plupart des champs sont NULL ou 0 (pas de données utiles). + +## Analyse du code + +### 1. 
Schéma (`schema.py`) + +**Lignes 159-224**: Définition complète de la collection Document +- Créée par défaut lors de l'initialisation du schéma +- Propriétés: sourceId, edition, language, pages, chunksCount, toc, hierarchy, createdAt, work (nested) + +**Problème de cohérence** (lignes 747-757 dans `weaviate_ingest.py`): +```python +doc_obj: Dict[str, Any] = { + "sourceId": doc_name, + "title": title, # ❌ N'EXISTE PAS dans schema.py + "author": author, # ❌ N'EXISTE PAS dans schema.py + "toc": json.dumps(toc), + "hierarchy": json.dumps(hierarchy), + "pages": pages, + "chunksCount": chunks_count, + "language": metadata.get("language"), + "createdAt": datetime.now().isoformat(), +} +``` + +Le code d'ingestion essaie d'insérer des champs `title` et `author` qui n'existent pas dans le schéma! Cela devrait causer une erreur mais est silencieusement ignoré. + +### 2. Ingestion (`utils/weaviate_ingest.py`) + +**Fonction `ingest_document_metadata()` (lignes 695-765)**: +- Insère les métadonnées du document dans la collection Document +- Stocke: sourceId, toc, hierarchy, pages, chunksCount, language, createdAt + +**Fonction `ingest_document()` (lignes 891-1107)**: +- Paramètre: `ingest_document_collection: bool = True` (ligne 909) +- Par défaut, la fonction INSÈRE dans Document collection (ligne 1010) + +**Fonction `delete_document_from_weaviate()` (lignes 1213-1267)**: +- Supprime de la collection Document (ligne 1243) + +### 3. Flask App (`flask_app.py`) + +**Résultat**: ✅ AUCUNE référence à la collection Document +- Pas de `collections.get("Document")` +- Pas de requêtes vers Document +- Les TOC et métadonnées sont chargées depuis les fichiers `chunks.json` (ligne 3360) + +## Conclusion: Document n'est PAS nécessaire + +### Données actuellement dans Document: + +| Champ | Disponible ailleurs? 
| Source alternative | +|-------|---------------------|-------------------| +| `sourceId` | ✓ | `Chunk_v2.workTitle` (dénormalisé) | +| `toc` | ✓ | `output/<doc_name>/<doc_name>_chunks.json` | +| `hierarchy` | ✓ | `output/<doc_name>/<doc_name>_chunks.json` | +| `pages` | ✓ | `output/<doc_name>/<doc_name>_chunks.json` (metadata.pages) | +| `chunksCount` | ✓ | Dérivable via `Chunk_v2.aggregate.over_all(filter=workTitle)` | +| `language` | ✓ | `Work.language` + `Chunk_v2.language` | +| `createdAt` | ✓ | Dérivable via horodatage système des fichiers output/ | +| `edition` | ✗ | Jamais renseigné (toujours NULL) | +| `work` (nested) | ✓ | Collection `Work` dédiée | + +**Verdict**: Toutes les informations utiles de Document sont disponibles ailleurs. La collection est redondante. + +## Impact de la suppression + +### ✅ Aucun impact négatif: +- Flask app n'utilise pas Document +- TOC/hierarchy chargés depuis fichiers JSON +- Métadonnées disponibles dans Work et Chunk_v2 + +### ✅ Bénéfices: +- Simplifie l'architecture (3 collections au lieu de 4) +- Réduit la mémoire Weaviate (~13 objets + index) +- Simplifie le code d'ingestion (moins d'étapes) +- Évite la confusion sur "quelle collection utiliser?" 
+ +## Plan d'action recommandé + +### Étape 1: Supprimer la collection Document de Weaviate +```python +import weaviate +client = weaviate.connect_to_local() +client.collections.delete("Document") +client.close() +``` + +### Étape 2: Supprimer de `schema.py` +- Supprimer fonction `create_document_collection()` (lignes 159-224) +- Supprimer appel dans `create_schema()` (ligne 432) +- Mettre à jour `verify_schema()` pour ne plus vérifier Document (ligne 456) +- Mettre à jour `display_schema()` pour ne plus afficher Document (ligne 483) + +### Étape 3: Nettoyer `utils/weaviate_ingest.py` +- Supprimer fonction `ingest_document_metadata()` (lignes 695-765) +- Supprimer paramètre `ingest_document_collection` (ligne 909) +- Supprimer appel à `ingest_document_metadata()` (ligne 1010) +- Retirer la suppression de Document dans `delete_document_from_weaviate()` (lignes 1241-1248) + +### Étape 4: Mettre à jour la documentation +- Mettre à jour `schema.py` docstring (ligne 12: supprimer Document de la hiérarchie) +- Mettre à jour `CLAUDE.md` (ligne 11: supprimer Document) +- Mettre à jour `.claude/CLAUDE.md` (supprimer références à Document) + +### Étape 5: Supprimer aussi la collection `Chunk` (ancienne) +```python +# Chunk_v2 la remplace complètement (nécessite un client connecté, comme à l'Étape 1) +client.collections.delete("Chunk") +``` + +## Risques + +**Aucun risque identifié** car: +- Collection non utilisée par l'application +- Données disponibles ailleurs +- Pas de dépendances externes + +--- + +**Recommandation finale**: Procéder à la suppression immédiate de la collection Document. 
diff --git a/generations/library_rag/.claude/CLAUDE.md b/generations/library_rag/.claude/CLAUDE.md index 51fa65b..ba4a60e 100644 --- a/generations/library_rag/.claude/CLAUDE.md +++ b/generations/library_rag/.claude/CLAUDE.md @@ -138,34 +138,30 @@ The core of the application is `utils/pdf_pipeline.py`, which orchestrates a 10- - `use_ocr_annotations=True` - OCR with annotations (3x cost, better TOC) - `ingest_to_weaviate=True` - Insert chunks into Weaviate -### Weaviate Schema (4 Collections) +### Weaviate Schema (3 Collections) -Defined in `schema.py`, the database uses a normalized design with denormalized nested objects: +Defined in `schema.py`, the database uses a denormalized design with nested objects: ``` Work (no vectorizer) title, author, year, language, genre - - ├─► Document (no vectorizer) - │ sourceId, edition, pages, toc, hierarchy + │ + ├─► Chunk_v2 (manual GPU vectorization) ⭐ PRIMARY + │ text (VECTORIZED) + │ keywords (VECTORIZED) + │ workTitle, workAuthor, sectionPath, chapterTitle, unitType, orderIndex │ work: {title, author} (nested) │ - │ ├─► Chunk (text2vec-transformers) ⭐ PRIMARY - │ │ text (VECTORIZED) - │ │ keywords (VECTORIZED) - │ │ sectionPath, chapterTitle, unitType, orderIndex - │ │ work: {title, author} (nested) - │ │ document: {sourceId, edition} (nested) - │ │ - │ └─► Summary (text2vec-transformers) - │ text (VECTORIZED) - │ concepts (VECTORIZED) - │ sectionPath, title, level, chunksCount - │ document: {sourceId} (nested) + └─► Summary_v2 (manual GPU vectorization) + text (VECTORIZED) + concepts (VECTORIZED) + sectionPath, title, level, chunksCount + work: {title, author} (nested) ``` **Vectorization Strategy:** -- Only `Chunk.text`, `Chunk.keywords`, `Summary.text`, `Summary.concepts` are vectorized +- Only `Chunk_v2.text`, `Chunk_v2.keywords`, `Summary_v2.text`, `Summary_v2.concepts` are vectorized +- Manual vectorization with Python GPU embedder (BAAI/bge-m3, 1024-dim, RTX 4070) - Metadata fields use `skip_vectorization=True` for 
filtering performance - Nested objects avoid joins for efficient single-query retrieval - BAAI/bge-m3 model: 1024 dimensions, 8192 token context diff --git a/generations/library_rag/fix_turings_machines.py b/generations/library_rag/fix_turings_machines.py new file mode 100644 index 0000000..03ffbaf --- /dev/null +++ b/generations/library_rag/fix_turings_machines.py @@ -0,0 +1,122 @@ +""" +Fix Turings_Machines ingestion with corrected metadata. + +The LLM returned prompt instructions instead of actual metadata. +This script: +1. Loads chunks from Turings_Machines_chunks.json +2. Corrects workTitle and workAuthor +3. Re-ingests to Weaviate with GPU embedder +""" + +import json +import sys +from pathlib import Path + +# Add current directory to path for imports +current_dir = Path(__file__).parent.absolute() +sys.path.insert(0, str(current_dir)) + +# Now import can work +import utils.weaviate_ingest as weaviate_ingest + +def fix_turings_machines(): + """Fix and re-ingest Turings_Machines with corrected metadata.""" + + # Load chunks JSON + chunks_file = Path("output/Turings_Machines/Turings_Machines_chunks.json") + + if not chunks_file.exists(): + print(f"ERROR: File not found: {chunks_file}") + return + + with open(chunks_file, "r", encoding="utf-8") as f: + data = json.load(f) + + print("Loaded chunks JSON") + print(f" - Chunks: {len(data.get('chunks', []))}") + print(f" - Current title: {data.get('metadata', {}).get('title', 'N/A')[:80]}") + print(f" - Current author: {data.get('metadata', {}).get('author', 'N/A')[:80]}") + + # Correct metadata + corrected_metadata = { + "title": "Turing's Machines", + "author": "Dorian Wiszniewski, Richard Coyne, Christopher Pierce", + "year": 2000, # Approximate - from references (Coyne 1999, etc.) 
+ "language": "en" + } + + # Update metadata + data["metadata"] = corrected_metadata + + # Update all chunks with corrected metadata + for chunk in data.get("chunks", []): + chunk["workTitle"] = corrected_metadata["title"] + chunk["workAuthor"] = corrected_metadata["author"] + chunk["year"] = corrected_metadata["year"] + + print("\nCorrected metadata:") + print(f" - Title: {corrected_metadata['title']}") + print(f" - Author: {corrected_metadata['author']}") + print(f" - Year: {corrected_metadata['year']}") + + # Prepare chunks for ingestion (format expected by ingest_document) + chunks_for_ingestion = [] + for i, chunk in enumerate(data.get("chunks", [])): + chunks_for_ingestion.append({ + "text": chunk["text"], + "sectionPath": chunk.get("section", ""), + "sectionLevel": chunk.get("section_level", 1), + "chapterTitle": "", + "canonicalReference": "", + "unitType": chunk.get("type", "main_content"), + "keywords": chunk.get("concepts", []), + "language": "en", + "orderIndex": i, + }) + + print(f"\nPrepared {len(chunks_for_ingestion)} chunks for ingestion") + + # Re-ingest to Weaviate + print("\nStarting re-ingestion with GPU embedder...") + + result = weaviate_ingest.ingest_document( + doc_name="Turings_Machines", + chunks=chunks_for_ingestion, + metadata=corrected_metadata, + language="en" + ) + + if result.get("success"): + print(f"\nRe-ingestion successful!") + print(f" - Chunks inserted: {result.get('count', 0)}") + print(f" - Work UUID: {result.get('work_uuid', 'N/A')}") + else: + print(f"\nRe-ingestion failed!") + print(f" - Error: {result.get('error', 'Unknown')}") + + # Save corrected chunks JSON + corrected_file = chunks_file.parent / f"{chunks_file.stem}_corrected.json" + with open(corrected_file, "w", encoding="utf-8") as f: + json.dump(data, f, ensure_ascii=False, indent=2) + + print(f"\nSaved corrected chunks to: {corrected_file}") + + return result + +if __name__ == "__main__": + print("=" * 70) + print("Fix Turings_Machines Ingestion") + print("=" * 
70) + + result = fix_turings_machines() + + if result and result.get("success"): + print("\n" + "=" * 70) + print("FIX COMPLETED SUCCESSFULLY") + print("=" * 70) + sys.exit(0) + else: + print("\n" + "=" * 70) + print("FIX FAILED") + print("=" * 70) + sys.exit(1) diff --git a/generations/library_rag/migrate_chunk_v2_to_none_vectorizer.py b/generations/library_rag/migrate_chunk_v2_to_none_vectorizer.py new file mode 100644 index 0000000..dcd4bb0 --- /dev/null +++ b/generations/library_rag/migrate_chunk_v2_to_none_vectorizer.py @@ -0,0 +1,355 @@ +""" +Migrate Chunk_v2 schema from TEXT2VEC_TRANSFORMERS to NONE vectorizer. + +This allows pure manual vectorization with GPU embedder, removing dependency +on Docker text2vec-transformers service. + +Steps: +1. Export all existing chunks with their vectors +2. Drop Chunk_v2 collection +3. Recreate Chunk_v2 with vectorizer=none() +4. Re-insert all chunks with their vectors +5. Verify data integrity +""" + +import weaviate +import weaviate.classes as wvc +from weaviate.classes.config import Configure, Property, DataType, VectorDistances +import sys +from pathlib import Path +import json +from typing import List, Dict, Any +import time + +# Add to path for imports +sys.path.insert(0, str(Path(__file__).parent)) + +def export_chunks(): + """Export all chunks with their vectors.""" + print("\n" + "="*70) + print("STEP 1: Exporting existing chunks") + print("="*70) + + client = weaviate.connect_to_local() + + try: + chunk_coll = client.collections.get("Chunk_v2") + + # Count total + count = chunk_coll.aggregate.over_all().total_count + print(f"Total chunks to export: {count}") + + # Export all with vectors + chunks = [] + batch_size = 1000 + + for i, obj in enumerate(chunk_coll.iterator(include_vector=True)): + if i % 100 == 0: + print(f" Exported {i}/{count} chunks...", end='\r') + + chunks.append({ + 'uuid': str(obj.uuid), + 'properties': obj.properties, + 'vector': obj.vector + }) + + print(f" Exported {len(chunks)}/{count} 
chunks... DONE") + + # Save to file + backup_file = Path("chunk_v2_backup.json") + with open(backup_file, 'w', encoding='utf-8') as f: + json.dump(chunks, f, ensure_ascii=False, indent=2) + + print(f"\nBackup saved to: {backup_file}") + print(f" File size: {backup_file.stat().st_size / 1024 / 1024:.2f} MB") + + return chunks + + finally: + client.close() + +def recreate_schema(): + """Drop and recreate Chunk_v2 with vectorizer=none().""" + print("\n" + "="*70) + print("STEP 2: Recreating Chunk_v2 schema") + print("="*70) + + client = weaviate.connect_to_local() + + try: + # Drop existing collection + print("Dropping existing Chunk_v2 collection...") + try: + client.collections.delete("Chunk_v2") + print(" Collection dropped") + except Exception as e: + print(f" Warning: {e}") + + time.sleep(2) + + # Create new collection with vectorizer=none() + print("\nCreating new Chunk_v2 with vectorizer=none()...") + + client.collections.create( + name="Chunk_v2", + description="Document chunks with manual GPU vectorization (BAAI/bge-m3, 1024-dim)", + vectorizer_config=Configure.Vectorizer.none(), # MANUAL VECTORIZATION ONLY + vector_index_config=Configure.VectorIndex.hnsw( + distance_metric=VectorDistances.COSINE, + ef_construction=128, + max_connections=32, + quantizer=Configure.VectorIndex.Quantizer.rq() + ), + properties=[ + Property(name="text", data_type=DataType.TEXT, description="Chunk text content"), + Property(name="workTitle", data_type=DataType.TEXT, skip_vectorization=True, description="Work title"), + Property(name="workAuthor", data_type=DataType.TEXT, skip_vectorization=True, description="Work author"), + Property(name="sectionPath", data_type=DataType.TEXT, skip_vectorization=True, description="Section path"), + Property(name="sectionLevel", data_type=DataType.INT, skip_vectorization=True, description="Section level"), + Property(name="chapterTitle", data_type=DataType.TEXT, skip_vectorization=True, description="Chapter title"), + 
Property(name="canonicalReference", data_type=DataType.TEXT, skip_vectorization=True, description="Canonical reference"), + Property(name="unitType", data_type=DataType.TEXT, skip_vectorization=True, description="Unit type"), + Property(name="keywords", data_type=DataType.TEXT_ARRAY, skip_vectorization=True, description="Keywords"), + Property(name="language", data_type=DataType.TEXT, skip_vectorization=True, description="Language code"), + Property(name="year", data_type=DataType.INT, skip_vectorization=True, description="Publication year"), + Property(name="orderIndex", data_type=DataType.INT, skip_vectorization=True, description="Order index"), + ] + ) + + print(" Collection created with vectorizer=none()") + + # Verify + chunk_coll = client.collections.get("Chunk_v2") + config = chunk_coll.config.get() + print(f"\nVerification:") + print(f" Vectorizer: {config.vectorizer}") + print(f" Vector index: {config.vector_index_type}") + + if str(config.vectorizer) == "Vectorizers.NONE": + print(" SUCCESS: Manual vectorization configured") + return True + else: + print(" ERROR: Vectorizer not set to NONE") + return False + + finally: + client.close() + +def reimport_chunks(chunks: List[Dict[str, Any]]): + """Re-import all chunks with their vectors.""" + print("\n" + "="*70) + print("STEP 3: Re-importing chunks with vectors") + print("="*70) + + client = weaviate.connect_to_local() + + try: + chunk_coll = client.collections.get("Chunk_v2") + + print(f"Total chunks to import: {len(chunks)}") + + # Batch import + batch_size = 50 + total_inserted = 0 + + for i in range(0, len(chunks), batch_size): + batch = chunks[i:i+batch_size] + + # Prepare DataObjects with vectors + import weaviate.classes.data as wvd + data_objects = [] + + for chunk in batch: + data_objects.append( + wvd.DataObject( + properties=chunk['properties'], + vector=chunk['vector'] + ) + ) + + # Insert batch + try: + response = chunk_coll.data.insert_many(data_objects) + total_inserted += len(batch) + 
print(f" Imported {total_inserted}/{len(chunks)} chunks...", end='\r') + except Exception as e: + print(f"\n ERROR in batch {i//batch_size + 1}: {e}") + + print(f" Imported {total_inserted}/{len(chunks)} chunks... DONE") + + # Verify count + time.sleep(2) + final_count = chunk_coll.aggregate.over_all().total_count + print(f"\nFinal count: {final_count}") + + if final_count == len(chunks): + print(" SUCCESS: All chunks imported") + return True + else: + print(f" WARNING: Expected {len(chunks)}, got {final_count}") + return False + + finally: + client.close() + +def verify_search(): + """Verify search still works with GPU embedder.""" + print("\n" + "="*70) + print("STEP 4: Verifying search functionality") + print("="*70) + + # Import GPU embedder + from memory.core import get_embedder + + client = weaviate.connect_to_local() + + try: + chunk_coll = client.collections.get("Chunk_v2") + embedder = get_embedder() + + # Test query + query = "Turing machine computation" + print(f"Test query: '{query}'") + + # Generate query vector + query_vector = embedder.embed_single(query) + print(f" Query vector shape: {query_vector.shape}") + + # Search + results = chunk_coll.query.near_vector( + near_vector=query_vector.tolist(), + limit=5, + return_metadata=wvc.query.MetadataQuery(distance=True) + ) + + print(f"\nSearch results: {len(results.objects)}") + + for i, obj in enumerate(results.objects[:3]): + similarity = 1 - obj.metadata.distance + print(f" {i+1}. 
Work: {obj.properties.get('workTitle', 'N/A')[:50]}") + print(f" Similarity: {similarity:.3f}") + + if len(results.objects) > 0: + print("\n SUCCESS: Search works with GPU embedder") + return True + else: + print("\n ERROR: No search results") + return False + + finally: + client.close() + +def test_new_insertion(): + """Test inserting new chunk with manual vector.""" + print("\n" + "="*70) + print("STEP 5: Testing new chunk insertion") + print("="*70) + + from memory.core import get_embedder + + client = weaviate.connect_to_local() + + try: + chunk_coll = client.collections.get("Chunk_v2") + embedder = get_embedder() + + # Create test chunk + test_text = "This is a test chunk to verify manual vectorization works perfectly." + test_vector = embedder.embed_single(test_text) + + print(f"Test text: '{test_text}'") + print(f"Test vector shape: {test_vector.shape}") + + # Insert with manual vector + import weaviate.classes.data as wvd + + uuid = chunk_coll.data.insert( + properties={ + 'text': test_text, + 'workTitle': 'TEST_MIGRATION', + 'workAuthor': 'Test Author', + 'sectionPath': 'Test Section', + 'language': 'en', + 'year': 2026, + 'orderIndex': 999999 + }, + vector=test_vector.tolist() + ) + + print(f"\nTest chunk inserted: {uuid}") + + # Verify insertion + obj = chunk_coll.query.fetch_object_by_id(uuid, include_vector=True) + + if obj and obj.vector and len(obj.vector) == 1024: + print(f" SUCCESS: Chunk inserted with {len(obj.vector)}-dim vector") + + # Clean up test chunk + chunk_coll.data.delete_by_id(uuid) + print(f" Test chunk deleted") + + return True + else: + print(f" ERROR: Chunk insertion failed") + return False + + finally: + client.close() + +def main(): + """Run full migration.""" + print("\n" + "="*70) + print("CHUNK_V2 SCHEMA MIGRATION: TEXT2VEC_TRANSFORMERS -> NONE") + print("GPU Embedder (BAAI/bge-m3) for Manual Vectorization") + print("="*70) + + try: + # Step 1: Export + chunks = export_chunks() + if not chunks: + print("\nERROR: No chunks 
exported") + return False + + # Step 2: Recreate schema + if not recreate_schema(): + print("\nERROR: Schema recreation failed") + return False + + # Step 3: Reimport + if not reimport_chunks(chunks): + print("\nERROR: Reimport failed") + return False + + # Step 4: Verify search + if not verify_search(): + print("\nERROR: Search verification failed") + return False + + # Step 5: Test new insertion + if not test_new_insertion(): + print("\nERROR: New insertion test failed") + return False + + print("\n" + "="*70) + print("MIGRATION COMPLETE - SUCCESS") + print("="*70) + print("\nChunk_v2 now uses:") + print(" - Vectorizer: NONE (manual vectorization only)") + print(" - GPU Embedder: BAAI/bge-m3 (1024-dim)") + print(" - All existing chunks preserved") + print(" - Search functionality verified") + print(" - New insertions working") + print("\nYou can now ingest documents with GPU embedder!") + print("text2vec-transformers is GONE forever.") + + return True + + except Exception as e: + print(f"\nFATAL ERROR: {e}") + import traceback + traceback.print_exc() + return False + +if __name__ == "__main__": + success = main() + sys.exit(0 if success else 1) diff --git a/generations/library_rag/schema.py b/generations/library_rag/schema.py index 8728c86..10502f9 100644 --- a/generations/library_rag/schema.py +++ b/generations/library_rag/schema.py @@ -5,13 +5,12 @@ Library RAG application. It provides functions to create, verify, and display the schema configuration for indexing and searching philosophical texts. Schema Architecture: - The schema follows a normalized design with denormalized nested objects - for efficient querying. The hierarchy is:: + The schema follows a denormalized design with nested objects for efficient + querying. 
The hierarchy is:: Work (metadata only) - └── Document (edition/translation instance) - ├── Chunk (vectorized text fragments) - └── Summary (vectorized chapter summaries) + ├── Chunk_v2 (vectorized text fragments) + └── Summary_v2 (vectorized chapter summaries) Collections: **Work** (no vectorization): @@ -19,27 +18,24 @@ Collections: Stores canonical metadata: title, author, year, language, genre. Not vectorized - used only for metadata and relationships. - **Document** (no vectorization): - Represents a specific edition or translation of a Work. - Contains: sourceId, edition, language, pages, TOC, hierarchy. + **Chunk_v2** (manual GPU vectorization): + Text fragments optimized for semantic search (200-800 chars). + Vectorized with Python GPU embedder (BAAI/bge-m3, 1024-dim). + Vectorized fields: text, keywords. + Non-vectorized fields: workTitle, workAuthor, sectionPath, chapterTitle, unitType, orderIndex. Includes nested Work reference for denormalized access. - **Chunk** (vectorized with text2vec-transformers): - Text fragments optimized for semantic search (200-800 chars). - Vectorized fields: text, summary, keywords. - Non-vectorized fields: sectionPath, chapterTitle, unitType, orderIndex. - Includes nested Document and Work references. - - **Summary** (vectorized with text2vec-transformers): + **Summary_v2** (manual GPU vectorization): LLM-generated chapter/section summaries for high-level search. + Vectorized with Python GPU embedder (BAAI/bge-m3, 1024-dim). Vectorized fields: text, concepts. - Includes nested Document reference. + Includes nested Work reference for denormalized access. 
Vectorization Strategy: - - Only Chunk.text, Chunk.summary, Chunk.keywords, Summary.text, and Summary.concepts are vectorized - - Uses text2vec-transformers (BAAI/bge-m3 with 1024-dim via Docker) + - Only Chunk_v2.text, Chunk_v2.keywords, Summary_v2.text, and Summary_v2.concepts are vectorized + - Manual vectorization with Python GPU embedder (BAAI/bge-m3, 1024-dim, RTX 4070) - Metadata fields use skip_vectorization=True for filtering only - - Work and Document collections have no vectorizer (metadata only) + - Work collection has no vectorizer (metadata only) Vector Index Configuration (2026-01): - **HNSW Index**: Hierarchical Navigable Small World for efficient search @@ -58,12 +54,10 @@ Migration Note (2024-12): Nested Objects: Instead of using Weaviate cross-references, we use nested objects for denormalized data access. This allows single-query retrieval of chunk - data with its Work/Document metadata without joins:: + data with its Work metadata without joins:: - Chunk.work = {title, author} - Chunk.document = {sourceId, edition} - Document.work = {title, author} - Summary.document = {sourceId} + Chunk_v2.work = {title, author} + Summary_v2.work = {title, author} Usage: From command line:: @@ -156,74 +150,6 @@ def create_work_collection(client: weaviate.WeaviateClient) -> None: ) -def create_document_collection(client: weaviate.WeaviateClient) -> None: - """Create the Document collection for edition/translation instances. - - Args: - client: Connected Weaviate client. - - Note: - Contains nested Work reference for denormalized access. 
- """ - client.collections.create( - name="Document", - description="A specific edition or translation of a work (PDF, ebook, etc.).", - vectorizer_config=wvc.Configure.Vectorizer.none(), - properties=[ - wvc.Property( - name="sourceId", - description="Unique identifier for this document (filename without extension).", - data_type=wvc.DataType.TEXT, - ), - wvc.Property( - name="edition", - description="Edition or translator (e.g., 'trad. Cousin', 'Loeb Classical Library').", - data_type=wvc.DataType.TEXT, - ), - wvc.Property( - name="language", - description="Language of this edition (e.g., 'fr', 'en').", - data_type=wvc.DataType.TEXT, - ), - wvc.Property( - name="pages", - description="Number of pages in the PDF/document.", - data_type=wvc.DataType.INT, - ), - wvc.Property( - name="chunksCount", - description="Total number of chunks extracted from this document.", - data_type=wvc.DataType.INT, - ), - wvc.Property( - name="toc", - description="Table of contents as JSON string [{title, level, page}, ...].", - data_type=wvc.DataType.TEXT, - ), - wvc.Property( - name="hierarchy", - description="Full hierarchical structure as JSON string.", - data_type=wvc.DataType.TEXT, - ), - wvc.Property( - name="createdAt", - description="Timestamp when this document was ingested.", - data_type=wvc.DataType.DATE, - ), - # Nested Work reference - wvc.Property( - name="work", - description="Reference to the Work this document is an instance of.", - data_type=wvc.DataType.OBJECT, - nested_properties=[ - wvc.Property(name="title", data_type=wvc.DataType.TEXT), - wvc.Property(name="author", data_type=wvc.DataType.TEXT), - ], - ), - ], - ) - - def create_chunk_collection(client: weaviate.WeaviateClient) -> None: """Create the Chunk collection for vectorized text fragments. 
@@ -410,7 +336,7 @@ def create_summary_collection(client: weaviate.WeaviateClient) -> None: def create_schema(client: weaviate.WeaviateClient, delete_existing: bool = True) -> None: """Create the complete Weaviate schema for Library RAG. - Creates all four collections: Work, Document, Chunk, Summary. + Creates all three collections: Work, Chunk, Summary. Args: client: Connected Weaviate client. @@ -429,16 +355,13 @@ def create_schema(client: weaviate.WeaviateClient, delete_existing: bool = True) print(" → Work (métadonnées œuvre)...") create_work_collection(client) - print(" → Document (métadonnées édition)...") - create_document_collection(client) - print(" → Chunk (fragments vectorisés)...") create_chunk_collection(client) print(" → Summary (résumés de chapitres)...") create_summary_collection(client) - print(" ✓ 4 collections créées") + print(" ✓ 3 collections créées") def verify_schema(client: weaviate.WeaviateClient) -> bool: @@ -453,7 +376,7 @@ def verify_schema(client: weaviate.WeaviateClient) -> bool: print("\n[3/4] Vérification des collections...") collections = client.collections.list_all() - expected: Set[str] = {"Work", "Document", "Chunk", "Summary"} + expected: Set[str] = {"Work", "Chunk", "Summary"} actual: Set[str] = set(collections.keys()) if expected == actual: @@ -480,7 +403,7 @@ def display_schema(client: weaviate.WeaviateClient) -> None: collections = client.collections.list_all() - for name in ["Work", "Document", "Chunk", "Summary"]: + for name in ["Work", "Chunk", "Summary"]: if name not in collections: continue @@ -523,14 +446,12 @@ def print_summary() -> None: print("=" * 80) print("\n✓ Architecture:") print(" - Work: Source unique pour author/title") - print(" - Document: Métadonnées d'édition avec référence vers Work") - print(" - Chunk: Fragments vectorisés (text + summary + keywords)") + print(" - Chunk: Fragments vectorisés (text + keywords)") print(" - Summary: Résumés de chapitres vectorisés (text + concepts)") print("\n✓ 
Vectorisation:") print(" - Work: NONE") - print(" - Document: NONE") - print(" - Chunk: text2vec (text + summary + keywords)") - print(" - Summary: text2vec (text + concepts)") + print(" - Chunk: GPU embedder (BAAI/bge-m3, 1024-dim)") + print(" - Summary: GPU embedder (BAAI/bge-m3, 1024-dim)") print("\n✓ Index Vectoriel (Optimisation 2026):") print(" - Chunk: HNSW + RQ (~75% moins de RAM)") print(" - Summary: HNSW + RQ") diff --git a/generations/library_rag/utils/types.py b/generations/library_rag/utils/types.py index 719f8ac..f57d48c 100644 --- a/generations/library_rag/utils/types.py +++ b/generations/library_rag/utils/types.py @@ -848,7 +848,7 @@ class WeaviateIngestResult(TypedDict, total=False): inserted: List of inserted chunk summaries (first 10). work: Title of the ingested work. author: Author of the ingested work. - document_uuid: UUID of created Document object (if any). + work_uuid: UUID of created Work object (if any). all_objects: Complete list of all inserted ChunkObjects. Note: @@ -863,7 +863,7 @@ class WeaviateIngestResult(TypedDict, total=False): inserted: List[Any] # List[InsertedChunkSummary] from weaviate_ingest work: str author: str - document_uuid: Optional[str] + work_uuid: Optional[str] all_objects: List[Any] # List[ChunkObject] from weaviate_ingest diff --git a/generations/library_rag/utils/weaviate_ingest.py b/generations/library_rag/utils/weaviate_ingest.py index ddfef3a..71f5e9e 100644 --- a/generations/library_rag/utils/weaviate_ingest.py +++ b/generations/library_rag/utils/weaviate_ingest.py @@ -190,9 +190,8 @@ class DeleteResult(TypedDict, total=False): Attributes: success: Whether deletion succeeded. error: Error message if deletion failed. - deleted_chunks: Number of chunks deleted from Chunk collection. - deleted_summaries: Number of summaries deleted from Summary collection. - deleted_document: Whether the Document object was deleted. + deleted_chunks: Number of chunks deleted from Chunk_v2 collection. 
+ deleted_summaries: Number of summaries deleted from Summary_v2 collection. Example: >>> result = delete_document_chunks("platon_republique") @@ -203,7 +202,6 @@ class DeleteResult(TypedDict, total=False): error: str deleted_chunks: int deleted_summaries: int - deleted_document: bool # ============================================================================= @@ -379,7 +377,8 @@ def validate_document_metadata( ) # Validate title (required for work.title nested object) - title = metadata.get("title") or metadata.get("work") + # Priority: work > original_title > title (to avoid LLM prompt instructions) + title = metadata.get("work") or metadata.get("original_title") or metadata.get("title") if not title or not str(title).strip(): raise ValueError( f"Invalid metadata for '{doc_name}': 'title' is missing or empty. " @@ -388,7 +387,8 @@ def validate_document_metadata( ) # Validate author (required for work.author nested object) - author = metadata.get("author") + # Priority: original_author > author (to avoid LLM prompt instructions) + author = metadata.get("original_author") or metadata.get("author") if not author or not str(author).strip(): raise ValueError( f"Invalid metadata for '{doc_name}': 'author' is missing or empty. 
" @@ -649,8 +649,10 @@ def create_or_get_work( logger.warning(f"Collection Work non trouvée: {e}") return None - title = metadata.get("title") or doc_name - author = metadata.get("author") or "Inconnu" + # Priority: work > original_title > title (to avoid LLM prompt instructions) + title = metadata.get("work") or metadata.get("original_title") or metadata.get("title") or doc_name + # Priority: original_author > author (to avoid LLM prompt instructions) + author = metadata.get("original_author") or metadata.get("author") or "Inconnu" year = metadata.get("year", 0) if metadata.get("year") else 0 try: @@ -686,76 +688,6 @@ def create_or_get_work( return None -def ingest_document_metadata( - client: WeaviateClient, - doc_name: str, - metadata: Dict[str, Any], - toc: List[Dict[str, Any]], - hierarchy: Dict[str, Any], - chunks_count: int, - pages: int, -) -> Optional[str]: - """Insert document metadata into the Document collection. - - Creates a Document object containing metadata about a processed document, - including its table of contents, hierarchy structure, and statistics. - - Args: - client: Active Weaviate client connection. - doc_name: Unique document identifier (sourceId). - metadata: Extracted metadata dict with keys: title, author, language. - toc: Table of contents as a hierarchical list of dicts. - hierarchy: Complete document hierarchy structure. - chunks_count: Total number of chunks in the document. - pages: Number of pages in the source PDF. - - Returns: - UUID string of the created Document object, or None if insertion failed. - - Example: - >>> with get_weaviate_client() as client: - ... uuid = ingest_document_metadata( - ... client, - ... doc_name="platon_republique", - ... metadata={"title": "La Republique", "author": "Platon"}, - ... toc=[{"title": "Livre I", "level": 1}], - ... hierarchy={}, - ... chunks_count=150, - ... pages=300, - ... ) - - Note: - The TOC and hierarchy are serialized to JSON strings for storage. 
- The createdAt field is set to the current timestamp. - """ - try: - doc_collection: Collection[Any, Any] = client.collections.get("Document") - except Exception as e: - logger.warning(f"Collection Document non trouvée: {e}") - return None - - try: - doc_obj: Dict[str, Any] = { - "sourceId": doc_name, - "title": metadata.get("title") or doc_name, - "author": metadata.get("author") or "Inconnu", - "toc": json.dumps(toc, ensure_ascii=False) if toc else "[]", - "hierarchy": json.dumps(hierarchy, ensure_ascii=False) if hierarchy else "{}", - "pages": pages, - "chunksCount": chunks_count, - "language": metadata.get("language", "fr"), - "createdAt": datetime.now(timezone.utc).isoformat(), - } - - result = doc_collection.data.insert(doc_obj) - logger.info(f"Document metadata ingéré: {doc_name}") - return str(result) - - except Exception as e: - logger.warning(f"Erreur ingestion document metadata: {e}") - return None - - def ingest_summaries( client: WeaviateClient, doc_name: str, @@ -897,14 +829,13 @@ def ingest_document( toc: Optional[List[Dict[str, Any]]] = None, hierarchy: Optional[Dict[str, Any]] = None, pages: int = 0, - ingest_document_collection: bool = True, ingest_summary_collection: bool = False, ) -> IngestResult: """Ingest document chunks into Weaviate with nested objects. - Main ingestion function that inserts chunks into the Chunk collection - with nested Work and Document references. Optionally also creates - entries in the Document and Summary collections. + Main ingestion function that inserts chunks into the Chunk_v2 collection + with nested Work references. Optionally also creates entries in the + Summary_v2 collection. This function uses batch insertion for optimal performance and constructs proper nested objects for filtering capabilities. @@ -922,12 +853,10 @@ def ingest_document( - author: Author name - edition (optional): Edition identifier language: ISO language code. Defaults to "fr". 
- toc: Optional table of contents for Document/Summary collections. + toc: Optional table of contents for Summary collection. hierarchy: Optional complete document hierarchy structure. pages: Number of pages in source document. Defaults to 0. - ingest_document_collection: If True, also insert into Document - collection. Defaults to True. - ingest_summary_collection: If True, also insert into Summary + ingest_summary_collection: If True, also insert into Summary_v2 collection (requires toc). Defaults to False. Returns: @@ -937,7 +866,7 @@ def ingest_document( - inserted: Preview of first 10 inserted chunks - work: Work title - author: Author name - - document_uuid: UUID of Document object (if created) + - work_uuid: UUID of Work object (if created) - all_objects: Complete list of inserted ChunkObjects - error: Error message (if failed) @@ -995,14 +924,6 @@ def ingest_document( client, doc_name, metadata, pages ) - # Insérer les métadonnées du document (optionnel) - doc_uuid: Optional[str] = None - if ingest_document_collection: - doc_uuid = ingest_document_metadata( - client, doc_name, metadata, toc or [], hierarchy or {}, - len(chunks), pages - ) - # Insérer les résumés (optionnel) if ingest_summary_collection and toc: ingest_summaries(client, doc_name, toc, {}) @@ -1018,8 +939,10 @@ def ingest_document( objects_to_insert: List[ChunkObject] = [] # Extraire et valider les métadonnées (validation déjà faite, juste extraction) - title: str = metadata.get("title") or metadata.get("work") or doc_name - author: str = metadata.get("author") or "Inconnu" + # Priority: work > original_title > title (to avoid LLM prompt instructions) + title: str = metadata.get("work") or metadata.get("original_title") or metadata.get("title") or doc_name + # Priority: original_author > author (to avoid LLM prompt instructions) + author: str = metadata.get("original_author") or metadata.get("author") or "Inconnu" edition: str = metadata.get("edition", "") for idx, chunk in enumerate(chunks): 
@@ -1153,7 +1076,7 @@ def ingest_document( inserted=inserted_summary, work=title, author=author, - document_uuid=doc_uuid, + work_uuid=work_uuid, all_objects=objects_to_insert, ) @@ -1169,9 +1092,8 @@ def ingest_document( def delete_document_chunks(doc_name: str) -> DeleteResult: """Delete all data for a document from Weaviate collections. - Removes chunks, summaries, and the document metadata from their - respective collections. Uses nested object filtering to find - related objects. + Removes chunks and summaries from their respective collections. + Uses nested object filtering to find related objects. This function is useful for re-processing a document after changes to the processing pipeline or to clean up test data. @@ -1184,7 +1106,6 @@ def delete_document_chunks(doc_name: str) -> DeleteResult: - success: True if deletion succeeded (even if no objects found) - deleted_chunks: Number of Chunk objects deleted - deleted_summaries: Number of Summary objects deleted - - deleted_document: True if Document object was deleted - error: Error message (if failed) Example: @@ -1227,23 +1148,12 @@ def delete_document_chunks(doc_name: str) -> DeleteResult: except Exception as e: logger.warning(f"Erreur suppression summaries: {e}") - # Supprimer le document - try: - doc_collection: Collection[Any, Any] = client.collections.get("Document") - result = doc_collection.data.delete_many( - where=wvq.Filter.by_property("sourceId").equal(doc_name) - ) - deleted_document = result.successful > 0 - except Exception as e: - logger.warning(f"Erreur suppression document: {e}") - logger.info(f"Suppression: {deleted_chunks} chunks, {deleted_summaries} summaries pour {doc_name}") return DeleteResult( success=True, deleted_chunks=deleted_chunks, deleted_summaries=deleted_summaries, - deleted_document=deleted_document, ) except Exception as e: