feat: Remove Document collection from schema
BREAKING CHANGE: Document collection removed from Weaviate schema

Architecture simplification:
- Removed Document collection (unused by Flask app)
- All metadata now in Work collection or file-based (chunks.json)
- Simplified from 4 collections to 3 (Work, Chunk_v2, Summary_v2)

Schema changes (schema.py):
- Removed create_document_collection() function
- Updated verify_schema() to expect 3 collections
- Updated display_schema() and print_summary()
- Updated documentation to reflect Chunk_v2/Summary_v2

Ingestion changes (weaviate_ingest.py):
- Removed ingest_document_metadata() function
- Removed ingest_document_collection parameter
- Updated IngestResult to use work_uuid instead of document_uuid
- Removed Document deletion from delete_document_chunks()
- Updated DeleteResult TypedDict

Type changes (types.py):
- WeaviateIngestResult: document_uuid → work_uuid

Documentation updates (.claude/CLAUDE.md):
- Updated schema diagram (4 → 3 collections)
- Removed Document references
- Updated to reflect manual GPU vectorization

Database changes:
- Deleted Document collection (13 objects)
- Deleted Chunk collection (0 objects, old schema)

Benefits:
- Simpler architecture (3 collections vs 4)
- No redundant data storage
- All metadata available via Work or file-based storage
- Reduced Weaviate memory footprint

Migration:
- See DOCUMENT_COLLECTION_ANALYSIS.md for detailed analysis
- See migrate_chunk_v2_to_none_vectorizer.py for vectorizer migration

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
@@ -138,34 +138,30 @@ The core of the application is `utils/pdf_pipeline.py`, which orchestrates a 10-
 - `use_ocr_annotations=True` - OCR with annotations (3x cost, better TOC)
 - `ingest_to_weaviate=True` - Insert chunks into Weaviate
 
-### Weaviate Schema (4 Collections)
+### Weaviate Schema (3 Collections)
 
-Defined in `schema.py`, the database uses a normalized design with denormalized nested objects:
+Defined in `schema.py`, the database uses a denormalized design with nested objects:
 
 ```
 Work (no vectorizer)
   title, author, year, language, genre
   │
-  ├─► Document (no vectorizer)
-  │     sourceId, edition, pages, toc, hierarchy
-  │
+  ├─► Chunk_v2 (manual GPU vectorization) ⭐ PRIMARY
+  │     text (VECTORIZED)
+  │     keywords (VECTORIZED)
+  │     workTitle, workAuthor, sectionPath, chapterTitle, unitType, orderIndex
+  │     work: {title, author} (nested)
+  │
-  │   ├─► Chunk (text2vec-transformers) ⭐ PRIMARY
-  │   │     text (VECTORIZED)
-  │   │     keywords (VECTORIZED)
-  │   │     sectionPath, chapterTitle, unitType, orderIndex
-  │   │     work: {title, author} (nested)
-  │   │     document: {sourceId, edition} (nested)
-  │   │
-  │   └─► Summary (text2vec-transformers)
-  │         text (VECTORIZED)
-  │         concepts (VECTORIZED)
-  │         sectionPath, title, level, chunksCount
-  │         document: {sourceId} (nested)
+  └─► Summary_v2 (manual GPU vectorization)
+        text (VECTORIZED)
+        concepts (VECTORIZED)
+        sectionPath, title, level, chunksCount
+        work: {title, author} (nested)
 ```
 
 **Vectorization Strategy:**
-- Only `Chunk.text`, `Chunk.keywords`, `Summary.text`, `Summary.concepts` are vectorized
+- Only `Chunk_v2.text`, `Chunk_v2.keywords`, `Summary_v2.text`, `Summary_v2.concepts` are vectorized
+- Manual vectorization with Python GPU embedder (BAAI/bge-m3, 1024-dim, RTX 4070)
 - Metadata fields use `skip_vectorization=True` for filtering performance
 - Nested objects avoid joins for efficient single-query retrieval
+- BAAI/bge-m3 model: 1024 dimensions, 8192 token context
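Since the Chunk_v2 index uses cosine distance, the similarity reported by searches is simply 1 − distance. A minimal, dependency-free sketch of that relationship (the 3-dim vectors here are toy stand-ins for the 1024-dim bge-m3 embeddings):

```python
import math

def cosine_distance(a, b):
    # Cosine distance as used by the HNSW index: 1 - cos(a, b)
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return 1.0 - dot / (norm_a * norm_b)

# Vectors pointing the same way: distance 0, similarity 1
d = cosine_distance([1.0, 2.0, 3.0], [2.0, 4.0, 6.0])
similarity = 1.0 - d
```

This is why magnitude differences between embeddings do not affect ranking; only direction matters.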
122 generations/library_rag/fix_turings_machines.py Normal file
@@ -0,0 +1,122 @@
"""
Fix Turings_Machines ingestion with corrected metadata.

The LLM returned prompt instructions instead of actual metadata.
This script:
1. Loads chunks from Turings_Machines_chunks.json
2. Corrects workTitle and workAuthor
3. Re-ingests to Weaviate with GPU embedder
"""

import json
import sys
from pathlib import Path

# Add current directory to path for imports
current_dir = Path(__file__).parent.absolute()
sys.path.insert(0, str(current_dir))

# Now the local import can resolve
import utils.weaviate_ingest as weaviate_ingest


def fix_turings_machines():
    """Fix and re-ingest Turings_Machines with corrected metadata."""

    # Load chunks JSON
    chunks_file = Path("output/Turings_Machines/Turings_Machines_chunks.json")

    if not chunks_file.exists():
        print(f"ERROR: File not found: {chunks_file}")
        return

    with open(chunks_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    print("Loaded chunks JSON")
    print(f"  - Chunks: {len(data.get('chunks', []))}")
    print(f"  - Current title: {data.get('metadata', {}).get('title', 'N/A')[:80]}")
    print(f"  - Current author: {data.get('metadata', {}).get('author', 'N/A')[:80]}")

    # Correct metadata
    corrected_metadata = {
        "title": "Turing's Machines",
        "author": "Dorian Wiszniewski, Richard Coyne, Christopher Pierce",
        "year": 2000,  # Approximate - from references (Coyne 1999, etc.)
        "language": "en"
    }

    # Update metadata
    data["metadata"] = corrected_metadata

    # Update all chunks with corrected metadata
    for chunk in data.get("chunks", []):
        chunk["workTitle"] = corrected_metadata["title"]
        chunk["workAuthor"] = corrected_metadata["author"]
        chunk["year"] = corrected_metadata["year"]

    print("\nCorrected metadata:")
    print(f"  - Title: {corrected_metadata['title']}")
    print(f"  - Author: {corrected_metadata['author']}")
    print(f"  - Year: {corrected_metadata['year']}")

    # Prepare chunks for ingestion (format expected by ingest_document)
    chunks_for_ingestion = []
    for i, chunk in enumerate(data.get("chunks", [])):
        chunks_for_ingestion.append({
            "text": chunk["text"],
            "sectionPath": chunk.get("section", ""),
            "sectionLevel": chunk.get("section_level", 1),
            "chapterTitle": "",
            "canonicalReference": "",
            "unitType": chunk.get("type", "main_content"),
            "keywords": chunk.get("concepts", []),
            "language": "en",
            "orderIndex": i,
        })

    print(f"\nPrepared {len(chunks_for_ingestion)} chunks for ingestion")

    # Re-ingest to Weaviate
    print("\nStarting re-ingestion with GPU embedder...")

    result = weaviate_ingest.ingest_document(
        doc_name="Turings_Machines",
        chunks=chunks_for_ingestion,
        metadata=corrected_metadata,
        language="en"
    )

    if result.get("success"):
        print("\nRe-ingestion successful!")
        print(f"  - Chunks inserted: {result.get('count', 0)}")
        print(f"  - Work UUID: {result.get('work_uuid', 'N/A')}")
    else:
        print("\nRe-ingestion failed!")
        print(f"  - Error: {result.get('error', 'Unknown')}")

    # Save corrected chunks JSON
    corrected_file = chunks_file.parent / f"{chunks_file.stem}_corrected.json"
    with open(corrected_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"\nSaved corrected chunks to: {corrected_file}")

    return result


if __name__ == "__main__":
    print("=" * 70)
    print("Fix Turings_Machines Ingestion")
    print("=" * 70)

    result = fix_turings_machines()

    if result and result.get("success"):
        print("\n" + "=" * 70)
        print("FIX COMPLETED SUCCESSFULLY")
        print("=" * 70)
        sys.exit(0)
    else:
        print("\n" + "=" * 70)
        print("FIX FAILED")
        print("=" * 70)
        sys.exit(1)
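The chunk-preparation loop in the script above is a pure field mapping, so it could be factored into a standalone helper that is easy to unit-test. A sketch under that assumption (`prepare_chunks` is a hypothetical refactor, not a function in the codebase; the key names mirror the script):

```python
def prepare_chunks(raw_chunks, language="en"):
    """Map raw chunk dicts to the shape the ingestion function expects."""
    prepared = []
    for i, chunk in enumerate(raw_chunks):
        prepared.append({
            "text": chunk["text"],
            "sectionPath": chunk.get("section", ""),
            "sectionLevel": chunk.get("section_level", 1),
            "chapterTitle": "",
            "canonicalReference": "",
            "unitType": chunk.get("type", "main_content"),
            "keywords": chunk.get("concepts", []),
            "language": language,
            "orderIndex": i,  # position in the document, used for ordering
        })
    return prepared

out = prepare_chunks([{"text": "Hello", "section": "Intro", "concepts": ["ai"]}])
```

Keeping the mapping separate from the Weaviate call makes the defaulting behavior (missing `section`, missing `type`) testable without a running database.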
355 generations/library_rag/migrate_chunk_v2_to_none_vectorizer.py Normal file
@@ -0,0 +1,355 @@
"""
Migrate Chunk_v2 schema from TEXT2VEC_TRANSFORMERS to NONE vectorizer.

This allows pure manual vectorization with the GPU embedder, removing the
dependency on the Docker text2vec-transformers service.

Steps:
1. Export all existing chunks with their vectors
2. Drop Chunk_v2 collection
3. Recreate Chunk_v2 with vectorizer=none()
4. Re-insert all chunks with their vectors
5. Verify data integrity
"""

import json
import sys
import time
from pathlib import Path
from typing import Any, Dict, List

import weaviate
import weaviate.classes as wvc
import weaviate.classes.data as wvd
from weaviate.classes.config import Configure, DataType, Property, VectorDistances

# Add to path for imports
sys.path.insert(0, str(Path(__file__).parent))


def export_chunks() -> List[Dict[str, Any]]:
    """Export all chunks with their vectors."""
    print("\n" + "=" * 70)
    print("STEP 1: Exporting existing chunks")
    print("=" * 70)

    client = weaviate.connect_to_local()

    try:
        chunk_coll = client.collections.get("Chunk_v2")

        # Count total
        count = chunk_coll.aggregate.over_all().total_count
        print(f"Total chunks to export: {count}")

        # Export all with vectors
        chunks = []

        for i, obj in enumerate(chunk_coll.iterator(include_vector=True)):
            if i % 100 == 0:
                print(f"  Exported {i}/{count} chunks...", end="\r")

            chunks.append({
                "uuid": str(obj.uuid),
                "properties": obj.properties,
                "vector": obj.vector,
            })

        print(f"  Exported {len(chunks)}/{count} chunks... DONE")

        # Save to file
        backup_file = Path("chunk_v2_backup.json")
        with open(backup_file, "w", encoding="utf-8") as f:
            json.dump(chunks, f, ensure_ascii=False, indent=2)

        print(f"\nBackup saved to: {backup_file}")
        print(f"  File size: {backup_file.stat().st_size / 1024 / 1024:.2f} MB")

        return chunks

    finally:
        client.close()


def recreate_schema() -> bool:
    """Drop and recreate Chunk_v2 with vectorizer=none()."""
    print("\n" + "=" * 70)
    print("STEP 2: Recreating Chunk_v2 schema")
    print("=" * 70)

    client = weaviate.connect_to_local()

    try:
        # Drop existing collection
        print("Dropping existing Chunk_v2 collection...")
        try:
            client.collections.delete("Chunk_v2")
            print("  Collection dropped")
        except Exception as e:
            print(f"  Warning: {e}")

        time.sleep(2)

        # Create new collection with vectorizer=none()
        print("\nCreating new Chunk_v2 with vectorizer=none()...")

        client.collections.create(
            name="Chunk_v2",
            description="Document chunks with manual GPU vectorization (BAAI/bge-m3, 1024-dim)",
            vectorizer_config=Configure.Vectorizer.none(),  # MANUAL VECTORIZATION ONLY
            vector_index_config=Configure.VectorIndex.hnsw(
                distance_metric=VectorDistances.COSINE,
                ef_construction=128,
                max_connections=32,
                quantizer=Configure.VectorIndex.Quantizer.rq(),
            ),
            properties=[
                Property(name="text", data_type=DataType.TEXT, description="Chunk text content"),
                Property(name="workTitle", data_type=DataType.TEXT, skip_vectorization=True, description="Work title"),
                Property(name="workAuthor", data_type=DataType.TEXT, skip_vectorization=True, description="Work author"),
                Property(name="sectionPath", data_type=DataType.TEXT, skip_vectorization=True, description="Section path"),
                Property(name="sectionLevel", data_type=DataType.INT, skip_vectorization=True, description="Section level"),
                Property(name="chapterTitle", data_type=DataType.TEXT, skip_vectorization=True, description="Chapter title"),
                Property(name="canonicalReference", data_type=DataType.TEXT, skip_vectorization=True, description="Canonical reference"),
                Property(name="unitType", data_type=DataType.TEXT, skip_vectorization=True, description="Unit type"),
                Property(name="keywords", data_type=DataType.TEXT_ARRAY, skip_vectorization=True, description="Keywords"),
                Property(name="language", data_type=DataType.TEXT, skip_vectorization=True, description="Language code"),
                Property(name="year", data_type=DataType.INT, skip_vectorization=True, description="Publication year"),
                Property(name="orderIndex", data_type=DataType.INT, skip_vectorization=True, description="Order index"),
            ],
        )

        print("  Collection created with vectorizer=none()")

        # Verify
        chunk_coll = client.collections.get("Chunk_v2")
        config = chunk_coll.config.get()
        print("\nVerification:")
        print(f"  Vectorizer: {config.vectorizer}")
        print(f"  Vector index: {config.vector_index_type}")

        if str(config.vectorizer) == "Vectorizers.NONE":
            print("  SUCCESS: Manual vectorization configured")
            return True
        else:
            print("  ERROR: Vectorizer not set to NONE")
            return False

    finally:
        client.close()


def reimport_chunks(chunks: List[Dict[str, Any]]) -> bool:
    """Re-import all chunks with their vectors."""
    print("\n" + "=" * 70)
    print("STEP 3: Re-importing chunks with vectors")
    print("=" * 70)

    client = weaviate.connect_to_local()

    try:
        chunk_coll = client.collections.get("Chunk_v2")

        print(f"Total chunks to import: {len(chunks)}")

        # Batch import
        batch_size = 50
        total_inserted = 0

        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i + batch_size]

            # Prepare DataObjects with vectors
            data_objects = [
                wvd.DataObject(properties=chunk["properties"], vector=chunk["vector"])
                for chunk in batch
            ]

            # Insert batch
            try:
                chunk_coll.data.insert_many(data_objects)
                total_inserted += len(batch)
                print(f"  Imported {total_inserted}/{len(chunks)} chunks...", end="\r")
            except Exception as e:
                print(f"\n  ERROR in batch {i // batch_size + 1}: {e}")

        print(f"  Imported {total_inserted}/{len(chunks)} chunks... DONE")

        # Verify count
        time.sleep(2)
        final_count = chunk_coll.aggregate.over_all().total_count
        print(f"\nFinal count: {final_count}")

        if final_count == len(chunks):
            print("  SUCCESS: All chunks imported")
            return True
        else:
            print(f"  WARNING: Expected {len(chunks)}, got {final_count}")
            return False

    finally:
        client.close()


def verify_search() -> bool:
    """Verify search still works with the GPU embedder."""
    print("\n" + "=" * 70)
    print("STEP 4: Verifying search functionality")
    print("=" * 70)

    # Import GPU embedder
    from memory.core import get_embedder

    client = weaviate.connect_to_local()

    try:
        chunk_coll = client.collections.get("Chunk_v2")
        embedder = get_embedder()

        # Test query
        query = "Turing machine computation"
        print(f"Test query: '{query}'")

        # Generate query vector
        query_vector = embedder.embed_single(query)
        print(f"  Query vector shape: {query_vector.shape}")

        # Search
        results = chunk_coll.query.near_vector(
            near_vector=query_vector.tolist(),
            limit=5,
            return_metadata=wvc.query.MetadataQuery(distance=True),
        )

        print(f"\nSearch results: {len(results.objects)}")

        for i, obj in enumerate(results.objects[:3]):
            similarity = 1 - obj.metadata.distance
            print(f"  {i + 1}. Work: {obj.properties.get('workTitle', 'N/A')[:50]}")
            print(f"     Similarity: {similarity:.3f}")

        if len(results.objects) > 0:
            print("\n  SUCCESS: Search works with GPU embedder")
            return True
        else:
            print("\n  ERROR: No search results")
            return False

    finally:
        client.close()


def test_new_insertion() -> bool:
    """Test inserting a new chunk with a manual vector."""
    print("\n" + "=" * 70)
    print("STEP 5: Testing new chunk insertion")
    print("=" * 70)

    from memory.core import get_embedder

    client = weaviate.connect_to_local()

    try:
        chunk_coll = client.collections.get("Chunk_v2")
        embedder = get_embedder()

        # Create test chunk
        test_text = "This is a test chunk to verify manual vectorization works perfectly."
        test_vector = embedder.embed_single(test_text)

        print(f"Test text: '{test_text}'")
        print(f"Test vector shape: {test_vector.shape}")

        # Insert with manual vector
        uuid = chunk_coll.data.insert(
            properties={
                "text": test_text,
                "workTitle": "TEST_MIGRATION",
                "workAuthor": "Test Author",
                "sectionPath": "Test Section",
                "language": "en",
                "year": 2026,
                "orderIndex": 999999,
            },
            vector=test_vector.tolist(),
        )

        print(f"\nTest chunk inserted: {uuid}")

        # Verify insertion
        obj = chunk_coll.query.fetch_object_by_id(uuid, include_vector=True)

        if obj and obj.vector and len(obj.vector) == 1024:
            print(f"  SUCCESS: Chunk inserted with {len(obj.vector)}-dim vector")

            # Clean up test chunk
            chunk_coll.data.delete_by_id(uuid)
            print("  Test chunk deleted")

            return True
        else:
            print("  ERROR: Chunk insertion failed")
            return False

    finally:
        client.close()


def main() -> bool:
    """Run the full migration."""
    print("\n" + "=" * 70)
    print("CHUNK_V2 SCHEMA MIGRATION: TEXT2VEC_TRANSFORMERS -> NONE")
    print("GPU Embedder (BAAI/bge-m3) for Manual Vectorization")
    print("=" * 70)

    try:
        # Step 1: Export
        chunks = export_chunks()
        if not chunks:
            print("\nERROR: No chunks exported")
            return False

        # Step 2: Recreate schema
        if not recreate_schema():
            print("\nERROR: Schema recreation failed")
            return False

        # Step 3: Reimport
        if not reimport_chunks(chunks):
            print("\nERROR: Reimport failed")
            return False

        # Step 4: Verify search
        if not verify_search():
            print("\nERROR: Search verification failed")
            return False

        # Step 5: Test new insertion
        if not test_new_insertion():
            print("\nERROR: New insertion test failed")
            return False

        print("\n" + "=" * 70)
        print("MIGRATION COMPLETE - SUCCESS")
        print("=" * 70)
        print("\nChunk_v2 now uses:")
        print("  - Vectorizer: NONE (manual vectorization only)")
        print("  - GPU Embedder: BAAI/bge-m3 (1024-dim)")
        print("  - All existing chunks preserved")
        print("  - Search functionality verified")
        print("  - New insertions working")
        print("\nYou can now ingest documents with the GPU embedder!")
        print("text2vec-transformers is no longer required.")

        return True

    except Exception as e:
        print(f"\nFATAL ERROR: {e}")
        import traceback
        traceback.print_exc()
        return False


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)
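The re-import step above walks the exported list in fixed-size slices before calling the batch insert. The slicing itself is pure and worth seeing in isolation (a sketch; the batch size of 50 matches the script, everything else here is illustrative):

```python
def batches(items, batch_size=50):
    """Yield consecutive slices of at most batch_size items."""
    for i in range(0, len(items), batch_size):
        # The final slice may be shorter than batch_size
        yield items[i:i + batch_size]

sizes = [len(b) for b in batches(list(range(120)), 50)]
```

The last batch carries the remainder, which is why the script tracks `total_inserted` with `len(batch)` rather than assuming full batches.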
@@ -5,13 +5,12 @@ Library RAG application. It provides functions to create, verify, and display
 the schema configuration for indexing and searching philosophical texts.
 
 Schema Architecture:
-    The schema follows a normalized design with denormalized nested objects
-    for efficient querying. The hierarchy is::
+    The schema follows a denormalized design with nested objects for efficient
+    querying. The hierarchy is::
 
         Work (metadata only)
-        └── Document (edition/translation instance)
-            ├── Chunk (vectorized text fragments)
-            └── Summary (vectorized chapter summaries)
+        ├── Chunk_v2 (vectorized text fragments)
+        └── Summary_v2 (vectorized chapter summaries)
 
 Collections:
     **Work** (no vectorization):
@@ -19,27 +18,24 @@ Collections:
         Stores canonical metadata: title, author, year, language, genre.
         Not vectorized - used only for metadata and relationships.
 
-    **Document** (no vectorization):
-        Represents a specific edition or translation of a Work.
-        Contains: sourceId, edition, language, pages, TOC, hierarchy.
+    **Chunk_v2** (manual GPU vectorization):
+        Text fragments optimized for semantic search (200-800 chars).
+        Vectorized with Python GPU embedder (BAAI/bge-m3, 1024-dim).
+        Vectorized fields: text, keywords.
+        Non-vectorized fields: workTitle, workAuthor, sectionPath, chapterTitle, unitType, orderIndex.
+        Includes nested Work reference for denormalized access.
 
-    **Chunk** (vectorized with text2vec-transformers):
-        Text fragments optimized for semantic search (200-800 chars).
-        Vectorized fields: text, summary, keywords.
-        Non-vectorized fields: sectionPath, chapterTitle, unitType, orderIndex.
-        Includes nested Document and Work references.
-
-    **Summary** (vectorized with text2vec-transformers):
+    **Summary_v2** (manual GPU vectorization):
         LLM-generated chapter/section summaries for high-level search.
+        Vectorized with Python GPU embedder (BAAI/bge-m3, 1024-dim).
         Vectorized fields: text, concepts.
-        Includes nested Document reference.
+        Includes nested Work reference for denormalized access.
 
 Vectorization Strategy:
-    - Only Chunk.text, Chunk.summary, Chunk.keywords, Summary.text, and Summary.concepts are vectorized
-    - Uses text2vec-transformers (BAAI/bge-m3 with 1024-dim via Docker)
+    - Only Chunk_v2.text, Chunk_v2.keywords, Summary_v2.text, and Summary_v2.concepts are vectorized
+    - Manual vectorization with Python GPU embedder (BAAI/bge-m3, 1024-dim, RTX 4070)
     - Metadata fields use skip_vectorization=True for filtering only
-    - Work and Document collections have no vectorizer (metadata only)
+    - Work collection has no vectorizer (metadata only)
 
 Vector Index Configuration (2026-01):
     - **HNSW Index**: Hierarchical Navigable Small World for efficient search
@@ -58,12 +54,10 @@ Migration Note (2024-12):
 Nested Objects:
     Instead of using Weaviate cross-references, we use nested objects for
     denormalized data access. This allows single-query retrieval of chunk
-    data with its Work/Document metadata without joins::
+    data with its Work metadata without joins::
 
-        Chunk.work = {title, author}
-        Chunk.document = {sourceId, edition}
-        Document.work = {title, author}
-        Summary.document = {sourceId}
+        Chunk_v2.work = {title, author}
+        Summary_v2.work = {title, author}
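The denormalization described above trades a little storage for read simplicity: every chunk carries a copy of its Work metadata, so one fetch returns both text and provenance with no second lookup. A toy illustration with plain dicts standing in for Weaviate objects (the example values are drawn from elsewhere in this codebase, not from a real query):

```python
# A Chunk_v2 object as stored: work metadata embedded, no join needed
chunk = {
    "text": "Premier fragment du Livre I...",
    "sectionPath": "Livre I",
    "work": {"title": "La Republique", "author": "Platon"},
}

# A single retrieval yields the text and its work metadata together
author = chunk["work"]["author"]
```

The cost is that renaming a Work means rewriting every chunk that embeds it; the schema accepts that because works are effectively immutable once ingested.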
 
 Usage:
     From command line::
@@ -156,74 +150,6 @@ def create_work_collection(client: weaviate.WeaviateClient) -> None:
     )
 
 
-def create_document_collection(client: weaviate.WeaviateClient) -> None:
-    """Create the Document collection for edition/translation instances.
-
-    Args:
-        client: Connected Weaviate client.
-
-    Note:
-        Contains nested Work reference for denormalized access.
-    """
-    client.collections.create(
-        name="Document",
-        description="A specific edition or translation of a work (PDF, ebook, etc.).",
-        vectorizer_config=wvc.Configure.Vectorizer.none(),
-        properties=[
-            wvc.Property(
-                name="sourceId",
-                description="Unique identifier for this document (filename without extension).",
-                data_type=wvc.DataType.TEXT,
-            ),
-            wvc.Property(
-                name="edition",
-                description="Edition or translator (e.g., 'trad. Cousin', 'Loeb Classical Library').",
-                data_type=wvc.DataType.TEXT,
-            ),
-            wvc.Property(
-                name="language",
-                description="Language of this edition (e.g., 'fr', 'en').",
-                data_type=wvc.DataType.TEXT,
-            ),
-            wvc.Property(
-                name="pages",
-                description="Number of pages in the PDF/document.",
-                data_type=wvc.DataType.INT,
-            ),
-            wvc.Property(
-                name="chunksCount",
-                description="Total number of chunks extracted from this document.",
-                data_type=wvc.DataType.INT,
-            ),
-            wvc.Property(
-                name="toc",
-                description="Table of contents as JSON string [{title, level, page}, ...].",
-                data_type=wvc.DataType.TEXT,
-            ),
-            wvc.Property(
-                name="hierarchy",
-                description="Full hierarchical structure as JSON string.",
-                data_type=wvc.DataType.TEXT,
-            ),
-            wvc.Property(
-                name="createdAt",
-                description="Timestamp when this document was ingested.",
-                data_type=wvc.DataType.DATE,
-            ),
-            # Nested Work reference
-            wvc.Property(
-                name="work",
-                description="Reference to the Work this document is an instance of.",
-                data_type=wvc.DataType.OBJECT,
-                nested_properties=[
-                    wvc.Property(name="title", data_type=wvc.DataType.TEXT),
-                    wvc.Property(name="author", data_type=wvc.DataType.TEXT),
-                ],
-            ),
-        ],
-    )
-
-
 def create_chunk_collection(client: weaviate.WeaviateClient) -> None:
     """Create the Chunk collection for vectorized text fragments.
@@ -410,7 +336,7 @@ def create_summary_collection(client: weaviate.WeaviateClient) -> None:
 def create_schema(client: weaviate.WeaviateClient, delete_existing: bool = True) -> None:
     """Create the complete Weaviate schema for Library RAG.
 
-    Creates all four collections: Work, Document, Chunk, Summary.
+    Creates all three collections: Work, Chunk, Summary.
 
     Args:
         client: Connected Weaviate client.
@@ -429,16 +355,13 @@ def create_schema(client: weaviate.WeaviateClient, delete_existing: bool = True)
     print("  → Work (métadonnées œuvre)...")
     create_work_collection(client)
 
-    print("  → Document (métadonnées édition)...")
-    create_document_collection(client)
-
     print("  → Chunk (fragments vectorisés)...")
     create_chunk_collection(client)
 
     print("  → Summary (résumés de chapitres)...")
     create_summary_collection(client)
 
-    print("  ✓ 4 collections créées")
+    print("  ✓ 3 collections créées")
 
 
 def verify_schema(client: weaviate.WeaviateClient) -> bool:
@@ -453,7 +376,7 @@ def verify_schema(client: weaviate.WeaviateClient) -> bool:
     print("\n[3/4] Vérification des collections...")
     collections = client.collections.list_all()
 
-    expected: Set[str] = {"Work", "Document", "Chunk", "Summary"}
+    expected: Set[str] = {"Work", "Chunk", "Summary"}
     actual: Set[str] = set(collections.keys())
 
     if expected == actual:
@@ -480,7 +403,7 @@ def display_schema(client: weaviate.WeaviateClient) -> None:
 
     collections = client.collections.list_all()
 
-    for name in ["Work", "Document", "Chunk", "Summary"]:
+    for name in ["Work", "Chunk", "Summary"]:
         if name not in collections:
             continue
 
@@ -523,14 +446,12 @@ def print_summary() -> None:
     print("=" * 80)
     print("\n✓ Architecture:")
     print("  - Work: Source unique pour author/title")
-    print("  - Document: Métadonnées d'édition avec référence vers Work")
-    print("  - Chunk: Fragments vectorisés (text + summary + keywords)")
+    print("  - Chunk: Fragments vectorisés (text + keywords)")
     print("  - Summary: Résumés de chapitres vectorisés (text + concepts)")
     print("\n✓ Vectorisation:")
     print("  - Work: NONE")
-    print("  - Document: NONE")
-    print("  - Chunk: text2vec (text + summary + keywords)")
-    print("  - Summary: text2vec (text + concepts)")
+    print("  - Chunk: GPU embedder (BAAI/bge-m3, 1024-dim)")
+    print("  - Summary: GPU embedder (BAAI/bge-m3, 1024-dim)")
     print("\n✓ Index Vectoriel (Optimisation 2026):")
     print("  - Chunk: HNSW + RQ (~75% moins de RAM)")
     print("  - Summary: HNSW + RQ")
@@ -848,7 +848,7 @@ class WeaviateIngestResult(TypedDict, total=False):
         inserted: List of inserted chunk summaries (first 10).
         work: Title of the ingested work.
         author: Author of the ingested work.
-        document_uuid: UUID of created Document object (if any).
+        work_uuid: UUID of created Work object (if any).
         all_objects: Complete list of all inserted ChunkObjects.
 
     Note:
@@ -863,7 +863,7 @@ class WeaviateIngestResult(TypedDict, total=False):
     inserted: List[Any]  # List[InsertedChunkSummary] from weaviate_ingest
     work: str
     author: str
-    document_uuid: Optional[str]
+    work_uuid: Optional[str]
     all_objects: List[Any]  # List[ChunkObject] from weaviate_ingest
@@ -190,9 +190,8 @@ class DeleteResult(TypedDict, total=False):
     Attributes:
         success: Whether deletion succeeded.
         error: Error message if deletion failed.
-        deleted_chunks: Number of chunks deleted from Chunk collection.
-        deleted_summaries: Number of summaries deleted from Summary collection.
-        deleted_document: Whether the Document object was deleted.
+        deleted_chunks: Number of chunks deleted from Chunk_v2 collection.
+        deleted_summaries: Number of summaries deleted from Summary_v2 collection.
 
     Example:
         >>> result = delete_document_chunks("platon_republique")
@@ -203,7 +202,6 @@ class DeleteResult(TypedDict, total=False):
     error: str
     deleted_chunks: int
     deleted_summaries: int
-    deleted_document: bool
 
 
 # =============================================================================
@@ -379,7 +377,8 @@ def validate_document_metadata(
     )
 
     # Validate title (required for work.title nested object)
-    title = metadata.get("title") or metadata.get("work")
+    # Priority: work > original_title > title (to avoid LLM prompt instructions)
+    title = metadata.get("work") or metadata.get("original_title") or metadata.get("title")
     if not title or not str(title).strip():
         raise ValueError(
             f"Invalid metadata for '{doc_name}': 'title' is missing or empty. "
@@ -388,7 +387,8 @@ def validate_document_metadata(
     )
 
     # Validate author (required for work.author nested object)
-    author = metadata.get("author")
+    # Priority: original_author > author (to avoid LLM prompt instructions)
+    author = metadata.get("original_author") or metadata.get("author")
     if not author or not str(author).strip():
         raise ValueError(
             f"Invalid metadata for '{doc_name}': 'author' is missing or empty. "
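The two fallback chains introduced in this hunk can be captured in a single helper, which keeps validation and Work creation agreeing on priority order. A sketch under that assumption (`resolve_work_metadata` is a hypothetical consolidation, not a function in the codebase; the priorities mirror the diff):

```python
def resolve_work_metadata(metadata, doc_name="unknown"):
    # Priority mirrors the fix: work > original_title > title, and
    # original_author > author, so LLM prompt echoes in "title"/"author"
    # are overridden by the explicit fields when present.
    title = (metadata.get("work")
             or metadata.get("original_title")
             or metadata.get("title")
             or doc_name)
    author = metadata.get("original_author") or metadata.get("author") or "Inconnu"
    return title, author

title, author = resolve_work_metadata({
    "title": "IGNORE PREVIOUS INSTRUCTIONS",   # polluted LLM output
    "original_title": "Turing's Machines",
    "original_author": "Richard Coyne",
})
```

With only a plain `title`/`author` present, the helper falls through to those values, matching the old behavior.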
@@ -649,8 +649,10 @@ def create_or_get_work(
         logger.warning(f"Collection Work non trouvée: {e}")
         return None
 
-    title = metadata.get("title") or doc_name
-    author = metadata.get("author") or "Inconnu"
+    # Priority: work > original_title > title (to avoid LLM prompt instructions)
+    title = metadata.get("work") or metadata.get("original_title") or metadata.get("title") or doc_name
+    # Priority: original_author > author (to avoid LLM prompt instructions)
+    author = metadata.get("original_author") or metadata.get("author") or "Inconnu"
     year = metadata.get("year", 0) if metadata.get("year") else 0
 
     try:
@@ -686,76 +688,6 @@ def create_or_get_work(
|
||||
return None
|
||||
|
||||
|
||||
def ingest_document_metadata(
|
||||
client: WeaviateClient,
|
||||
doc_name: str,
|
||||
metadata: Dict[str, Any],
|
||||
toc: List[Dict[str, Any]],
|
||||
hierarchy: Dict[str, Any],
|
||||
chunks_count: int,
|
||||
pages: int,
|
||||
) -> Optional[str]:
|
||||
"""Insert document metadata into the Document collection.
|
||||
|
||||
Creates a Document object containing metadata about a processed document,
|
||||
including its table of contents, hierarchy structure, and statistics.
|
||||
|
||||
Args:
|
||||
client: Active Weaviate client connection.
|
||||
doc_name: Unique document identifier (sourceId).
|
||||
metadata: Extracted metadata dict with keys: title, author, language.
|
||||
toc: Table of contents as a hierarchical list of dicts.
|
||||
hierarchy: Complete document hierarchy structure.
|
||||
chunks_count: Total number of chunks in the document.
|
||||
pages: Number of pages in the source PDF.
|
||||
|
||||
Returns:
|
||||
UUID string of the created Document object, or None if insertion failed.
|
||||
|
||||
Example:
|
||||
>>> with get_weaviate_client() as client:
|
||||
... uuid = ingest_document_metadata(
|
||||
... client,
|
||||
... doc_name="platon_republique",
|
||||
... metadata={"title": "La Republique", "author": "Platon"},
|
||||
... toc=[{"title": "Livre I", "level": 1}],
|
||||
... hierarchy={},
|
||||
... chunks_count=150,
|
||||
... pages=300,
|
||||
... )
|
||||
|
||||
Note:
|
||||
The TOC and hierarchy are serialized to JSON strings for storage.
|
||||
The createdAt field is set to the current timestamp.
|
||||
"""
|
||||
try:
|
||||
doc_collection: Collection[Any, Any] = client.collections.get("Document")
|
||||
except Exception as e:
|
||||
logger.warning(f"Collection Document non trouvée: {e}")
|
||||
return None
|
||||
|
||||
try:
|
||||
doc_obj: Dict[str, Any] = {
|
||||
"sourceId": doc_name,
|
||||
"title": metadata.get("title") or doc_name,
|
||||
"author": metadata.get("author") or "Inconnu",
|
||||
"toc": json.dumps(toc, ensure_ascii=False) if toc else "[]",
|
||||
"hierarchy": json.dumps(hierarchy, ensure_ascii=False) if hierarchy else "{}",
|
||||
"pages": pages,
|
||||
"chunksCount": chunks_count,
|
||||
"language": metadata.get("language", "fr"),
|
||||
"createdAt": datetime.now(timezone.utc).isoformat(),
|
||||
}
|
||||
|
||||
result = doc_collection.data.insert(doc_obj)
|
||||
logger.info(f"Document metadata ingéré: {doc_name}")
|
||||
return str(result)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Erreur ingestion document metadata: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def ingest_summaries(
|
||||
client: WeaviateClient,
|
||||
doc_name: str,
|
||||
@@ -897,14 +829,13 @@ def ingest_document(
|
||||
toc: Optional[List[Dict[str, Any]]] = None,
|
||||
hierarchy: Optional[Dict[str, Any]] = None,
|
||||
pages: int = 0,
|
||||
ingest_document_collection: bool = True,
|
||||
ingest_summary_collection: bool = False,
|
||||
) -> IngestResult:
|
||||
"""Ingest document chunks into Weaviate with nested objects.
|
||||
|
||||
Main ingestion function that inserts chunks into the Chunk collection
|
||||
with nested Work and Document references. Optionally also creates
|
||||
entries in the Document and Summary collections.
|
||||
Main ingestion function that inserts chunks into the Chunk_v2 collection
|
||||
with nested Work references. Optionally also creates entries in the
|
||||
Summary_v2 collection.
|
||||
|
||||
This function uses batch insertion for optimal performance and
|
||||
constructs proper nested objects for filtering capabilities.
|
||||
@@ -922,12 +853,10 @@ def ingest_document(
|
||||
- author: Author name
|
||||
- edition (optional): Edition identifier
|
||||
language: ISO language code. Defaults to "fr".
|
||||
toc: Optional table of contents for Document/Summary collections.
|
||||
toc: Optional table of contents for Summary collection.
|
||||
hierarchy: Optional complete document hierarchy structure.
|
||||
pages: Number of pages in source document. Defaults to 0.
|
||||
ingest_document_collection: If True, also insert into Document
|
||||
collection. Defaults to True.
|
||||
ingest_summary_collection: If True, also insert into Summary
|
||||
ingest_summary_collection: If True, also insert into Summary_v2
|
||||
collection (requires toc). Defaults to False.
|
||||
|
||||
Returns:
|
||||
@@ -937,7 +866,7 @@ def ingest_document(
|
||||
- inserted: Preview of first 10 inserted chunks
|
||||
- work: Work title
|
||||
- author: Author name
|
||||
- document_uuid: UUID of Document object (if created)
|
||||
- work_uuid: UUID of Work object (if created)
|
||||
- all_objects: Complete list of inserted ChunkObjects
|
||||
- error: Error message (if failed)
|
||||
|
||||
@@ -995,14 +924,6 @@ def ingest_document(
|
||||
client, doc_name, metadata, pages
|
||||
)
|
||||
|
||||
# Insérer les métadonnées du document (optionnel)
|
||||
doc_uuid: Optional[str] = None
|
||||
if ingest_document_collection:
|
||||
doc_uuid = ingest_document_metadata(
|
||||
client, doc_name, metadata, toc or [], hierarchy or {},
|
||||
len(chunks), pages
|
||||
)
|
||||
|
||||
# Insérer les résumés (optionnel)
|
||||
if ingest_summary_collection and toc:
|
||||
ingest_summaries(client, doc_name, toc, {})
|
||||
@@ -1018,8 +939,10 @@ def ingest_document(
|
||||
objects_to_insert: List[ChunkObject] = []
|
||||
|
||||
# Extraire et valider les métadonnées (validation déjà faite, juste extraction)
|
||||
title: str = metadata.get("title") or metadata.get("work") or doc_name
|
||||
author: str = metadata.get("author") or "Inconnu"
|
||||
# Priority: work > original_title > title (to avoid LLM prompt instructions)
|
||||
title: str = metadata.get("work") or metadata.get("original_title") or metadata.get("title") or doc_name
|
||||
# Priority: original_author > author (to avoid LLM prompt instructions)
|
||||
author: str = metadata.get("original_author") or metadata.get("author") or "Inconnu"
|
||||
edition: str = metadata.get("edition", "")
|
||||
|
||||
for idx, chunk in enumerate(chunks):
|
||||
@@ -1153,7 +1076,7 @@ def ingest_document(
|
||||
inserted=inserted_summary,
|
||||
work=title,
|
||||
author=author,
|
||||
document_uuid=doc_uuid,
|
||||
work_uuid=work_uuid,
|
||||
all_objects=objects_to_insert,
|
||||
)
|
||||
|
||||
@@ -1169,9 +1092,8 @@ def ingest_document(
|
||||
def delete_document_chunks(doc_name: str) -> DeleteResult:
|
||||
"""Delete all data for a document from Weaviate collections.
|
||||
|
||||
Removes chunks, summaries, and the document metadata from their
|
||||
respective collections. Uses nested object filtering to find
|
||||
related objects.
|
||||
Removes chunks and summaries from their respective collections.
|
||||
Uses nested object filtering to find related objects.
|
||||
|
||||
This function is useful for re-processing a document after changes
|
||||
to the processing pipeline or to clean up test data.
|
||||
@@ -1184,7 +1106,6 @@ def delete_document_chunks(doc_name: str) -> DeleteResult:
|
||||
- success: True if deletion succeeded (even if no objects found)
|
||||
- deleted_chunks: Number of Chunk objects deleted
|
||||
- deleted_summaries: Number of Summary objects deleted
|
||||
- deleted_document: True if Document object was deleted
|
||||
- error: Error message (if failed)
|
||||
|
||||
Example:
|
||||
@@ -1227,23 +1148,12 @@ def delete_document_chunks(doc_name: str) -> DeleteResult:
|
||||
except Exception as e:
|
||||
logger.warning(f"Erreur suppression summaries: {e}")
|
||||
|
||||
# Supprimer le document
|
||||
try:
|
||||
doc_collection: Collection[Any, Any] = client.collections.get("Document")
|
||||
result = doc_collection.data.delete_many(
|
||||
where=wvq.Filter.by_property("sourceId").equal(doc_name)
|
||||
)
|
||||
deleted_document = result.successful > 0
|
||||
except Exception as e:
|
||||
logger.warning(f"Erreur suppression document: {e}")
|
||||
|
||||
logger.info(f"Suppression: {deleted_chunks} chunks, {deleted_summaries} summaries pour {doc_name}")
|
||||
|
||||
return DeleteResult(
|
||||
success=True,
|
||||
deleted_chunks=deleted_chunks,
|
||||
deleted_summaries=deleted_summaries,
|
||||
deleted_document=deleted_document,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
|
||||
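The same metadata-priority fallback recurs in three hunks above (validate_document_metadata, create_or_get_work, and ingest_document). A minimal standalone sketch of that logic — the helper names resolve_title and resolve_author are hypothetical, introduced here only for illustration; the repository inlines these expressions:

```python
from typing import Any, Dict


def resolve_title(metadata: Dict[str, Any], doc_name: str) -> str:
    # Priority: work > original_title > title, falling back to the document
    # identifier. Fields later in the chain may contain leaked LLM prompt
    # instructions, so cleaner fields win when present.
    return (
        metadata.get("work")
        or metadata.get("original_title")
        or metadata.get("title")
        or doc_name
    )


def resolve_author(metadata: Dict[str, Any]) -> str:
    # Priority: original_author > author, defaulting to "Inconnu".
    return metadata.get("original_author") or metadata.get("author") or "Inconnu"
```

Note that `or`-chaining also skips empty strings, not just missing keys, which is why validate_document_metadata still checks `str(title).strip()` afterward.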