refactor: Rename Chunk_v2/Summary_v2 collections to Chunk/Summary
- Add migrate_rename_collections.py script for data migration
- Update flask_app.py to use new collection names
- Update weaviate_ingest.py to use new collection names
- Update schema.py documentation
- Update README.md and ANALYSE_MCP_TOOLS.md

Migration completed: 5372 chunks + 114 summaries preserved with vectors.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -193,7 +193,7 @@ def get_collection_stats() -> Optional[CollectionStats]:
|
||||
stats: CollectionStats = {}
|
||||
|
||||
# Chunk stats (renamed from Passage)
|
||||
passages = client.collections.get("Chunk_v2")
|
||||
passages = client.collections.get("Chunk")
|
||||
passage_count = passages.aggregate.over_all(total_count=True)
|
||||
stats["passages"] = passage_count.total_count or 0
|
||||
|
||||
@@ -248,7 +248,7 @@ def get_all_passages(
|
||||
if client is None:
|
||||
return []
|
||||
|
||||
chunks = client.collections.get("Chunk_v2")
|
||||
chunks = client.collections.get("Chunk")
|
||||
|
||||
result = chunks.query.fetch_objects(
|
||||
limit=limit,
|
||||
@@ -293,7 +293,7 @@ def simple_search(
|
||||
if client is None:
|
||||
return []
|
||||
|
||||
chunks = client.collections.get("Chunk_v2")
|
||||
chunks = client.collections.get("Chunk")
|
||||
|
||||
# Build filters using top-level properties (workAuthor, workTitle)
|
||||
filters: Optional[Any] = None
|
||||
@@ -377,7 +377,7 @@ def hierarchical_search(
|
||||
# STAGE 1: Search Summary collection for relevant sections
|
||||
# ═══════════════════════════════════════════════════════════════
|
||||
|
||||
summary_collection = client.collections.get("Summary_v2")
|
||||
summary_collection = client.collections.get("Summary")
|
||||
|
||||
# Generate query vector with GPU embedder (Phase 5: manual vectorization)
|
||||
embedder = get_gpu_embedder()
|
||||
@@ -423,7 +423,7 @@ def hierarchical_search(
|
||||
"similarity": round((1 - summary_obj.metadata.distance) * 100, 1) if summary_obj.metadata and summary_obj.metadata.distance else 0,
|
||||
})
|
||||
|
||||
# Post-filter sections by author/work (Summary_v2 has workTitle property)
|
||||
# Post-filter sections by author/work (Summary has workTitle property)
|
||||
if author_filter or work_filter:
|
||||
print(f"[HIERARCHICAL] Post-filtering {len(sections_data)} sections by work='{work_filter}'")
|
||||
|
||||
@@ -485,7 +485,7 @@ def hierarchical_search(
|
||||
# For each section, search chunks using the section's summary text
|
||||
# This groups chunks under their relevant sections
|
||||
|
||||
chunk_collection = client.collections.get("Chunk_v2")
|
||||
chunk_collection = client.collections.get("Chunk")
|
||||
|
||||
# Build base filters (author/work only)
|
||||
base_filters: Optional[Any] = None
|
||||
@@ -650,9 +650,9 @@ def summary_only_search(
|
||||
if client is None:
|
||||
return []
|
||||
|
||||
summaries = client.collections.get("Summary_v2")
|
||||
summaries = client.collections.get("Summary")
|
||||
|
||||
# Build Work map for metadata lookup (Summary_v2 has workTitle, not document)
|
||||
# Build Work map for metadata lookup (Summary has workTitle, not document)
|
||||
work_collection = client.collections.get("Work")
|
||||
work_map = {}
|
||||
for work in work_collection.iterator(include_vector=False):
|
||||
@@ -1043,7 +1043,7 @@ def rag_search(
|
||||
print("[RAG Search] Weaviate client unavailable")
|
||||
return []
|
||||
|
||||
chunks = client.collections.get("Chunk_v2")
|
||||
chunks = client.collections.get("Chunk")
|
||||
|
||||
# Build work filter if selected_works is provided
|
||||
work_filter: Optional[Any] = None
|
||||
@@ -1536,8 +1536,8 @@ def api_get_works() -> Union[Response, tuple[Response, int]]:
|
||||
"message": "Cannot connect to Weaviate database"
|
||||
}), 500
|
||||
|
||||
# Query Chunk_v2 collection to get all unique works with counts
|
||||
chunks = client.collections.get("Chunk_v2")
|
||||
# Query Chunk collection to get all unique works with counts
|
||||
chunks = client.collections.get("Chunk")
|
||||
|
||||
# Fetch all chunks to aggregate by work
|
||||
# In v2: work is NOT a nested object, use workTitle and workAuthor properties
|
||||
@@ -3421,7 +3421,7 @@ def documents() -> str:
|
||||
# Get all Works (now with sourceId added in Phase 1 of migration)
|
||||
try:
|
||||
work_collection = client.collections.get("Work")
|
||||
chunk_collection = client.collections.get("Chunk_v2")
|
||||
chunk_collection = client.collections.get("Chunk")
|
||||
|
||||
# Build documents from Work collection
|
||||
for work in work_collection.iterator(include_vector=False):
|
||||
@@ -3461,7 +3461,7 @@ def documents() -> str:
|
||||
|
||||
# Count summaries (if collection exists)
|
||||
try:
|
||||
summary_collection = client.collections.get("Summary_v2")
|
||||
summary_collection = client.collections.get("Summary")
|
||||
for summary in summary_collection.iterator(include_vector=False):
|
||||
work_title = summary.properties.get("workTitle")
|
||||
|
||||
|
||||
337
generations/library_rag/migrate_rename_collections.py
Normal file
337
generations/library_rag/migrate_rename_collections.py
Normal file
@@ -0,0 +1,337 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Rename collections: Chunk_v2 -> Chunk, Summary_v2 -> Summary
|
||||
|
||||
Weaviate doesn't support renaming collections directly, so this script:
|
||||
1. Creates new collections (Chunk, Summary) with identical schema
|
||||
2. Copies all objects with their vectors (batch insert)
|
||||
3. Validates the migration (count check)
|
||||
4. Optionally deletes old collections (--cleanup flag)
|
||||
|
||||
Usage:
|
||||
python migrate_rename_collections.py --dry-run # Preview without changes
|
||||
python migrate_rename_collections.py # Execute migration
|
||||
python migrate_rename_collections.py --cleanup # Delete old collections after validation
|
||||
"""
|
||||
|
||||
import weaviate
|
||||
import weaviate.classes as wvc
|
||||
from weaviate.classes.config import Configure, Property, DataType, VectorDistances
|
||||
from weaviate.classes.query import Filter
|
||||
import sys
|
||||
import argparse
|
||||
from typing import Any
|
||||
import time
|
||||
|
||||
# (old_name, new_name) pairs to migrate, processed in order.
MIGRATIONS = [
    ("Chunk_v2", "Chunk"),
    ("Summary_v2", "Summary"),
]

# Number of objects accumulated before each batch insert.
BATCH_SIZE = 100
|
||||
|
||||
|
||||
def get_collection_count(client: weaviate.WeaviateClient, name: str) -> int:
    """Return the number of objects in collection *name*, or 0 on any error.

    Args:
        client: Connected Weaviate client.
        name: Collection name to count.

    Returns:
        Object count as an int. Returns 0 when the collection is missing,
        the server is unreachable, or the aggregate reports no count.
    """
    try:
        coll = client.collections.get(name)
        # Request the count explicitly; total_count can be None in the
        # aggregate response, and this function's contract is `-> int`.
        return coll.aggregate.over_all(total_count=True).total_count or 0
    except Exception:
        # A missing collection (or transient error) counts as empty.
        return 0
|
||||
|
||||
|
||||
def collection_exists(client: weaviate.WeaviateClient, name: str) -> bool:
    """Check whether collection *name* exists on the server.

    NOTE(fix): ``client.collections.get(name)`` is lazy in weaviate-client v4
    and does not contact the server, so wrapping it in try/except succeeded
    for every name and this function always returned True. The dedicated
    ``exists()`` call performs the actual server-side check.
    """
    try:
        return client.collections.exists(name)
    except Exception:
        # Treat connectivity errors as "not found" so callers fail safe.
        return False
|
||||
|
||||
|
||||
def create_chunk_collection(client: weaviate.WeaviateClient) -> None:
    """Create the new Chunk collection, mirroring the Chunk_v2 schema."""
    print("  Creating Chunk collection...")

    # (name, data_type, description) triples for the flat Chunk schema.
    property_specs = [
        ("text", DataType.TEXT, "Chunk text content"),
        ("workTitle", DataType.TEXT, "Work title"),
        ("workAuthor", DataType.TEXT, "Work author"),
        ("sectionPath", DataType.TEXT, "Section path"),
        ("sectionLevel", DataType.INT, "Section level"),
        ("chapterTitle", DataType.TEXT, "Chapter title"),
        ("canonicalReference", DataType.TEXT, "Canonical reference"),
        ("unitType", DataType.TEXT, "Unit type"),
        ("keywords", DataType.TEXT_ARRAY, "Keywords"),
        ("language", DataType.TEXT, "Language code"),
        ("year", DataType.INT, "Publication year"),
        ("orderIndex", DataType.INT, "Order index"),
        ("summary", DataType.TEXT, "Chunk summary"),
        ("document", DataType.TEXT, "Document reference"),
    ]

    # Vectors are supplied manually at ingest time, hence Vectorizer.none().
    client.collections.create(
        name="Chunk",
        description="Document chunks with manual GPU vectorization (BAAI/bge-m3, 1024-dim)",
        vectorizer_config=Configure.Vectorizer.none(),
        vector_index_config=Configure.VectorIndex.hnsw(
            distance_metric=VectorDistances.COSINE,
            ef_construction=128,
            max_connections=32,
            quantizer=Configure.VectorIndex.Quantizer.rq(),
        ),
        properties=[
            Property(name=prop_name, data_type=prop_type, description=prop_desc)
            for prop_name, prop_type, prop_desc in property_specs
        ],
    )
    print("  [OK] Chunk collection created")
|
||||
|
||||
|
||||
def create_summary_collection(client: weaviate.WeaviateClient) -> None:
    """Create the new Summary collection, mirroring the Summary_v2 schema."""
    print("  Creating Summary collection...")

    # (name, data_type, description) triples for the Summary schema.
    property_specs = [
        ("text", DataType.TEXT, "Summary text (vectorized)"),
        ("concepts", DataType.TEXT_ARRAY, "Key concepts"),
        ("workTitle", DataType.TEXT, "Work title"),
        ("sectionPath", DataType.TEXT, "Section path"),
        ("title", DataType.TEXT, "Section title"),
        ("level", DataType.INT, "Hierarchy level"),
        ("chunksCount", DataType.INT, "Chunks count"),
        ("language", DataType.TEXT, "Language code"),
        ("workAuthor", DataType.TEXT, "Work author"),
        ("year", DataType.NUMBER, "Publication year"),
    ]

    # Vectors are supplied manually at ingest time, hence Vectorizer.none().
    client.collections.create(
        name="Summary",
        description="Section summaries (v2 - sans Document)",
        vectorizer_config=Configure.Vectorizer.none(),
        vector_index_config=Configure.VectorIndex.hnsw(
            distance_metric=VectorDistances.COSINE,
            ef_construction=128,
            max_connections=32,
            quantizer=Configure.VectorIndex.Quantizer.rq(),
        ),
        properties=[
            Property(name=prop_name, data_type=prop_type, description=prop_desc)
            for prop_name, prop_type, prop_desc in property_specs
        ],
    )
    print("  [OK] Summary collection created")
|
||||
|
||||
|
||||
def clean_properties(props: dict[str, Any], collection_name: str) -> dict[str, Any]:
    """Return a copy of *props* with known integer fields coerced to int.

    Some numeric properties may have been stored as floats in the source
    collection; the target schema declares them as INT, so they are coerced
    where possible. None values and non-convertible values are left as-is.
    *collection_name* is currently unused but kept for interface stability.
    """
    # Fields declared as INT in the Chunk/Summary schemas.
    int_fields = ("sectionLevel", "year", "orderIndex", "level", "chunksCount")

    result = dict(props)
    for key in int_fields:
        value = result.get(key)
        if value is None:
            continue
        try:
            result[key] = int(value)
        except (ValueError, TypeError):
            # Leave the value untouched; Weaviate will surface any type error.
            continue
    return result
|
||||
|
||||
|
||||
def _flush_batch(target: Any, items: list[dict[str, Any]]) -> None:
    """Insert one accumulated batch of {uuid, properties, vector} dicts into *target*."""
    with target.batch.dynamic() as batch:
        for item in items:
            batch.add_object(
                uuid=item["uuid"],
                properties=item["properties"],
                vector=item["vector"],
            )


def migrate_objects(
    client: weaviate.WeaviateClient,
    source_name: str,
    target_name: str,
    dry_run: bool = False
) -> int:
    """Copy all objects from *source_name* to *target_name*, vectors included.

    Iterates the source collection with vectors, normalizes property types
    via clean_properties(), and batch-inserts into the target, preserving
    the original UUIDs.

    Args:
        client: Connected Weaviate client.
        source_name: Collection to read from.
        target_name: Collection to write into (must already exist).
        dry_run: If True, only report what would be migrated.

    Returns:
        Number of objects migrated (or that would be, in dry-run mode).
    """
    source = client.collections.get(source_name)
    target = client.collections.get(target_name)

    # total_count may be None in the aggregate response; coerce so the
    # progress arithmetic and messages stay well-defined.
    total = source.aggregate.over_all(total_count=True).total_count or 0
    print(f"  Migrating {total} objects from {source_name} -> {target_name}")

    if dry_run:
        print(f"  [DRY-RUN] Would migrate {total} objects")
        return total

    migrated = 0
    batch_objects: list[dict[str, Any]] = []

    for obj in source.iterator(include_vector=True):
        # Named-vector responses come back as a dict; prefer "default",
        # otherwise fall back to the first named vector present.
        vector = obj.vector
        if isinstance(vector, dict):
            vector = vector.get("default", list(vector.values())[0] if vector else None)

        # Coerce float-stored INT fields before insert.
        cleaned_props = clean_properties(obj.properties, target_name)

        batch_objects.append({
            "uuid": obj.uuid,
            "properties": cleaned_props,
            "vector": vector,
        })

        if len(batch_objects) >= BATCH_SIZE:
            _flush_batch(target, batch_objects)
            migrated += len(batch_objects)
            if total:
                print(f"  Progress: {migrated}/{total} ({100*migrated//total}%)", end='\r')
            batch_objects = []

    # Flush any remaining partial batch.
    if batch_objects:
        _flush_batch(target, batch_objects)
        migrated += len(batch_objects)

    # Trailing spaces overwrite the carriage-returned progress line.
    print(f"  Progress: {migrated}/{total} (100%)    ")
    print(f"  [OK] Migrated {migrated} objects")

    return migrated
|
||||
|
||||
|
||||
def validate_migration(
    client: weaviate.WeaviateClient,
    source_name: str,
    target_name: str
) -> bool:
    """Return True when source and target collections hold the same object count."""
    source_count = get_collection_count(client, source_name)
    target_count = get_collection_count(client, target_name)

    detail = f"{source_name}={source_count}, {target_name}={target_count}"
    if source_count != target_count:
        print(f"  [ERROR] Validation FAILED: {detail}")
        return False
    print(f"  [OK] Validation passed: {detail}")
    return True
|
||||
|
||||
|
||||
def cleanup_old_collections(client: weaviate.WeaviateClient, dry_run: bool = False) -> None:
    """Delete the old *_v2 collections once their replacements are validated."""
    banner = "=" * 70
    print("\n" + banner)
    print("CLEANUP: Deleting old collections")
    print(banner)

    for source_name, target_name in MIGRATIONS:
        # Nothing to do when the old collection is already gone.
        if not collection_exists(client, source_name):
            print(f"  {source_name}: Already deleted")
            continue

        # Refuse to delete old data when the replacement is missing.
        if not collection_exists(client, target_name):
            print(f"  [ERROR] Cannot delete {source_name}: {target_name} doesn't exist!")
            continue

        # Final count check before the destructive step.
        if not validate_migration(client, source_name, target_name):
            print(f"  [ERROR] Skipping {source_name} deletion: validation failed")
            continue

        if dry_run:
            print(f"  [DRY-RUN] Would delete {source_name}")
            continue

        client.collections.delete(source_name)
        print(f"  [OK] Deleted {source_name}")
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: rename Chunk_v2 -> Chunk and Summary_v2 -> Summary.

    Modes:
        default:    create target collections and copy objects with vectors.
        --dry-run:  preview every step without touching the database.
        --cleanup:  validate counts, then delete the old *_v2 collections.
    """
    parser = argparse.ArgumentParser(description="Rename Weaviate collections")
    parser.add_argument("--dry-run", action="store_true", help="Preview without making changes")
    parser.add_argument("--cleanup", action="store_true", help="Delete old collections after validation")
    args = parser.parse_args()

    banner = "=" * 70
    print(banner)
    print("WEAVIATE COLLECTION RENAME: Chunk_v2 -> Chunk, Summary_v2 -> Summary")
    print(banner)
    print(f"Mode: {'DRY-RUN' if args.dry_run else 'LIVE'}")
    print(f"Cleanup: {'YES' if args.cleanup else 'NO'}")
    print()

    client = weaviate.connect_to_local()

    try:
        # Show current state before doing anything.
        print("Current collections:")
        for source_name, target_name in MIGRATIONS:
            source_count = get_collection_count(client, source_name)
            target_exists = collection_exists(client, target_name)
            target_count = get_collection_count(client, target_name) if target_exists else 0
            print(f"  {source_name}: {source_count} objects")
            print(f"  {target_name}: {'exists (' + str(target_count) + ' objects)' if target_exists else 'does not exist'}")
        print()

        if args.cleanup:
            cleanup_old_collections(client, dry_run=args.dry_run)
            return

        # Migration
        for source_name, target_name in MIGRATIONS:
            print(banner)
            print(f"MIGRATING: {source_name} -> {target_name}")
            print(banner)

            # Source must exist before anything else.
            if not collection_exists(client, source_name):
                print(f"  [ERROR] Source collection {source_name} does not exist!")
                continue

            # Target handling: skip when populated, create when absent.
            if collection_exists(client, target_name):
                target_count = get_collection_count(client, target_name)
                if target_count > 0:
                    print(f"  Target {target_name} already exists with {target_count} objects")
                    # fix: was an f-string with no placeholders
                    print("  Skipping (already migrated)")
                    continue
                print(f"  Target {target_name} exists but empty, will populate")
            elif args.dry_run:
                print(f"  [DRY-RUN] Would create {target_name} collection")
            elif target_name == "Chunk":
                create_chunk_collection(client)
            elif target_name == "Summary":
                create_summary_collection(client)

            # fix: one call handles both modes; the previous if/else branches
            # were identical apart from the dry_run literal.
            migrate_objects(client, source_name, target_name, dry_run=args.dry_run)

            # Count validation only makes sense after a real copy.
            if not args.dry_run:
                validate_migration(client, source_name, target_name)

            print()

        # Final status
        print(banner)
        print("MIGRATION COMPLETE")
        print(banner)
        print("\nFinal state:")
        for source_name, target_name in MIGRATIONS:
            source_count = get_collection_count(client, source_name)
            target_count = get_collection_count(client, target_name)
            print(f"  {source_name}: {source_count} objects")
            print(f"  {target_name}: {target_count} objects")

        if not args.dry_run:
            print("\nNext steps:")
            print("  1. Update code: replace 'Chunk_v2' -> 'Chunk', 'Summary_v2' -> 'Summary'")
            print("  2. Test the application")
            print("  3. Run: python migrate_rename_collections.py --cleanup")

    finally:
        # Always release the client connection, even on error.
        client.close()
|
||||
|
||||
|
||||
# Script entry point: run the migration CLI when executed directly.
if __name__ == "__main__":
    main()
|
||||
@@ -9,8 +9,8 @@ Schema Architecture:
|
||||
querying. The hierarchy is::
|
||||
|
||||
Work (metadata only)
|
||||
├── Chunk_v2 (vectorized text fragments)
|
||||
└── Summary_v2 (vectorized chapter summaries)
|
||||
├── Chunk (vectorized text fragments)
|
||||
└── Summary (vectorized chapter summaries)
|
||||
|
||||
Collections:
|
||||
**Work** (no vectorization):
|
||||
@@ -18,21 +18,21 @@ Collections:
|
||||
Stores canonical metadata: title, author, year, language, genre.
|
||||
Not vectorized - used only for metadata and relationships.
|
||||
|
||||
**Chunk_v2** (manual GPU vectorization):
|
||||
**Chunk** (manual GPU vectorization):
|
||||
Text fragments optimized for semantic search (200-800 chars).
|
||||
Vectorized with Python GPU embedder (BAAI/bge-m3, 1024-dim).
|
||||
Vectorized fields: text, keywords.
|
||||
Non-vectorized fields: workTitle, workAuthor, sectionPath, chapterTitle, unitType, orderIndex.
|
||||
Includes nested Work reference for denormalized access.
|
||||
|
||||
**Summary_v2** (manual GPU vectorization):
|
||||
**Summary** (manual GPU vectorization):
|
||||
LLM-generated chapter/section summaries for high-level search.
|
||||
Vectorized with Python GPU embedder (BAAI/bge-m3, 1024-dim).
|
||||
Vectorized fields: text, concepts.
|
||||
Includes nested Work reference for denormalized access.
|
||||
|
||||
Vectorization Strategy:
|
||||
- Only Chunk_v2.text, Chunk_v2.keywords, Summary_v2.text, and Summary_v2.concepts are vectorized
|
||||
- Only Chunk.text, Chunk.keywords, Summary.text, and Summary.concepts are vectorized
|
||||
- Manual vectorization with Python GPU embedder (BAAI/bge-m3, 1024-dim, RTX 4070)
|
||||
- Metadata fields use skip_vectorization=True for filtering only
|
||||
- Work collection has no vectorizer (metadata only)
|
||||
@@ -56,8 +56,8 @@ Nested Objects:
|
||||
denormalized data access. This allows single-query retrieval of chunk
|
||||
data with its Work metadata without joins::
|
||||
|
||||
Chunk_v2.work = {title, author}
|
||||
Summary_v2.work = {title, author}
|
||||
Chunk.work = {title, author}
|
||||
Summary.work = {title, author}
|
||||
|
||||
Usage:
|
||||
From command line::
|
||||
|
||||
@@ -190,8 +190,8 @@ class DeleteResult(TypedDict, total=False):
|
||||
Attributes:
|
||||
success: Whether deletion succeeded.
|
||||
error: Error message if deletion failed.
|
||||
deleted_chunks: Number of chunks deleted from Chunk_v2 collection.
|
||||
deleted_summaries: Number of summaries deleted from Summary_v2 collection.
|
||||
deleted_chunks: Number of chunks deleted from Chunk collection.
|
||||
deleted_summaries: Number of summaries deleted from Summary collection.
|
||||
|
||||
Example:
|
||||
>>> result = delete_document_chunks("platon_republique")
|
||||
@@ -725,7 +725,7 @@ def ingest_summaries(
|
||||
Recursively processes nested TOC entries (children).
|
||||
"""
|
||||
try:
|
||||
summary_collection: Collection[Any, Any] = client.collections.get("Summary_v2")
|
||||
summary_collection: Collection[Any, Any] = client.collections.get("Summary")
|
||||
except Exception as e:
|
||||
logger.warning(f"Collection Summary non trouvée: {e}")
|
||||
return 0
|
||||
@@ -833,9 +833,9 @@ def ingest_document(
|
||||
) -> IngestResult:
|
||||
"""Ingest document chunks into Weaviate with nested objects.
|
||||
|
||||
Main ingestion function that inserts chunks into the Chunk_v2 collection
|
||||
Main ingestion function that inserts chunks into the Chunk collection
|
||||
with nested Work references. Optionally also creates entries in the
|
||||
Summary_v2 collection.
|
||||
Summary collection.
|
||||
|
||||
This function uses batch insertion for optimal performance and
|
||||
constructs proper nested objects for filtering capabilities.
|
||||
@@ -856,7 +856,7 @@ def ingest_document(
|
||||
toc: Optional table of contents for Summary collection.
|
||||
hierarchy: Optional complete document hierarchy structure.
|
||||
pages: Number of pages in source document. Defaults to 0.
|
||||
ingest_summary_collection: If True, also insert into Summary_v2
|
||||
ingest_summary_collection: If True, also insert into Summary
|
||||
collection (requires toc). Defaults to False.
|
||||
|
||||
Returns:
|
||||
@@ -911,7 +911,7 @@ def ingest_document(
|
||||
|
||||
# Récupérer la collection Chunk
|
||||
try:
|
||||
chunk_collection: Collection[Any, Any] = client.collections.get("Chunk_v2")
|
||||
chunk_collection: Collection[Any, Any] = client.collections.get("Chunk")
|
||||
except Exception as e:
|
||||
return IngestResult(
|
||||
success=False,
|
||||
@@ -983,14 +983,14 @@ def ingest_document(
|
||||
"keywords": chunk.get("concepts", chunk.get("keywords", [])),
|
||||
"language": language,
|
||||
"orderIndex": idx,
|
||||
# Use flat fields instead of nested objects for Chunk_v2 schema
|
||||
# Use flat fields instead of nested objects for Chunk schema
|
||||
"workTitle": title,
|
||||
"workAuthor": author,
|
||||
"year": metadata.get("year", 0) if metadata.get("year") else 0,
|
||||
# Note: document reference fields not used in current Chunk_v2 schema
|
||||
# Note: document reference fields not used in current Chunk schema
|
||||
}
|
||||
|
||||
# Note: Nested objects validation skipped for Chunk_v2 flat schema
|
||||
# Note: Nested objects validation skipped for Chunk flat schema
|
||||
# validate_chunk_nested_objects(chunk_obj, idx, doc_name)
|
||||
|
||||
objects_to_insert.append(chunk_obj)
|
||||
@@ -1130,7 +1130,7 @@ def delete_document_chunks(doc_name: str) -> DeleteResult:
|
||||
|
||||
# Supprimer les chunks (filtrer sur document.sourceId nested)
|
||||
try:
|
||||
chunk_collection: Collection[Any, Any] = client.collections.get("Chunk_v2")
|
||||
chunk_collection: Collection[Any, Any] = client.collections.get("Chunk")
|
||||
result = chunk_collection.data.delete_many(
|
||||
where=wvq.Filter.by_property("document.sourceId").equal(doc_name)
|
||||
)
|
||||
@@ -1140,7 +1140,7 @@ def delete_document_chunks(doc_name: str) -> DeleteResult:
|
||||
|
||||
# Supprimer les summaries (filtrer sur document.sourceId nested)
|
||||
try:
|
||||
summary_collection: Collection[Any, Any] = client.collections.get("Summary_v2")
|
||||
summary_collection: Collection[Any, Any] = client.collections.get("Summary")
|
||||
result = summary_collection.data.delete_many(
|
||||
where=wvq.Filter.by_property("document.sourceId").equal(doc_name)
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user