refactor: Rename Chunk_v2/Summary_v2 collections to Chunk/Summary
- Add migrate_rename_collections.py script for data migration
- Update flask_app.py to use new collection names
- Update weaviate_ingest.py to use new collection names
- Update schema.py documentation
- Update README.md and ANALYSE_MCP_TOOLS.md

Migration completed: 5372 chunks + 114 summaries preserved with vectors.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -193,7 +193,7 @@ def get_collection_stats() -> Optional[CollectionStats]:
|
||||
stats: CollectionStats = {}
|
||||
|
||||
# Chunk stats (renamed from Passage)
|
||||
passages = client.collections.get("Chunk_v2")
|
||||
passages = client.collections.get("Chunk")
|
||||
passage_count = passages.aggregate.over_all(total_count=True)
|
||||
stats["passages"] = passage_count.total_count or 0
|
||||
|
||||
@@ -248,7 +248,7 @@ def get_all_passages(
|
||||
if client is None:
|
||||
return []
|
||||
|
||||
chunks = client.collections.get("Chunk_v2")
|
||||
chunks = client.collections.get("Chunk")
|
||||
|
||||
result = chunks.query.fetch_objects(
|
||||
limit=limit,
|
||||
@@ -293,7 +293,7 @@ def simple_search(
|
||||
if client is None:
|
||||
return []
|
||||
|
||||
chunks = client.collections.get("Chunk_v2")
|
||||
chunks = client.collections.get("Chunk")
|
||||
|
||||
# Build filters using top-level properties (workAuthor, workTitle)
|
||||
filters: Optional[Any] = None
|
||||
@@ -377,7 +377,7 @@ def hierarchical_search(
|
||||
# STAGE 1: Search Summary collection for relevant sections
|
||||
# ═══════════════════════════════════════════════════════════════
|
||||
|
||||
summary_collection = client.collections.get("Summary_v2")
|
||||
summary_collection = client.collections.get("Summary")
|
||||
|
||||
# Generate query vector with GPU embedder (Phase 5: manual vectorization)
|
||||
embedder = get_gpu_embedder()
|
||||
@@ -423,7 +423,7 @@ def hierarchical_search(
|
||||
"similarity": round((1 - summary_obj.metadata.distance) * 100, 1) if summary_obj.metadata and summary_obj.metadata.distance else 0,
|
||||
})
|
||||
|
||||
# Post-filter sections by author/work (Summary_v2 has workTitle property)
|
||||
# Post-filter sections by author/work (Summary has workTitle property)
|
||||
if author_filter or work_filter:
|
||||
print(f"[HIERARCHICAL] Post-filtering {len(sections_data)} sections by work='{work_filter}'")
|
||||
|
||||
@@ -485,7 +485,7 @@ def hierarchical_search(
|
||||
# For each section, search chunks using the section's summary text
|
||||
# This groups chunks under their relevant sections
|
||||
|
||||
chunk_collection = client.collections.get("Chunk_v2")
|
||||
chunk_collection = client.collections.get("Chunk")
|
||||
|
||||
# Build base filters (author/work only)
|
||||
base_filters: Optional[Any] = None
|
||||
@@ -650,9 +650,9 @@ def summary_only_search(
|
||||
if client is None:
|
||||
return []
|
||||
|
||||
summaries = client.collections.get("Summary_v2")
|
||||
summaries = client.collections.get("Summary")
|
||||
|
||||
# Build Work map for metadata lookup (Summary_v2 has workTitle, not document)
|
||||
# Build Work map for metadata lookup (Summary has workTitle, not document)
|
||||
work_collection = client.collections.get("Work")
|
||||
work_map = {}
|
||||
for work in work_collection.iterator(include_vector=False):
|
||||
@@ -1043,7 +1043,7 @@ def rag_search(
|
||||
print("[RAG Search] Weaviate client unavailable")
|
||||
return []
|
||||
|
||||
chunks = client.collections.get("Chunk_v2")
|
||||
chunks = client.collections.get("Chunk")
|
||||
|
||||
# Build work filter if selected_works is provided
|
||||
work_filter: Optional[Any] = None
|
||||
@@ -1536,8 +1536,8 @@ def api_get_works() -> Union[Response, tuple[Response, int]]:
|
||||
"message": "Cannot connect to Weaviate database"
|
||||
}), 500
|
||||
|
||||
# Query Chunk_v2 collection to get all unique works with counts
|
||||
chunks = client.collections.get("Chunk_v2")
|
||||
# Query Chunk collection to get all unique works with counts
|
||||
chunks = client.collections.get("Chunk")
|
||||
|
||||
# Fetch all chunks to aggregate by work
|
||||
# In v2: work is NOT a nested object, use workTitle and workAuthor properties
|
||||
@@ -3421,7 +3421,7 @@ def documents() -> str:
|
||||
# Get all Works (now with sourceId added in Phase 1 of migration)
|
||||
try:
|
||||
work_collection = client.collections.get("Work")
|
||||
chunk_collection = client.collections.get("Chunk_v2")
|
||||
chunk_collection = client.collections.get("Chunk")
|
||||
|
||||
# Build documents from Work collection
|
||||
for work in work_collection.iterator(include_vector=False):
|
||||
@@ -3461,7 +3461,7 @@ def documents() -> str:
|
||||
|
||||
# Count summaries (if collection exists)
|
||||
try:
|
||||
summary_collection = client.collections.get("Summary_v2")
|
||||
summary_collection = client.collections.get("Summary")
|
||||
for summary in summary_collection.iterator(include_vector=False):
|
||||
work_title = summary.properties.get("workTitle")
|
||||
|
||||
|
||||
337
generations/library_rag/migrate_rename_collections.py
Normal file
337
generations/library_rag/migrate_rename_collections.py
Normal file
@@ -0,0 +1,337 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Rename collections: Chunk_v2 -> Chunk, Summary_v2 -> Summary
|
||||
|
||||
Weaviate doesn't support renaming collections directly, so this script:
|
||||
1. Creates new collections (Chunk, Summary) with identical schema
|
||||
2. Copies all objects with their vectors (batch insert)
|
||||
3. Validates the migration (count check)
|
||||
4. Optionally deletes old collections (--cleanup flag)
|
||||
|
||||
Usage:
|
||||
python migrate_rename_collections.py --dry-run # Preview without changes
|
||||
python migrate_rename_collections.py # Execute migration
|
||||
python migrate_rename_collections.py --cleanup # Delete old collections after validation
|
||||
"""
|
||||
|
||||
import weaviate
|
||||
import weaviate.classes as wvc
|
||||
from weaviate.classes.config import Configure, Property, DataType, VectorDistances
|
||||
from weaviate.classes.query import Filter
|
||||
import sys
|
||||
import argparse
|
||||
from typing import Any
|
||||
import time
|
||||
|
||||
# (old_name, new_name) pairs to migrate, processed in order.
MIGRATIONS = [
    ("Chunk_v2", "Chunk"),
    ("Summary_v2", "Summary"),
]

# Number of objects accumulated before each batch insert.
BATCH_SIZE = 100
|
||||
|
||||
|
||||
def get_collection_count(client: weaviate.WeaviateClient, name: str) -> int:
    """Return the number of objects in collection *name*, or 0 on any error.

    Args:
        client: Connected Weaviate client.
        name: Collection name to count.

    Returns:
        Object count as an int. Returns 0 when the collection is missing,
        the server is unreachable, or the aggregate reports no count.
    """
    try:
        coll = client.collections.get(name)
        # Request the count explicitly; total_count can be None in the
        # aggregate response, and this function's contract is `-> int`.
        return coll.aggregate.over_all(total_count=True).total_count or 0
    except Exception:
        # A missing collection (or transient error) counts as empty.
        return 0
|
||||
|
||||
|
||||
def collection_exists(client: weaviate.WeaviateClient, name: str) -> bool:
    """Check whether collection *name* exists on the server.

    NOTE(fix): ``client.collections.get(name)`` is lazy in weaviate-client v4
    and does not contact the server, so wrapping it in try/except succeeded
    for every name and this function always returned True. The dedicated
    ``exists()`` call performs the actual server-side check.
    """
    try:
        return client.collections.exists(name)
    except Exception:
        # Treat connectivity errors as "not found" so callers fail safe.
        return False
|
||||
|
||||
|
||||
def create_chunk_collection(client: weaviate.WeaviateClient) -> None:
    """Create the new Chunk collection, mirroring the Chunk_v2 schema."""
    print("  Creating Chunk collection...")

    # (name, data_type, description) triples for the flat Chunk schema.
    property_specs = [
        ("text", DataType.TEXT, "Chunk text content"),
        ("workTitle", DataType.TEXT, "Work title"),
        ("workAuthor", DataType.TEXT, "Work author"),
        ("sectionPath", DataType.TEXT, "Section path"),
        ("sectionLevel", DataType.INT, "Section level"),
        ("chapterTitle", DataType.TEXT, "Chapter title"),
        ("canonicalReference", DataType.TEXT, "Canonical reference"),
        ("unitType", DataType.TEXT, "Unit type"),
        ("keywords", DataType.TEXT_ARRAY, "Keywords"),
        ("language", DataType.TEXT, "Language code"),
        ("year", DataType.INT, "Publication year"),
        ("orderIndex", DataType.INT, "Order index"),
        ("summary", DataType.TEXT, "Chunk summary"),
        ("document", DataType.TEXT, "Document reference"),
    ]

    # Vectors are supplied manually at ingest time, hence Vectorizer.none().
    client.collections.create(
        name="Chunk",
        description="Document chunks with manual GPU vectorization (BAAI/bge-m3, 1024-dim)",
        vectorizer_config=Configure.Vectorizer.none(),
        vector_index_config=Configure.VectorIndex.hnsw(
            distance_metric=VectorDistances.COSINE,
            ef_construction=128,
            max_connections=32,
            quantizer=Configure.VectorIndex.Quantizer.rq(),
        ),
        properties=[
            Property(name=prop_name, data_type=prop_type, description=prop_desc)
            for prop_name, prop_type, prop_desc in property_specs
        ],
    )
    print("  [OK] Chunk collection created")
|
||||
|
||||
|
||||
def create_summary_collection(client: weaviate.WeaviateClient) -> None:
    """Create the new Summary collection, mirroring the Summary_v2 schema."""
    print("  Creating Summary collection...")

    # (name, data_type, description) triples for the Summary schema.
    property_specs = [
        ("text", DataType.TEXT, "Summary text (vectorized)"),
        ("concepts", DataType.TEXT_ARRAY, "Key concepts"),
        ("workTitle", DataType.TEXT, "Work title"),
        ("sectionPath", DataType.TEXT, "Section path"),
        ("title", DataType.TEXT, "Section title"),
        ("level", DataType.INT, "Hierarchy level"),
        ("chunksCount", DataType.INT, "Chunks count"),
        ("language", DataType.TEXT, "Language code"),
        ("workAuthor", DataType.TEXT, "Work author"),
        ("year", DataType.NUMBER, "Publication year"),
    ]

    # Vectors are supplied manually at ingest time, hence Vectorizer.none().
    client.collections.create(
        name="Summary",
        description="Section summaries (v2 - sans Document)",
        vectorizer_config=Configure.Vectorizer.none(),
        vector_index_config=Configure.VectorIndex.hnsw(
            distance_metric=VectorDistances.COSINE,
            ef_construction=128,
            max_connections=32,
            quantizer=Configure.VectorIndex.Quantizer.rq(),
        ),
        properties=[
            Property(name=prop_name, data_type=prop_type, description=prop_desc)
            for prop_name, prop_type, prop_desc in property_specs
        ],
    )
    print("  [OK] Summary collection created")
|
||||
|
||||
|
||||
def clean_properties(props: dict[str, Any], collection_name: str) -> dict[str, Any]:
    """Return a copy of *props* with known integer fields coerced to int.

    Some numeric properties may have been stored as floats in the source
    collection; the target schema declares them as INT, so they are coerced
    where possible. None values and non-convertible values are left as-is.
    *collection_name* is currently unused but kept for interface stability.
    """
    # Fields declared as INT in the Chunk/Summary schemas.
    int_fields = ("sectionLevel", "year", "orderIndex", "level", "chunksCount")

    result = dict(props)
    for key in int_fields:
        value = result.get(key)
        if value is None:
            continue
        try:
            result[key] = int(value)
        except (ValueError, TypeError):
            # Leave the value untouched; Weaviate will surface any type error.
            continue
    return result
|
||||
|
||||
|
||||
def _flush_batch(target: Any, items: list[dict[str, Any]]) -> None:
    """Insert one accumulated batch of {uuid, properties, vector} dicts into *target*."""
    with target.batch.dynamic() as batch:
        for item in items:
            batch.add_object(
                uuid=item["uuid"],
                properties=item["properties"],
                vector=item["vector"],
            )


def migrate_objects(
    client: weaviate.WeaviateClient,
    source_name: str,
    target_name: str,
    dry_run: bool = False
) -> int:
    """Copy all objects from *source_name* to *target_name*, vectors included.

    Iterates the source collection with vectors, normalizes property types
    via clean_properties(), and batch-inserts into the target, preserving
    the original UUIDs.

    Args:
        client: Connected Weaviate client.
        source_name: Collection to read from.
        target_name: Collection to write into (must already exist).
        dry_run: If True, only report what would be migrated.

    Returns:
        Number of objects migrated (or that would be, in dry-run mode).
    """
    source = client.collections.get(source_name)
    target = client.collections.get(target_name)

    # total_count may be None in the aggregate response; coerce so the
    # progress arithmetic and messages stay well-defined.
    total = source.aggregate.over_all(total_count=True).total_count or 0
    print(f"  Migrating {total} objects from {source_name} -> {target_name}")

    if dry_run:
        print(f"  [DRY-RUN] Would migrate {total} objects")
        return total

    migrated = 0
    batch_objects: list[dict[str, Any]] = []

    for obj in source.iterator(include_vector=True):
        # Named-vector responses come back as a dict; prefer "default",
        # otherwise fall back to the first named vector present.
        vector = obj.vector
        if isinstance(vector, dict):
            vector = vector.get("default", list(vector.values())[0] if vector else None)

        # Coerce float-stored INT fields before insert.
        cleaned_props = clean_properties(obj.properties, target_name)

        batch_objects.append({
            "uuid": obj.uuid,
            "properties": cleaned_props,
            "vector": vector,
        })

        if len(batch_objects) >= BATCH_SIZE:
            _flush_batch(target, batch_objects)
            migrated += len(batch_objects)
            if total:
                print(f"  Progress: {migrated}/{total} ({100*migrated//total}%)", end='\r')
            batch_objects = []

    # Flush any remaining partial batch.
    if batch_objects:
        _flush_batch(target, batch_objects)
        migrated += len(batch_objects)

    # Trailing spaces overwrite the carriage-returned progress line.
    print(f"  Progress: {migrated}/{total} (100%)    ")
    print(f"  [OK] Migrated {migrated} objects")

    return migrated
|
||||
|
||||
|
||||
def validate_migration(
    client: weaviate.WeaviateClient,
    source_name: str,
    target_name: str
) -> bool:
    """Return True when source and target collections hold the same object count."""
    source_count = get_collection_count(client, source_name)
    target_count = get_collection_count(client, target_name)

    detail = f"{source_name}={source_count}, {target_name}={target_count}"
    if source_count != target_count:
        print(f"  [ERROR] Validation FAILED: {detail}")
        return False
    print(f"  [OK] Validation passed: {detail}")
    return True
|
||||
|
||||
|
||||
def cleanup_old_collections(client: weaviate.WeaviateClient, dry_run: bool = False) -> None:
    """Delete the old *_v2 collections once their replacements are validated."""
    banner = "=" * 70
    print("\n" + banner)
    print("CLEANUP: Deleting old collections")
    print(banner)

    for source_name, target_name in MIGRATIONS:
        # Nothing to do when the old collection is already gone.
        if not collection_exists(client, source_name):
            print(f"  {source_name}: Already deleted")
            continue

        # Refuse to delete old data when the replacement is missing.
        if not collection_exists(client, target_name):
            print(f"  [ERROR] Cannot delete {source_name}: {target_name} doesn't exist!")
            continue

        # Final count check before the destructive step.
        if not validate_migration(client, source_name, target_name):
            print(f"  [ERROR] Skipping {source_name} deletion: validation failed")
            continue

        if dry_run:
            print(f"  [DRY-RUN] Would delete {source_name}")
            continue

        client.collections.delete(source_name)
        print(f"  [OK] Deleted {source_name}")
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: rename Chunk_v2 -> Chunk and Summary_v2 -> Summary.

    Modes:
        default:    create target collections and copy objects with vectors.
        --dry-run:  preview every step without touching the database.
        --cleanup:  validate counts, then delete the old *_v2 collections.
    """
    parser = argparse.ArgumentParser(description="Rename Weaviate collections")
    parser.add_argument("--dry-run", action="store_true", help="Preview without making changes")
    parser.add_argument("--cleanup", action="store_true", help="Delete old collections after validation")
    args = parser.parse_args()

    banner = "=" * 70
    print(banner)
    print("WEAVIATE COLLECTION RENAME: Chunk_v2 -> Chunk, Summary_v2 -> Summary")
    print(banner)
    print(f"Mode: {'DRY-RUN' if args.dry_run else 'LIVE'}")
    print(f"Cleanup: {'YES' if args.cleanup else 'NO'}")
    print()

    client = weaviate.connect_to_local()

    try:
        # Show current state before doing anything.
        print("Current collections:")
        for source_name, target_name in MIGRATIONS:
            source_count = get_collection_count(client, source_name)
            target_exists = collection_exists(client, target_name)
            target_count = get_collection_count(client, target_name) if target_exists else 0
            print(f"  {source_name}: {source_count} objects")
            print(f"  {target_name}: {'exists (' + str(target_count) + ' objects)' if target_exists else 'does not exist'}")
        print()

        if args.cleanup:
            cleanup_old_collections(client, dry_run=args.dry_run)
            return

        # Migration
        for source_name, target_name in MIGRATIONS:
            print(banner)
            print(f"MIGRATING: {source_name} -> {target_name}")
            print(banner)

            # Source must exist before anything else.
            if not collection_exists(client, source_name):
                print(f"  [ERROR] Source collection {source_name} does not exist!")
                continue

            # Target handling: skip when populated, create when absent.
            if collection_exists(client, target_name):
                target_count = get_collection_count(client, target_name)
                if target_count > 0:
                    print(f"  Target {target_name} already exists with {target_count} objects")
                    # fix: was an f-string with no placeholders
                    print("  Skipping (already migrated)")
                    continue
                print(f"  Target {target_name} exists but empty, will populate")
            elif args.dry_run:
                print(f"  [DRY-RUN] Would create {target_name} collection")
            elif target_name == "Chunk":
                create_chunk_collection(client)
            elif target_name == "Summary":
                create_summary_collection(client)

            # fix: one call handles both modes; the previous if/else branches
            # were identical apart from the dry_run literal.
            migrate_objects(client, source_name, target_name, dry_run=args.dry_run)

            # Count validation only makes sense after a real copy.
            if not args.dry_run:
                validate_migration(client, source_name, target_name)

            print()

        # Final status
        print(banner)
        print("MIGRATION COMPLETE")
        print(banner)
        print("\nFinal state:")
        for source_name, target_name in MIGRATIONS:
            source_count = get_collection_count(client, source_name)
            target_count = get_collection_count(client, target_name)
            print(f"  {source_name}: {source_count} objects")
            print(f"  {target_name}: {target_count} objects")

        if not args.dry_run:
            print("\nNext steps:")
            print("  1. Update code: replace 'Chunk_v2' -> 'Chunk', 'Summary_v2' -> 'Summary'")
            print("  2. Test the application")
            print("  3. Run: python migrate_rename_collections.py --cleanup")

    finally:
        # Always release the client connection, even on error.
        client.close()
|
||||
|
||||
|
||||
# Script entry point: run the migration CLI when executed directly.
if __name__ == "__main__":
    main()
|
||||
@@ -9,8 +9,8 @@ Schema Architecture:
|
||||
querying. The hierarchy is::
|
||||
|
||||
Work (metadata only)
|
||||
├── Chunk_v2 (vectorized text fragments)
|
||||
└── Summary_v2 (vectorized chapter summaries)
|
||||
├── Chunk (vectorized text fragments)
|
||||
└── Summary (vectorized chapter summaries)
|
||||
|
||||
Collections:
|
||||
**Work** (no vectorization):
|
||||
@@ -18,21 +18,21 @@ Collections:
|
||||
Stores canonical metadata: title, author, year, language, genre.
|
||||
Not vectorized - used only for metadata and relationships.
|
||||
|
||||
**Chunk_v2** (manual GPU vectorization):
|
||||
**Chunk** (manual GPU vectorization):
|
||||
Text fragments optimized for semantic search (200-800 chars).
|
||||
Vectorized with Python GPU embedder (BAAI/bge-m3, 1024-dim).
|
||||
Vectorized fields: text, keywords.
|
||||
Non-vectorized fields: workTitle, workAuthor, sectionPath, chapterTitle, unitType, orderIndex.
|
||||
Includes nested Work reference for denormalized access.
|
||||
|
||||
**Summary_v2** (manual GPU vectorization):
|
||||
**Summary** (manual GPU vectorization):
|
||||
LLM-generated chapter/section summaries for high-level search.
|
||||
Vectorized with Python GPU embedder (BAAI/bge-m3, 1024-dim).
|
||||
Vectorized fields: text, concepts.
|
||||
Includes nested Work reference for denormalized access.
|
||||
|
||||
Vectorization Strategy:
|
||||
- Only Chunk_v2.text, Chunk_v2.keywords, Summary_v2.text, and Summary_v2.concepts are vectorized
|
||||
- Only Chunk.text, Chunk.keywords, Summary.text, and Summary.concepts are vectorized
|
||||
- Manual vectorization with Python GPU embedder (BAAI/bge-m3, 1024-dim, RTX 4070)
|
||||
- Metadata fields use skip_vectorization=True for filtering only
|
||||
- Work collection has no vectorizer (metadata only)
|
||||
@@ -56,8 +56,8 @@ Nested Objects:
|
||||
denormalized data access. This allows single-query retrieval of chunk
|
||||
data with its Work metadata without joins::
|
||||
|
||||
Chunk_v2.work = {title, author}
|
||||
Summary_v2.work = {title, author}
|
||||
Chunk.work = {title, author}
|
||||
Summary.work = {title, author}
|
||||
|
||||
Usage:
|
||||
From command line::
|
||||
|
||||
@@ -190,8 +190,8 @@ class DeleteResult(TypedDict, total=False):
|
||||
Attributes:
|
||||
success: Whether deletion succeeded.
|
||||
error: Error message if deletion failed.
|
||||
deleted_chunks: Number of chunks deleted from Chunk_v2 collection.
|
||||
deleted_summaries: Number of summaries deleted from Summary_v2 collection.
|
||||
deleted_chunks: Number of chunks deleted from Chunk collection.
|
||||
deleted_summaries: Number of summaries deleted from Summary collection.
|
||||
|
||||
Example:
|
||||
>>> result = delete_document_chunks("platon_republique")
|
||||
@@ -725,7 +725,7 @@ def ingest_summaries(
|
||||
Recursively processes nested TOC entries (children).
|
||||
"""
|
||||
try:
|
||||
summary_collection: Collection[Any, Any] = client.collections.get("Summary_v2")
|
||||
summary_collection: Collection[Any, Any] = client.collections.get("Summary")
|
||||
except Exception as e:
|
||||
logger.warning(f"Collection Summary non trouvée: {e}")
|
||||
return 0
|
||||
@@ -833,9 +833,9 @@ def ingest_document(
|
||||
) -> IngestResult:
|
||||
"""Ingest document chunks into Weaviate with nested objects.
|
||||
|
||||
Main ingestion function that inserts chunks into the Chunk_v2 collection
|
||||
Main ingestion function that inserts chunks into the Chunk collection
|
||||
with nested Work references. Optionally also creates entries in the
|
||||
Summary_v2 collection.
|
||||
Summary collection.
|
||||
|
||||
This function uses batch insertion for optimal performance and
|
||||
constructs proper nested objects for filtering capabilities.
|
||||
@@ -856,7 +856,7 @@ def ingest_document(
|
||||
toc: Optional table of contents for Summary collection.
|
||||
hierarchy: Optional complete document hierarchy structure.
|
||||
pages: Number of pages in source document. Defaults to 0.
|
||||
ingest_summary_collection: If True, also insert into Summary_v2
|
||||
ingest_summary_collection: If True, also insert into Summary
|
||||
collection (requires toc). Defaults to False.
|
||||
|
||||
Returns:
|
||||
@@ -911,7 +911,7 @@ def ingest_document(
|
||||
|
||||
# Récupérer la collection Chunk
|
||||
try:
|
||||
chunk_collection: Collection[Any, Any] = client.collections.get("Chunk_v2")
|
||||
chunk_collection: Collection[Any, Any] = client.collections.get("Chunk")
|
||||
except Exception as e:
|
||||
return IngestResult(
|
||||
success=False,
|
||||
@@ -983,14 +983,14 @@ def ingest_document(
|
||||
"keywords": chunk.get("concepts", chunk.get("keywords", [])),
|
||||
"language": language,
|
||||
"orderIndex": idx,
|
||||
# Use flat fields instead of nested objects for Chunk_v2 schema
|
||||
# Use flat fields instead of nested objects for Chunk schema
|
||||
"workTitle": title,
|
||||
"workAuthor": author,
|
||||
"year": metadata.get("year", 0) if metadata.get("year") else 0,
|
||||
# Note: document reference fields not used in current Chunk_v2 schema
|
||||
# Note: document reference fields not used in current Chunk schema
|
||||
}
|
||||
|
||||
# Note: Nested objects validation skipped for Chunk_v2 flat schema
|
||||
# Note: Nested objects validation skipped for Chunk flat schema
|
||||
# validate_chunk_nested_objects(chunk_obj, idx, doc_name)
|
||||
|
||||
objects_to_insert.append(chunk_obj)
|
||||
@@ -1130,7 +1130,7 @@ def delete_document_chunks(doc_name: str) -> DeleteResult:
|
||||
|
||||
# Supprimer les chunks (filtrer sur document.sourceId nested)
|
||||
try:
|
||||
chunk_collection: Collection[Any, Any] = client.collections.get("Chunk_v2")
|
||||
chunk_collection: Collection[Any, Any] = client.collections.get("Chunk")
|
||||
result = chunk_collection.data.delete_many(
|
||||
where=wvq.Filter.by_property("document.sourceId").equal(doc_name)
|
||||
)
|
||||
@@ -1140,7 +1140,7 @@ def delete_document_chunks(doc_name: str) -> DeleteResult:
|
||||
|
||||
# Supprimer les summaries (filtrer sur document.sourceId nested)
|
||||
try:
|
||||
summary_collection: Collection[Any, Any] = client.collections.get("Summary_v2")
|
||||
summary_collection: Collection[Any, Any] = client.collections.get("Summary")
|
||||
result = summary_collection.data.delete_many(
|
||||
where=wvq.Filter.by_property("document.sourceId").equal(doc_name)
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user