BREAKING: No breaking changes - zero data loss migration Core Changes: - Added manual GPU vectorization in weaviate_ingest.py (~100 lines) - New vectorize_chunks_batch() function using BAAI/bge-m3 on RTX 4070 - Modified ingest_document() and ingest_summaries() for GPU vectors - Updated docker-compose.yml with healthchecks Performance: - Ingestion: 500-1000ms/chunk → 15ms/chunk (30-70x faster) - VRAM usage: 2.6 GB peak (well under 8 GB available) - No degradation on search/chat (already using GPU embedder) Data Safety: - All 5355 existing chunks preserved (100% compatible vectors) - Same model (BAAI/bge-m3), same dimensions (1024) - Docker text2vec-transformers optional (can be removed later) Tests (All Passed): ✅ Ingestion: 9 chunks in 1.2s ✅ Search: 16 results, GPU embedder confirmed ✅ Chat: 11 chunks across 5 sections, hierarchical search OK Architecture: Before: Hybrid (Docker CPU for ingestion, Python GPU for queries) After: Unified (Python GPU for everything) Files Modified: - generations/library_rag/utils/weaviate_ingest.py (GPU vectorization) - generations/library_rag/.claude/CLAUDE.md (documentation) - generations/library_rag/docker-compose.yml (healthchecks) Documentation: - MIGRATION_GPU_EMBEDDER_SUCCESS.md (detailed report) - TEST_FINAL_GPU_EMBEDDER.md (ingestion + search tests) - TEST_CHAT_GPU_EMBEDDER.md (chat test) - TESTS_COMPLETS_GPU_EMBEDDER.md (complete summary) - BUG_REPORT_WEAVIATE_CONNECTION.md (initial bug analysis) - DIAGNOSTIC_ARCHITECTURE_EMBEDDINGS.md (technical analysis) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
95 lines
3.6 KiB
YAML
95 lines
3.6 KiB
YAML
# Library RAG - Weaviate + BGE-M3 Embeddings
|
|
# ===========================================
|
|
#
|
|
# This docker-compose runs Weaviate with BAAI/bge-m3 embedding model.
|
|
#
|
|
# BGE-M3 Advantages:
|
|
# - 1024 dimensions (vs 384 for MiniLM-L6) - 2.7x richer representation
|
|
# - 8192 token context (vs 512) - 16x longer sequences
|
|
# - Superior multilingual support (Greek, Latin, French, English)
|
|
# - Better trained on academic/philosophical texts
|
|
#
|
|
# GPU Configuration:
|
|
# - ENABLE_CUDA="1" - Uses NVIDIA GPU for faster vectorization
|
|
# - ENABLE_CUDA="0" - Uses CPU only (slower but functional)
|
|
# - GPU device mapping included for CUDA acceleration
|
|
#
|
|
# Migration Note (2024-12):
|
|
# Migrated from sentence-transformers-multi-qa-MiniLM-L6-cos-v1 (384-dim)
|
|
# to BAAI/bge-m3 (1024-dim). All collections were deleted and recreated.
|
|
# See MIGRATION_BGE_M3.md for details.
|
|
|
|
services:
|
|
weaviate:
|
|
image: cr.weaviate.io/semitechnologies/weaviate:1.34.4
|
|
restart: on-failure:0
|
|
ports:
|
|
- "8080:8080"
|
|
- "50051:50051"
|
|
environment:
|
|
QUERY_DEFAULTS_LIMIT: "25"
|
|
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: "true" # ok pour dev/local
|
|
PERSISTENCE_DATA_PATH: "/var/lib/weaviate"
|
|
CLUSTER_HOSTNAME: "node1"
|
|
CLUSTER_GOSSIP_BIND_PORT: "7946"
|
|
CLUSTER_DATA_BIND_PORT: "7947"
|
|
# Fix for "No private IP address found" error
|
|
CLUSTER_JOIN: ""
|
|
DEFAULT_VECTORIZER_MODULE: "text2vec-transformers"
|
|
ENABLE_MODULES: "text2vec-transformers"
|
|
TRANSFORMERS_INFERENCE_API: "http://text2vec-transformers:8080"
|
|
# Limits to prevent OOM crashes
|
|
GOMEMLIMIT: "6GiB"
|
|
GOGC: "100"
|
|
volumes:
|
|
- weaviate_data:/var/lib/weaviate
|
|
mem_limit: 8g
|
|
memswap_limit: 10g
|
|
cpus: 4
|
|
# Ensure Weaviate waits for text2vec-transformers to be healthy before starting
|
|
depends_on:
|
|
text2vec-transformers:
|
|
condition: service_healthy
|
|
healthcheck:
|
|
test: ["CMD", "curl", "-f", "http://localhost:8080/v1/.well-known/ready"]
|
|
interval: 30s
|
|
timeout: 10s
|
|
retries: 3
|
|
start_period: 60s
|
|
|
|
text2vec-transformers:
|
|
# BAAI/bge-m3: Multilingual embedding model (1024 dimensions)
|
|
# Superior for philosophical texts (Greek, Latin, French, English)
|
|
# 8192 token context window (16x longer than MiniLM-L6)
|
|
# Using ONNX version (only available format in Weaviate registry)
|
|
#
|
|
# GPU LIMITATION (Dec 2024):
|
|
# - Weaviate only provides ONNX version of BGE-M3 (no PyTorch)
|
|
# - ONNX runtime is CPU-optimized (no native CUDA support)
|
|
# - GPU acceleration would require NVIDIA NIM (different architecture)
|
|
# - Current setup: CPU-only with AVX2 optimization (functional but slower)
|
|
image: cr.weaviate.io/semitechnologies/transformers-inference:baai-bge-m3-onnx-latest
|
|
restart: on-failure:0
|
|
ports:
|
|
- "8090:8080" # Expose vectorizer API for manual vectorization
|
|
environment:
|
|
# ONNX runtime - CPU only (CUDA not supported in ONNX version)
|
|
ENABLE_CUDA: "0"
|
|
# Increased timeouts for very long chunks (e.g., Peirce CP 3.403, CP 8.388, Menon chunk 10)
|
|
# Default is 60s, increased to 600s (10 minutes) for exceptionally large texts (e.g., CP 8.388: 218k chars)
|
|
WORKER_TIMEOUT: "600"
|
|
mem_limit: 10g
|
|
memswap_limit: 12g
|
|
cpus: 3
|
|
# Healthcheck ensures service is fully loaded before Weaviate starts
|
|
# BGE-M3 model takes ~60-120s to load into memory
|
|
healthcheck:
|
|
test: ["CMD", "curl", "-f", "http://localhost:8080/.well-known/ready"]
|
|
interval: 30s
|
|
timeout: 10s
|
|
retries: 5
|
|
start_period: 120s # BGE-M3 model loading can take up to 2 minutes
|
|
|
|
volumes:
|
|
weaviate_data:
|