fix: Replace text2vec-transformers with BGE-M3 in MCP retrieval tools
The text2vec-transformers Docker service was removed in Jan 2026, but retrieval_tools.py still used near_text(), which requires it. The module now uses the GPU embedder (BGE-M3) with near_vector(), matching flask_app.py. Changes: add a GPU embedder singleton (get_gpu_embedder); switch search_chunks_handler from near_text to near_vector with BGE-M3; switch search_summaries_handler from near_text to near_vector with BGE-M3. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -69,9 +69,39 @@ from mcp_tools.logging_config import (
|
|||||||
log_weaviate_query,
|
log_weaviate_query,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# GPU embedder for BGE-M3 vectorization (replaces text2vec-transformers)
|
||||||
|
from memory.core import get_embedder
|
||||||
|
|
||||||
# Logger for this module - uses structured logging
|
# Logger for this module - uses structured logging
|
||||||
logger = get_tool_logger("retrieval")
|
logger = get_tool_logger("retrieval")
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# GPU Embedder Singleton (BGE-M3)
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# Module-level singleton holding the shared BGE-M3 embedding service;
# populated lazily on first call to get_gpu_embedder().
_embedder = None


def get_gpu_embedder():
    """Get or create the GPU embedder singleton for BGE-M3 vectorization.

    Returns the shared GPU embedding service instance. The embedder uses
    the BAAI/bge-m3 model (1024 dimensions) for semantic vectorization of
    retrieval queries (replacing the removed text2vec-transformers service).

    Returns:
        GPUEmbeddingService instance (as provided by memory.core.get_embedder).

    Note:
        This singleton pattern ensures the model is loaded only once,
        avoiding repeated GPU memory allocation. It is not guarded by a
        lock; concurrent first calls could race — acceptable here since
        get_embedder() is assumed idempotent (TODO confirm).
    """
    global _embedder
    if _embedder is None:
        logger.info("Initializing GPU embedder (BGE-M3) for retrieval...")
        _embedder = get_embedder()
        # Lazy %-style args: the message is only formatted if INFO is enabled.
        logger.info("GPU embedder ready: %s", _embedder.model_name)
    return _embedder
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Canonical Reference Extraction
|
# Canonical Reference Extraction
|
||||||
@@ -472,10 +502,14 @@ async def search_chunks_handler(input_data: SearchChunksInput) -> SearchChunksOu
|
|||||||
)
|
)
|
||||||
filters = (filters & lang_f) if filters else lang_f
|
filters = (filters & lang_f) if filters else lang_f
|
||||||
|
|
||||||
# Perform near_text query with timing
|
# Vectorize query with GPU embedder (BGE-M3)
|
||||||
|
embedder = get_gpu_embedder()
|
||||||
|
query_vector = embedder.embed_single(input_data.query)
|
||||||
|
|
||||||
|
# Perform near_vector query with timing
|
||||||
query_start = time.perf_counter()
|
query_start = time.perf_counter()
|
||||||
result = chunks.query.near_text(
|
result = chunks.query.near_vector(
|
||||||
query=input_data.query,
|
near_vector=query_vector.tolist(),
|
||||||
limit=input_data.limit,
|
limit=input_data.limit,
|
||||||
filters=filters,
|
filters=filters,
|
||||||
return_metadata=wvq.MetadataQuery(distance=True),
|
return_metadata=wvq.MetadataQuery(distance=True),
|
||||||
@@ -484,7 +518,7 @@ async def search_chunks_handler(input_data: SearchChunksInput) -> SearchChunksOu
|
|||||||
|
|
||||||
# Log Weaviate query
|
# Log Weaviate query
|
||||||
log_weaviate_query(
|
log_weaviate_query(
|
||||||
operation="near_text",
|
operation="near_vector",
|
||||||
collection="Chunk",
|
collection="Chunk",
|
||||||
filters={
|
filters={
|
||||||
"author": input_data.author_filter,
|
"author": input_data.author_filter,
|
||||||
@@ -619,10 +653,14 @@ async def search_summaries_handler(
|
|||||||
)
|
)
|
||||||
filters = (filters & max_filter) if filters else max_filter
|
filters = (filters & max_filter) if filters else max_filter
|
||||||
|
|
||||||
# Perform near_text query with timing
|
# Vectorize query with GPU embedder (BGE-M3)
|
||||||
|
embedder = get_gpu_embedder()
|
||||||
|
query_vector = embedder.embed_single(input_data.query)
|
||||||
|
|
||||||
|
# Perform near_vector query with timing
|
||||||
query_start = time.perf_counter()
|
query_start = time.perf_counter()
|
||||||
result = summaries.query.near_text(
|
result = summaries.query.near_vector(
|
||||||
query=input_data.query,
|
near_vector=query_vector.tolist(),
|
||||||
limit=input_data.limit,
|
limit=input_data.limit,
|
||||||
filters=filters,
|
filters=filters,
|
||||||
return_metadata=wvq.MetadataQuery(distance=True),
|
return_metadata=wvq.MetadataQuery(distance=True),
|
||||||
@@ -631,7 +669,7 @@ async def search_summaries_handler(
|
|||||||
|
|
||||||
# Log Weaviate query
|
# Log Weaviate query
|
||||||
log_weaviate_query(
|
log_weaviate_query(
|
||||||
operation="near_text",
|
operation="near_vector",
|
||||||
collection="Summary",
|
collection="Summary",
|
||||||
filters={
|
filters={
|
||||||
"min_level": input_data.min_level,
|
"min_level": input_data.min_level,
|
||||||
|
|||||||
Reference in New Issue
Block a user