Source code for langchain_mongodb.retrievers.full_text_search
from typing import Any, Dict, List, Optional
from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from pymongo.collection import Collection
from langchain_mongodb.pipelines import text_search_stage
from langchain_mongodb.utils import make_serializable
[docs]class MongoDBAtlasFullTextSearchRetriever(BaseRetriever):
"""Hybrid Search Retriever performs full-text searches
using Lucene's standard (BM25) analyzer.
"""
collection: Collection
"""MongoDB Collection on an Atlas cluster"""
search_index_name: str
"""Atlas Search Index name"""
search_field: str
"""Collection field that contains the text to be searched. It must be indexed"""
top_k: Optional[int] = None
"""Number of documents to return. Default is no limit"""
filter: Optional[Dict[str, Any]] = None
"""(Optional) List of MQL match expression comparing an indexed field"""
show_embeddings: float = False
"""If true, returned Document metadata will include vectors"""
def _get_relevant_documents(
self, query: str, *, run_manager: CallbackManagerForRetrieverRun
) -> List[Document]:
"""Retrieve documents that are highest scoring / most similar to query.
Args:
query: String to find relevant documents for
run_manager: The callback handler to use
Returns:
List of relevant documents
"""
pipeline = text_search_stage( # type: ignore
query=query,
search_field=self.search_field,
index_name=self.search_index_name,
limit=self.top_k,
filter=self.filter,
)
# Execution
cursor = self.collection.aggregate(pipeline) # type: ignore[arg-type]
# Formatting
docs = []
for res in cursor:
text = res.pop(self.search_field)
make_serializable(res)
docs.append(Document(page_content=text, metadata=res))
return docs