[docs]classMongoDBAtlasHybridSearchRetriever(BaseRetriever):"""Hybrid Search Retriever combines vector and full-text searches weighting them the via Reciprocal Rank Fusion (RRF) algorithm. Increasing the vector_penalty will reduce the importance on the vector search. Increasing the fulltext_penalty will correspondingly reduce the fulltext score. For more on the algorithm,see https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking """vectorstore:MongoDBAtlasVectorSearch"""MongoDBAtlas VectorStore"""search_index_name:str"""Atlas Search Index (full-text) name"""top_k:int=4"""Number of documents to return."""oversampling_factor:int=10"""This times top_k is the number of candidates chosen at each step"""pre_filter:Optional[Dict[str,Any]]=None"""(Optional) Any MQL match expression comparing an indexed field"""post_filter:Optional[List[Dict[str,Any]]]=None"""(Optional) Pipeline of MongoDB aggregation stages for postprocessing."""vector_penalty:float=60.0"""Penalty applied to vector search results in RRF: scores=1/(rank + penalty)"""fulltext_penalty:float=60.0"""Penalty applied to full-text search results in RRF: scores=1/(rank + penalty)"""show_embeddings:float=False"""If true, returned Document metadata will include vectors."""@propertydefcollection(self)->Collection:returnself.vectorstore._collectiondef_get_relevant_documents(self,query:str,*,run_manager:CallbackManagerForRetrieverRun)->List[Document]:"""Retrieve documents that are highest scoring / most similar to query. Note that the same query is used in both searches, embedded for vector search, and as-is for full-text search. Args: query: String to find relevant documents for run_manager: The callback handler to use Returns: List of relevant documents """query_vector=self.vectorstore._embedding.embed_query(query)scores_fields=["vector_score","fulltext_score"]pipeline:List[Any]=[]# First we build up the aggregation pipeline,# then it is passed to the server to execute# Vector Search stagevector_pipeline=[vector_search_stage(query_vector=query_vector,search_field=self.vectorstore._embedding_key,index_name=self.vectorstore._index_name,top_k=self.top_k,filter=self.pre_filter,oversampling_factor=self.oversampling_factor,)]vector_pipeline+=reciprocal_rank_stage("vector_score",self.vector_penalty)combine_pipelines(pipeline,vector_pipeline,self.collection.name)# Full-Text Search stagetext_pipeline=text_search_stage(query=query,search_field=self.vectorstore._text_key,index_name=self.search_index_name,limit=self.top_k,filter=self.pre_filter,)text_pipeline.extend(reciprocal_rank_stage("fulltext_score",self.fulltext_penalty))combine_pipelines(pipeline,text_pipeline,self.collection.name)# Sum and sort stagepipeline.extend(final_hybrid_stage(scores_fields=scores_fields,limit=self.top_k))# Removal of embeddings unless requested.ifnotself.show_embeddings:pipeline.append({"$project":{self.vectorstore._embedding_key:0}})# Post filteringifself.post_filterisnotNone:pipeline.extend(self.post_filter)# Executioncursor=self.collection.aggregate(pipeline)# type: ignore[arg-type]# Formattingdocs=[]forresincursor:text=res.pop(self.vectorstore._text_key)# score = res.pop("score") # The score remains buried!make_serializable(res)docs.append(Document(page_content=text,metadata=res))returndocs