"""Azure AI Search vector store implementation (``langchain_community.vectorstores.azuresearch``)."""
from __future__ import annotations

import asyncio
import base64
import itertools
import json
import logging
import time
import uuid
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    ClassVar,
    Collection,
    Dict,
    Iterable,
    List,
    Literal,
    Optional,
    Tuple,
    Type,
    Union,
    cast,
)

import numpy as np
from langchain_core.callbacks import (
    AsyncCallbackManagerForRetrieverRun,
    CallbackManagerForRetrieverRun,
)
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.exceptions import LangChainException
from langchain_core.retrievers import BaseRetriever
from langchain_core.utils import get_from_env
from langchain_core.vectorstores import VectorStore
from pydantic import ConfigDict, model_validator

from langchain_community.vectorstores.utils import maximal_marginal_relevance

logger = logging.getLogger()

if TYPE_CHECKING:
    from azure.core.credentials import TokenCredential
    from azure.core.credentials_async import AsyncTokenCredential
    from azure.search.documents import SearchClient, SearchItemPaged
    from azure.search.documents.aio import (
        AsyncSearchItemPaged,
    )
    from azure.search.documents.aio import (
        SearchClient as AsyncSearchClient,
    )
    from azure.search.documents.indexes.models import (
        CorsOptions,
        ScoringProfile,
        SearchField,
        SemanticConfiguration,
        VectorSearch,
    )
# Allow overriding field names for Azure Search via environment variables.
FIELDS_ID = get_from_env(
    key="AZURESEARCH_FIELDS_ID", env_key="AZURESEARCH_FIELDS_ID", default="id"
)
FIELDS_CONTENT = get_from_env(
    key="AZURESEARCH_FIELDS_CONTENT",
    env_key="AZURESEARCH_FIELDS_CONTENT",
    default="content",
)
FIELDS_CONTENT_VECTOR = get_from_env(
    key="AZURESEARCH_FIELDS_CONTENT_VECTOR",
    env_key="AZURESEARCH_FIELDS_CONTENT_VECTOR",
    default="content_vector",
)
FIELDS_METADATA = get_from_env(
    key="AZURESEARCH_FIELDS_TAG", env_key="AZURESEARCH_FIELDS_TAG", default="metadata"
)

# Documents are uploaded to the index in batches of at most this many entries.
MAX_UPLOAD_BATCH_SIZE = 1000


def _get_search_client(
    endpoint: str,
    index_name: str,
    key: Optional[str] = None,
    azure_ad_access_token: Optional[str] = None,
    semantic_configuration_name: Optional[str] = None,
    fields: Optional[List[SearchField]] = None,
    vector_search: Optional[VectorSearch] = None,
    semantic_configurations: Optional[
        Union[SemanticConfiguration, List[SemanticConfiguration]]
    ] = None,
    scoring_profiles: Optional[List[ScoringProfile]] = None,
    default_scoring_profile: Optional[str] = None,
    default_fields: Optional[List[SearchField]] = None,
    user_agent: Optional[str] = "langchain-comm-python-azure-search",
    cors_options: Optional[CorsOptions] = None,
    async_: bool = False,
    additional_search_client_options: Optional[Dict[str, Any]] = None,
    azure_credential: Optional[TokenCredential] = None,
    azure_async_credential: Optional[AsyncTokenCredential] = None,
) -> Union[SearchClient, AsyncSearchClient]:
    """Build a (sync or async) Azure SearchClient, creating the index if needed.

    Credential precedence: explicit ``key`` (or the special value
    ``"INTERACTIVE"``) > ``azure_ad_access_token`` > supplied
    ``azure_credential``/``azure_async_credential`` > DefaultAzureCredential.
    If the index does not exist, it is created with the provided (or default)
    fields, vector-search and semantic-search configuration.
    """
    from azure.core.credentials import AccessToken, AzureKeyCredential, TokenCredential
    from azure.core.exceptions import ResourceNotFoundError
    from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
    from azure.identity.aio import (
        DefaultAzureCredential as AsyncDefaultAzureCredential,
    )
    from azure.search.documents import SearchClient
    from azure.search.documents.aio import SearchClient as AsyncSearchClient
    from azure.search.documents.indexes import SearchIndexClient
    from azure.search.documents.indexes.models import (
        ExhaustiveKnnAlgorithmConfiguration,
        ExhaustiveKnnParameters,
        HnswAlgorithmConfiguration,
        HnswParameters,
        SearchIndex,
        SemanticConfiguration,
        SemanticField,
        SemanticPrioritizedFields,
        SemanticSearch,
        VectorSearch,
        VectorSearchAlgorithmKind,
        VectorSearchAlgorithmMetric,
        VectorSearchProfile,
    )

    class AzureBearerTokenCredential(TokenCredential):
        """Wraps a pre-fetched AAD access token as a TokenCredential."""

        def __init__(self, token: str):
            # set the expiry to an hour from now.
            self._token = AccessToken(token, int(time.time()) + 3600)

        def get_token(
            self,
            *scopes: str,
            claims: Optional[str] = None,
            tenant_id: Optional[str] = None,
            enable_cae: bool = False,
            **kwargs: Any,
        ) -> AccessToken:
            return self._token

    additional_search_client_options = additional_search_client_options or {}
    default_fields = default_fields or []
    credential: Union[AzureKeyCredential, TokenCredential, InteractiveBrowserCredential]
    # Determine the appropriate credential to use
    if key is not None:
        if key.upper() == "INTERACTIVE":
            credential = InteractiveBrowserCredential()
            # Prime the browser flow so the user signs in up front.
            credential.get_token("https://search.azure.com/.default")
            async_credential = credential
        else:
            credential = AzureKeyCredential(key)
            async_credential = credential
    elif azure_ad_access_token is not None:
        credential = AzureBearerTokenCredential(azure_ad_access_token)
        async_credential = credential
    else:
        credential = azure_credential or DefaultAzureCredential()
        async_credential = azure_async_credential or AsyncDefaultAzureCredential()

    index_client: SearchIndexClient = SearchIndexClient(
        endpoint=endpoint,
        credential=credential,
        user_agent=user_agent,
        **additional_search_client_options,
    )
    try:
        index_client.get_index(name=index_name)
    except ResourceNotFoundError:
        # Index is missing: build it. Fields configuration first.
        if fields is not None:
            # Check that all mandatory (default) fields are present.
            fields_types = {f.name: f.type for f in fields}
            mandatory_fields = {df.name: df.type for df in default_fields}
            # Check for missing keys
            missing_fields = {
                key: mandatory_fields[key]
                for key, value in set(mandatory_fields.items())
                - set(fields_types.items())
            }
            if len(missing_fields) > 0:
                # Helper for formatting field information for each missing field.
                def fmt_err(x: str) -> str:
                    return (
                        f"{x} current type: '{fields_types.get(x, 'MISSING')}'. "
                        f"It has to be '{mandatory_fields.get(x)}' or you can point "
                        f"to a different '{mandatory_fields.get(x)}' field name by "
                        f"using the env variable 'AZURESEARCH_FIELDS_{x.upper()}'"
                    )

                error = "\n".join([fmt_err(x) for x in missing_fields])
                raise ValueError(
                    f"You need to specify at least the following fields "
                    f"{missing_fields} or provide alternative field names in the env "
                    f"variables.\n\n{error}"
                )
        else:
            fields = default_fields
        # Vector search configuration: default to HNSW + exhaustive KNN, cosine.
        if vector_search is None:
            vector_search = VectorSearch(
                algorithms=[
                    HnswAlgorithmConfiguration(
                        name="default",
                        kind=VectorSearchAlgorithmKind.HNSW,
                        parameters=HnswParameters(
                            m=4,
                            ef_construction=400,
                            ef_search=500,
                            metric=VectorSearchAlgorithmMetric.COSINE,
                        ),
                    ),
                    ExhaustiveKnnAlgorithmConfiguration(
                        name="default_exhaustive_knn",
                        kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
                        parameters=ExhaustiveKnnParameters(
                            metric=VectorSearchAlgorithmMetric.COSINE
                        ),
                    ),
                ],
                profiles=[
                    VectorSearchProfile(
                        name="myHnswProfile",
                        algorithm_configuration_name="default",
                    ),
                    VectorSearchProfile(
                        name="myExhaustiveKnnProfile",
                        algorithm_configuration_name="default_exhaustive_knn",
                    ),
                ],
            )
        # Create the semantic settings with the configuration
        if semantic_configurations:
            if not isinstance(semantic_configurations, list):
                semantic_configurations = [semantic_configurations]
            semantic_search = SemanticSearch(
                configurations=semantic_configurations,
                default_configuration_name=semantic_configuration_name,
            )
        elif semantic_configuration_name:
            # use default semantic configuration
            semantic_configuration = SemanticConfiguration(
                name=semantic_configuration_name,
                prioritized_fields=SemanticPrioritizedFields(
                    content_fields=[SemanticField(field_name=FIELDS_CONTENT)],
                ),
            )
            semantic_search = SemanticSearch(configurations=[semantic_configuration])
        else:
            # don't use semantic search
            semantic_search = None
        # Create the search index with the semantic settings and vector search
        index = SearchIndex(
            name=index_name,
            fields=fields,
            vector_search=vector_search,
            semantic_search=semantic_search,
            scoring_profiles=scoring_profiles,
            default_scoring_profile=default_scoring_profile,
            cors_options=cors_options,
        )
        index_client.create_index(index)
    # Create the search client
    if not async_:
        return SearchClient(
            endpoint=endpoint,
            index_name=index_name,
            credential=credential,
            user_agent=user_agent,
            **additional_search_client_options,
        )
    else:
        return AsyncSearchClient(
            endpoint=endpoint,
            index_name=index_name,
            credential=async_credential,
            user_agent=user_agent,
            **additional_search_client_options,
        )
def __init__(
    self,
    azure_search_endpoint: str,
    azure_search_key: Optional[str],
    index_name: str,
    embedding_function: Union[Callable, Embeddings],
    search_type: str = "hybrid",
    semantic_configuration_name: Optional[str] = None,
    fields: Optional[List[SearchField]] = None,
    vector_search: Optional[VectorSearch] = None,
    semantic_configurations: Optional[
        Union[SemanticConfiguration, List[SemanticConfiguration]]
    ] = None,
    scoring_profiles: Optional[List[ScoringProfile]] = None,
    default_scoring_profile: Optional[str] = None,
    cors_options: Optional[CorsOptions] = None,
    *,
    vector_search_dimensions: Optional[int] = None,
    additional_search_client_options: Optional[Dict[str, Any]] = None,
    azure_ad_access_token: Optional[str] = None,
    azure_credential: Optional[TokenCredential] = None,
    azure_async_credential: Optional[AsyncTokenCredential] = None,
    **kwargs: Any,
):
    """Initialize with necessary components."""
    try:
        from azure.search.documents.indexes.models import (
            SearchableField,
            SearchField,
            SearchFieldDataType,
            SimpleField,
        )
    except ImportError as e:
        raise ImportError(
            "Unable to import azure.search.documents. Please install with "
            "`pip install -U azure-search-documents`."
        ) from e
    # Initialize base class
    self.embedding_function = embedding_function
    if isinstance(self.embedding_function, Embeddings):
        self.embed_query = self.embedding_function.embed_query
    else:
        self.embed_query = self.embedding_function
    default_fields = [
        SimpleField(
            name=FIELDS_ID,
            type=SearchFieldDataType.String,
            key=True,
            filterable=True,
        ),
        SearchableField(
            name=FIELDS_CONTENT,
            type=SearchFieldDataType.String,
        ),
        SearchField(
            name=FIELDS_CONTENT_VECTOR,
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            # If dimensions are not given, probe the embedding function once.
            vector_search_dimensions=vector_search_dimensions
            or len(self.embed_query("Text")),
            vector_search_profile_name="myHnswProfile",
        ),
        SearchableField(
            name=FIELDS_METADATA,
            type=SearchFieldDataType.String,
        ),
    ]
    user_agent = "langchain"
    if "user_agent" in kwargs and kwargs["user_agent"]:
        user_agent += " " + kwargs["user_agent"]
    self.client = _get_search_client(
        azure_search_endpoint,
        index_name,
        azure_search_key,
        azure_ad_access_token,
        semantic_configuration_name=semantic_configuration_name,
        fields=fields,
        vector_search=vector_search,
        semantic_configurations=semantic_configurations,
        scoring_profiles=scoring_profiles,
        default_scoring_profile=default_scoring_profile,
        default_fields=default_fields,
        user_agent=user_agent,
        cors_options=cors_options,
        additional_search_client_options=additional_search_client_options,
        azure_credential=azure_credential,
    )
    self.async_client = _get_search_client(
        azure_search_endpoint,
        index_name,
        azure_search_key,
        azure_ad_access_token,
        semantic_configuration_name=semantic_configuration_name,
        fields=fields,
        vector_search=vector_search,
        semantic_configurations=semantic_configurations,
        scoring_profiles=scoring_profiles,
        default_scoring_profile=default_scoring_profile,
        default_fields=default_fields,
        user_agent=user_agent,
        cors_options=cors_options,
        async_=True,
        azure_credential=azure_credential,
        azure_async_credential=azure_async_credential,
    )
    self.search_type = search_type
    self.semantic_configuration_name = semantic_configuration_name
    self.fields = fields if fields else default_fields
    # Keep constructor arguments around (e.g. for serialization/recreation).
    self._azure_search_endpoint = azure_search_endpoint
    self._azure_search_key = azure_search_key
    self._index_name = index_name
    self._semantic_configuration_name = semantic_configuration_name
    self._fields = fields
    self._vector_search = vector_search
    self._semantic_configurations = semantic_configurations
    self._scoring_profiles = scoring_profiles
    self._default_scoring_profile = default_scoring_profile
    self._default_fields = default_fields
    self._user_agent = user_agent
    self._cors_options = cors_options
def __del__(self) -> None:
    """Best-effort cleanup of the underlying Azure Search clients."""
    # Close the sync client
    if hasattr(self, "client") and self.client:
        self.client.close()
    # Close the async client
    if hasattr(self, "async_client") and self.async_client:
        # Check if we're in an existing event loop
        # NOTE(review): closing an async client from __del__ is inherently
        # fragile — the task scheduled below may never run if the loop shuts
        # down first.
        try:
            loop = asyncio.get_event_loop()
            if loop.is_running():
                # Schedule the coroutine to close the async client
                loop.create_task(self.async_client.close())
            else:
                # If no event loop is running, run the coroutine directly
                loop.run_until_complete(self.async_client.close())
        except RuntimeError:
            # Handle the case where there's no event loop
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                loop.run_until_complete(self.async_client.close())
            finally:
                loop.close()

@property
def embeddings(self) -> Optional[Embeddings]:
    # TODO: Support embedding object directly
    return (
        self.embedding_function
        if isinstance(self.embedding_function, Embeddings)
        else None
    )

async def _aembed_query(self, text: str) -> List[float]:
    """Embed ``text`` asynchronously, falling back to the sync callable."""
    if self.embeddings:
        return await self.embeddings.aembed_query(text)
    else:
        return cast(Callable, self.embedding_function)(text)
def add_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[dict]] = None,
    *,
    keys: Optional[List[str]] = None,
    **kwargs: Any,
) -> List[str]:
    """Add texts data to an existing index."""
    # Materialize once: `texts` may be a one-shot generator, and it is
    # iterated again below for the texts/embeddings zip.
    texts = list(texts)
    # batching support if embedding function is an Embeddings object
    if isinstance(self.embedding_function, Embeddings):
        try:
            embeddings = self.embedding_function.embed_documents(texts)
        except NotImplementedError:
            embeddings = [self.embedding_function.embed_query(x) for x in texts]
    else:
        embeddings = [self.embedding_function(x) for x in texts]

    if len(embeddings) == 0:
        logger.debug("Nothing to insert, skipping.")
        return []

    # when `keys` are not passed in and there is `ids` in kwargs, use those instead
    # base class expects `ids` passed in rather than `keys`
    # https://github.com/langchain-ai/langchain/blob/4cdaca67dc51dba887289f56c6fead3c1a52f97d/libs/core/langchain_core/vectorstores/base.py#L65
    if (not keys) and ("ids" in kwargs) and (len(kwargs["ids"]) == len(embeddings)):
        keys = kwargs["ids"]

    return self.add_embeddings(zip(texts, embeddings), metadatas, keys=keys)
async def aadd_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[dict]] = None,
    *,
    keys: Optional[List[str]] = None,
    **kwargs: Any,
) -> List[str]:
    """Add texts data to an existing index (async)."""
    # Materialize once: `texts` may be a one-shot generator, and it is
    # iterated again below for the texts/embeddings zip.
    texts = list(texts)
    if isinstance(self.embedding_function, Embeddings):
        try:
            embeddings = await self.embedding_function.aembed_documents(texts)
        except NotImplementedError:
            embeddings = [
                await self.embedding_function.aembed_query(x) for x in texts
            ]
    else:
        embeddings = [self.embedding_function(x) for x in texts]

    if len(embeddings) == 0:
        logger.debug("Nothing to insert, skipping.")
        return []

    # when `keys` are not passed in and there is `ids` in kwargs, use those instead
    # base class expects `ids` passed in rather than `keys`
    # https://github.com/langchain-ai/langchain/blob/4cdaca67dc51dba887289f56c6fead3c1a52f97d/libs/core/langchain_core/vectorstores/base.py#L65
    if (not keys) and ("ids" in kwargs) and (len(kwargs["ids"]) == len(embeddings)):
        keys = kwargs["ids"]

    return await self.aadd_embeddings(zip(texts, embeddings), metadatas, keys=keys)
def add_embeddings(
    self,
    text_embeddings: Iterable[Tuple[str, List[float]]],
    metadatas: Optional[List[dict]] = None,
    *,
    keys: Optional[List[str]] = None,
) -> List[str]:
    """Add embeddings to an existing index.

    Raises:
        LangChainException: if any document in a batch fails to upload.
    """
    ids = []
    # Write data to index
    data = []
    for i, (text, embedding) in enumerate(text_embeddings):
        # Use provided key otherwise use default key
        if keys:
            key = keys[i]
        else:
            key = str(uuid.uuid4())
        # Encoding key for Azure Search valid characters
        key = base64.urlsafe_b64encode(bytes(key, "utf-8")).decode("ascii")
        metadata = metadatas[i] if metadatas else {}
        # Add data to index
        # Additional metadata to fields mapping
        doc = {
            "@search.action": "upload",
            FIELDS_ID: key,
            FIELDS_CONTENT: text,
            FIELDS_CONTENT_VECTOR: np.array(embedding, dtype=np.float32).tolist(),
            FIELDS_METADATA: json.dumps(metadata),
        }
        if metadata:
            # Promote metadata entries that match declared index fields.
            additional_fields = {
                k: v
                for k, v in metadata.items()
                if k in [x.name for x in self.fields]
            }
            doc.update(additional_fields)
        data.append(doc)
        ids.append(key)
        # Upload data in batches
        if len(data) == MAX_UPLOAD_BATCH_SIZE:
            response = self.client.upload_documents(documents=data)
            # Check if all documents were successfully uploaded
            if not all(r.succeeded for r in response):
                raise LangChainException(response)
            # Reset data
            data = []

    # Considering case where data is an exact multiple of batch-size entries
    if len(data) == 0:
        return ids

    # Upload data to index
    response = self.client.upload_documents(documents=data)
    # Check if all documents were successfully uploaded
    if all(r.succeeded for r in response):
        return ids
    else:
        raise LangChainException(response)
async def aadd_embeddings(
    self,
    text_embeddings: Iterable[Tuple[str, List[float]]],
    metadatas: Optional[List[dict]] = None,
    *,
    keys: Optional[List[str]] = None,
) -> List[str]:
    """Add embeddings to an existing index (async).

    Raises:
        LangChainException: if any document in a batch fails to upload.
    """
    ids = []
    # Write data to index
    data = []
    for i, (text, embedding) in enumerate(text_embeddings):
        # Use provided key otherwise use default key
        key = keys[i] if keys else str(uuid.uuid4())
        # Encoding key for Azure Search valid characters
        key = base64.urlsafe_b64encode(bytes(key, "utf-8")).decode("ascii")
        metadata = metadatas[i] if metadatas else {}
        # Add data to index
        # Additional metadata to fields mapping
        doc = {
            "@search.action": "upload",
            FIELDS_ID: key,
            FIELDS_CONTENT: text,
            FIELDS_CONTENT_VECTOR: np.array(embedding, dtype=np.float32).tolist(),
            FIELDS_METADATA: json.dumps(metadata),
        }
        if metadata:
            # Promote metadata entries that match declared index fields.
            additional_fields = {
                k: v
                for k, v in metadata.items()
                if k in [x.name for x in self.fields]
            }
            doc.update(additional_fields)
        data.append(doc)
        ids.append(key)
        # Upload data in batches
        if len(data) == MAX_UPLOAD_BATCH_SIZE:
            response = await self.async_client.upload_documents(documents=data)
            # Check if all documents were successfully uploaded
            if not all(r.succeeded for r in response):
                raise LangChainException(response)
            # Reset data
            data = []

    # Considering case where data is an exact multiple of batch-size entries
    if len(data) == 0:
        return ids

    # Upload data to index
    response = await self.async_client.upload_documents(documents=data)
    # Check if all documents were successfully uploaded
    if all(r.succeeded for r in response):
        return ids
    else:
        raise LangChainException(response)
def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> bool:
    """Delete by vector ID.

    Args:
        ids: List of ids to delete.

    Returns:
        bool: True if deletion is successful,
        False otherwise.
    """
    if ids:
        res = self.client.delete_documents([{FIELDS_ID: i} for i in ids])
        return len(res) > 0
    else:
        return False
async def adelete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> bool:
    """Delete by vector ID (async).

    Args:
        ids: List of ids to delete.

    Returns:
        bool: True if deletion is successful,
        False otherwise.
    """
    if ids:
        # Use the configurable FIELDS_ID (was hard-coded "id", which broke
        # deletion whenever AZURESEARCH_FIELDS_ID overrides the key field;
        # the sync `delete` already uses FIELDS_ID).
        res = await self.async_client.delete_documents(
            [{FIELDS_ID: i} for i in ids]
        )
        return len(res) > 0
    else:
        return False
def similarity_search(
    self,
    query: str,
    k: int = 4,
    *,
    search_type: Optional[str] = None,
    **kwargs: Any,
) -> List[Document]:
    """Dispatch to vector / hybrid / semantic-hybrid search by ``search_type``.

    Raises:
        ValueError: for an unrecognized ``search_type``.
    """
    search_type = search_type or self.search_type
    if search_type == "similarity":
        docs = self.vector_search(query, k=k, **kwargs)
    elif search_type == "hybrid":
        docs = self.hybrid_search(query, k=k, **kwargs)
    elif search_type == "semantic_hybrid":
        docs = self.semantic_hybrid_search(query, k=k, **kwargs)
    else:
        raise ValueError(f"search_type of {search_type} not allowed.")
    return docs
def similarity_search_with_score(
    self, query: str, *, k: int = 4, **kwargs: Any
) -> List[Tuple[Document, float]]:
    """Run similarity search with distance."""
    # Extract search_type from kwargs, defaulting to self.search_type.
    # (Lazy lookup: `kwargs.pop("search_type", self.search_type)` would
    # evaluate the default eagerly even when the key is present.)
    if "search_type" in kwargs:
        search_type = kwargs.pop("search_type")
    else:
        search_type = self.search_type
    if search_type == "similarity":
        return self.vector_search_with_score(query, k=k, **kwargs)
    elif search_type == "hybrid":
        return self.hybrid_search_with_score(query, k=k, **kwargs)
    elif search_type == "semantic_hybrid":
        return self.semantic_hybrid_search_with_score(query, k=k, **kwargs)
    else:
        raise ValueError(f"search_type of {search_type} not allowed.")
async def asimilarity_search(
    self,
    query: str,
    k: int = 4,
    *,
    search_type: Optional[str] = None,
    **kwargs: Any,
) -> List[Document]:
    """Dispatch to async vector / hybrid / semantic-hybrid search.

    Raises:
        ValueError: for an unrecognized ``search_type``.
    """
    search_type = search_type or self.search_type
    if search_type == "similarity":
        docs = await self.avector_search(query, k=k, **kwargs)
    elif search_type == "hybrid":
        docs = await self.ahybrid_search(query, k=k, **kwargs)
    elif search_type == "semantic_hybrid":
        docs = await self.asemantic_hybrid_search(query, k=k, **kwargs)
    else:
        raise ValueError(f"search_type of {search_type} not allowed.")
    return docs
async def asimilarity_search_with_score(
    self, query: str, *, k: int = 4, **kwargs: Any
) -> List[Tuple[Document, float]]:
    """Run similarity search with distance."""
    # BUGFIX: the original used kwargs.get(), which left "search_type" in
    # kwargs; it was then forwarded through **kwargs into the Azure
    # SearchClient.search() call, raising a TypeError. Pop it like the sync
    # similarity_search_with_score does.
    if "search_type" in kwargs:
        search_type = kwargs.pop("search_type")
    else:
        search_type = self.search_type
    if search_type == "similarity":
        return await self.avector_search_with_score(query, k=k, **kwargs)
    elif search_type == "hybrid":
        return await self.ahybrid_search_with_score(query, k=k, **kwargs)
    elif search_type == "semantic_hybrid":
        return await self.asemantic_hybrid_search_with_score(query, k=k, **kwargs)
    else:
        raise ValueError(f"search_type of {search_type} not allowed.")
def vector_search(
    self, query: str, k: int = 4, *, filters: Optional[str] = None, **kwargs: Any
) -> List[Document]:
    """
    Returns the most similar indexed documents to the query text.

    Args:
        query (str): The query text for which to find similar documents.
        k (int): The number of documents to return. Default is 4.

    Returns:
        List[Document]: A list of documents that are most similar to the query text.
    """
    # Forward **kwargs (the original accepted and silently dropped them).
    docs_and_scores = self.vector_search_with_score(
        query, k=k, filters=filters, **kwargs
    )
    return [doc for doc, _ in docs_and_scores]
async def avector_search(
    self, query: str, k: int = 4, *, filters: Optional[str] = None, **kwargs: Any
) -> List[Document]:
    """
    Returns the most similar indexed documents to the query text.

    Args:
        query (str): The query text for which to find similar documents.
        k (int): The number of documents to return. Default is 4.

    Returns:
        List[Document]: A list of documents that are most similar to the query text.
    """
    # Forward **kwargs (the original accepted and silently dropped them).
    docs_and_scores = await self.avector_search_with_score(
        query, k=k, filters=filters, **kwargs
    )
    return [doc for doc, _ in docs_and_scores]
def vector_search_with_score(
    self,
    query: str,
    k: int = 4,
    filters: Optional[str] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return docs most similar to query.

    Args:
        query (str): Text to look up documents similar to.
        k (int, optional): Number of Documents to return. Defaults to 4.
        filters (str, optional): Filtering expression. Defaults to None.

    Returns:
        List[Tuple[Document, float]]: List of Documents most similar
            to the query and score for each
    """
    embedding = self.embed_query(query)
    # Empty text query => pure vector search.
    results = self._simple_search(embedding, "", k, filters=filters, **kwargs)

    return _results_to_documents(results)
async def avector_search_with_score(
    self,
    query: str,
    k: int = 4,
    filters: Optional[str] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return docs most similar to query.

    Args:
        query (str): Text to look up documents similar to.
        k (int, optional): Number of Documents to return. Defaults to 4.
        filters (str, optional): Filtering expression. Defaults to None.

    Returns:
        List[Tuple[Document, float]]: List of Documents most similar
            to the query and score for each
    """
    embedding = await self._aembed_query(query)
    # Empty text query => pure vector search.
    results = await self._asimple_search(embedding, "", k, filters=filters, **kwargs)

    return await _aresults_to_documents(results)
def max_marginal_relevance_search_with_score(
    self,
    query: str,
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filters: Optional[str] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Perform a search and return results that are reordered by MMR.

    Args:
        query (str): Text to look up documents similar to.
        k (int, optional): How many results to give. Defaults to 4.
        fetch_k (int, optional): Total results to select k from.
            Defaults to 20.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results with 0 corresponding
            to maximum diversity and 1 to minimum diversity.
            Defaults to 0.5
        filters (str, optional): Filtering expression. Defaults to None.

    Returns:
        List[Tuple[Document, float]]: List of Documents most similar
            to the query and score for each
    """
    embedding = self.embed_query(query)
    # Over-fetch fetch_k candidates, then re-rank down to k with MMR.
    results = self._simple_search(embedding, "", fetch_k, filters=filters, **kwargs)

    return _reorder_results_with_maximal_marginal_relevance(
        results, query_embedding=np.array(embedding), lambda_mult=lambda_mult, k=k
    )
async def amax_marginal_relevance_search_with_score(
    self,
    query: str,
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filters: Optional[str] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Perform a search and return results that are reordered by MMR.

    Args:
        query (str): Text to look up documents similar to.
        k (int, optional): How many results to give. Defaults to 4.
        fetch_k (int, optional): Total results to select k from.
            Defaults to 20.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results with 0 corresponding
            to maximum diversity and 1 to minimum diversity.
            Defaults to 0.5
        filters (str, optional): Filtering expression. Defaults to None.

    Returns:
        List[Tuple[Document, float]]: List of Documents most similar
            to the query and score for each
    """
    embedding = await self._aembed_query(query)
    # Over-fetch fetch_k candidates, then re-rank down to k with MMR.
    results = await self._asimple_search(
        embedding, "", fetch_k, filters=filters, **kwargs
    )

    return await _areorder_results_with_maximal_marginal_relevance(
        results,
        query_embedding=np.array(embedding),
        lambda_mult=lambda_mult,
        k=k,
    )
def hybrid_search(self, query: str, k: int = 4, **kwargs: Any) -> List[Document]:
    """
    Returns the most similar indexed documents to the query text.

    Args:
        query (str): The query text for which to find similar documents.
        k (int): The number of documents to return. Default is 4.

    Returns:
        List[Document]: A list of documents that are most similar to the query text.
    """
    docs_and_scores = self.hybrid_search_with_score(query, k=k, **kwargs)
    return [doc for doc, _ in docs_and_scores]
async def ahybrid_search(self, query: str, k: int = 4, **kwargs: Any) -> List[Document]:
    """
    Returns the most similar indexed documents to the query text.

    Args:
        query (str): The query text for which to find similar documents.
        k (int): The number of documents to return. Default is 4.

    Returns:
        List[Document]: A list of documents that are most similar to the query text.
    """
    docs_and_scores = await self.ahybrid_search_with_score(query, k=k, **kwargs)
    return [doc for doc, _ in docs_and_scores]
def hybrid_search_with_score(
    self,
    query: str,
    k: int = 4,
    filters: Optional[str] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return docs most similar to query with a hybrid query.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.

    Returns:
        List of Documents most similar to the query and score for each
    """
    embedding = self.embed_query(query)
    # Passing the text query alongside the vector makes this a hybrid search.
    results = self._simple_search(embedding, query, k, filters=filters, **kwargs)

    return _results_to_documents(results)
async def ahybrid_search_with_score(
    self,
    query: str,
    k: int = 4,
    filters: Optional[str] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return docs most similar to query with a hybrid query.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.

    Returns:
        List of Documents most similar to the query and score for each
    """
    embedding = await self._aembed_query(query)
    # Passing the text query alongside the vector makes this a hybrid search.
    results = await self._asimple_search(
        embedding, query, k, filters=filters, **kwargs
    )

    return await _aresults_to_documents(results)
def hybrid_max_marginal_relevance_search_with_score(
    self,
    query: str,
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filters: Optional[str] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return docs most similar to query with a hybrid query
    and reorder results by MMR.

    Args:
        query (str): Text to look up documents similar to.
        k (int, optional): Number of Documents to return. Defaults to 4.
        fetch_k (int, optional): Total results to select k from.
            Defaults to 20.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results with 0 corresponding
            to maximum diversity and 1 to minimum diversity.
            Defaults to 0.5
        filters (str, optional): Filtering expression. Defaults to None.

    Returns:
        List of Documents most similar to the query and score for each
    """
    embedding = self.embed_query(query)
    # Over-fetch fetch_k hybrid hits, then re-rank down to k with MMR.
    results = self._simple_search(
        embedding, query, fetch_k, filters=filters, **kwargs
    )

    return _reorder_results_with_maximal_marginal_relevance(
        results, query_embedding=np.array(embedding), lambda_mult=lambda_mult, k=k
    )
async def ahybrid_max_marginal_relevance_search_with_score(
    self,
    query: str,
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filters: Optional[str] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return docs most similar to query with a hybrid query
    and reorder results by MMR.

    Args:
        query (str): Text to look up documents similar to.
        k (int, optional): Number of Documents to return. Defaults to 4.
        fetch_k (int, optional): Total results to select k from.
            Defaults to 20.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results with 0 corresponding
            to maximum diversity and 1 to minimum diversity.
            Defaults to 0.5
        filters (str, optional): Filtering expression. Defaults to None.

    Returns:
        List of Documents most similar to the query and score for each
    """
    embedding = await self._aembed_query(query)
    # Over-fetch fetch_k hybrid hits, then re-rank down to k with MMR.
    results = await self._asimple_search(
        embedding, query, fetch_k, filters=filters, **kwargs
    )

    return await _areorder_results_with_maximal_marginal_relevance(
        results,
        query_embedding=np.array(embedding),
        lambda_mult=lambda_mult,
        k=k,
    )
def _simple_search(
    self,
    embedding: List[float],
    text_query: str,
    k: int,
    *,
    filters: Optional[str] = None,
    **kwargs: Any,
) -> SearchItemPaged[dict]:
    """Perform vector or hybrid search in the Azure search index.

    Args:
        embedding: A vector embedding to search in the vector space.
        text_query: A full-text search query expression;
            Use "*" or omit this parameter to perform only vector search.
        k: Number of documents to return.
        filters: Filtering expression.

    Returns:
        Search items
    """
    from azure.search.documents.models import VectorizedQuery

    return self.client.search(
        search_text=text_query,
        vector_queries=[
            VectorizedQuery(
                vector=np.array(embedding, dtype=np.float32).tolist(),
                k_nearest_neighbors=k,
                fields=FIELDS_CONTENT_VECTOR,
            )
        ],
        filter=filters,
        top=k,
        **kwargs,
    )

async def _asimple_search(
    self,
    embedding: List[float],
    text_query: str,
    k: int,
    *,
    filters: Optional[str] = None,
    **kwargs: Any,
) -> AsyncSearchItemPaged[dict]:
    """Perform vector or hybrid search in the Azure search index (async).

    Args:
        embedding: A vector embedding to search in the vector space.
        text_query: A full-text search query expression;
            Use "*" or omit this parameter to perform only vector search.
        k: Number of documents to return.
        filters: Filtering expression.

    Returns:
        Search items
    """
    from azure.search.documents.models import VectorizedQuery

    return await self.async_client.search(
        search_text=text_query,
        vector_queries=[
            VectorizedQuery(
                vector=np.array(embedding, dtype=np.float32).tolist(),
                k_nearest_neighbors=k,
                fields=FIELDS_CONTENT_VECTOR,
            )
        ],
        filter=filters,
        top=k,
        **kwargs,
    )
def semantic_hybrid_search(self, query: str, k: int = 4, **kwargs: Any) -> List[Document]:
    """
    Returns the most similar indexed documents to the query text.

    Args:
        query (str): The query text for which to find similar documents.
        k (int): The number of documents to return. Default is 4.
        filters: Filtering expression.

    Returns:
        List[Document]: A list of documents that are most similar to the query text.
    """
    docs_and_scores = self.semantic_hybrid_search_with_score_and_rerank(
        query, k=k, **kwargs
    )
    return [doc for doc, _, _ in docs_and_scores]
async def asemantic_hybrid_search(
    self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
    """
    Returns the most similar indexed documents to the query text.

    Args:
        query (str): The query text for which to find similar documents.
        k (int): The number of documents to return. Default is 4.
        filters: Filtering expression.

    Returns:
        List[Document]: A list of documents that are most similar to the query text.
    """
    docs_and_scores = await self.asemantic_hybrid_search_with_score_and_rerank(
        query, k=k, **kwargs
    )
    return [doc for doc, _, _ in docs_and_scores]
def semantic_hybrid_search_with_score(
    self,
    query: str,
    k: int = 4,
    score_type: Literal["score", "reranker_score"] = "score",
    *,
    score_threshold: Optional[float] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """
    Returns the most similar indexed documents to the query text.

    Args:
        query (str): The query text for which to find similar documents.
        k (int): The number of documents to return. Default is 4.
        score_type: Must either be "score" or "reranker_score".
            Defaulted to "score".
        filters: Filtering expression.

    Returns:
        List[Tuple[Document, float]]: A list of documents and their
            corresponding scores.

    Raises:
        ValueError: if ``score_type`` is not "score" or "reranker_score".
    """
    # Validate before issuing the (expensive) search request; the original
    # silently returned None for an unknown score_type.
    if score_type not in ("score", "reranker_score"):
        raise ValueError(f"score_type of {score_type} not allowed.")
    docs_and_scores = self.semantic_hybrid_search_with_score_and_rerank(
        query, k=k, **kwargs
    )
    if score_type == "score":
        return [
            (doc, score)
            for doc, score, _ in docs_and_scores
            if score_threshold is None or score >= score_threshold
        ]
    return [
        (doc, reranker_score)
        for doc, _, reranker_score in docs_and_scores
        if score_threshold is None or reranker_score >= score_threshold
    ]
async def asemantic_hybrid_search_with_score(
    self,
    query: str,
    k: int = 4,
    score_type: Literal["score", "reranker_score"] = "score",
    *,
    score_threshold: Optional[float] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """
    Returns the most similar indexed documents to the query text.

    Args:
        query (str): The query text for which to find similar documents.
        k (int): The number of documents to return. Default is 4.
        score_type: Must either be "score" or "reranker_score".
            Defaulted to "score".
        filters: Filtering expression.

    Returns:
        List[Tuple[Document, float]]: A list of documents and their
            corresponding scores.

    Raises:
        ValueError: if ``score_type`` is not "score" or "reranker_score".
    """
    # Validate before issuing the (expensive) search request; the original
    # silently returned None for an unknown score_type.
    if score_type not in ("score", "reranker_score"):
        raise ValueError(f"score_type of {score_type} not allowed.")
    docs_and_scores = await self.asemantic_hybrid_search_with_score_and_rerank(
        query, k=k, **kwargs
    )
    if score_type == "score":
        return [
            (doc, score)
            for doc, score, _ in docs_and_scores
            if score_threshold is None or score >= score_threshold
        ]
    return [
        (doc, reranker_score)
        for doc, _, reranker_score in docs_and_scores
        if score_threshold is None or reranker_score >= score_threshold
    ]
def semantic_hybrid_search_with_score_and_rerank(
    self, query: str, k: int = 4, *, filters: Optional[str] = None, **kwargs: Any
) -> List[Tuple[Document, float, float]]:
    """Return docs most similar to query with a hybrid query.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filters: Filtering expression.
        **kwargs: Passed through to ``SearchClient.search``.

    Returns:
        List of (Document, score, reranker_score) tuples, most similar first.
    """
    from azure.search.documents.models import VectorizedQuery

    results = self.client.search(
        search_text=query,
        vector_queries=[
            VectorizedQuery(
                vector=np.array(self.embed_query(query), dtype=np.float32).tolist(),
                k_nearest_neighbors=k,
                fields=FIELDS_CONTENT_VECTOR,
            )
        ],
        filter=filters,
        query_type="semantic",
        semantic_configuration_name=self.semantic_configuration_name,
        query_caption="extractive",
        query_answer="extractive",
        top=k,
        **kwargs,
    )
    # Map document key -> semantic answer so it can be attached to metadata.
    semantic_answers = results.get_answers() or []
    semantic_answers_dict: Dict = {}
    for semantic_answer in semantic_answers:
        semantic_answers_dict[semantic_answer.key] = {
            "text": semantic_answer.text,
            "highlights": semantic_answer.highlights,
        }
    # Convert results to Document objects.
    docs: List[Tuple[Document, float, float]] = []
    for result in results:
        page_content = result.pop(FIELDS_CONTENT)
        # BUG FIX: capture the document id *before* it is removed from
        # ``result``. The previous implementation popped the id into the
        # metadata first and then re-read it with ``result.get(FIELDS_ID, "")``
        # for the answers lookup, which always yielded "" — so semantic
        # answers were never attached.
        has_id = FIELDS_ID in result
        doc_id = result.pop(FIELDS_ID) if has_id else ""
        if FIELDS_METADATA in result:
            base_metadata = json.loads(result[FIELDS_METADATA])
        else:
            # No metadata field: treat all remaining fields (except the
            # embedding vector) as metadata.
            base_metadata = {
                key: value
                for key, value in result.items()
                if key != FIELDS_CONTENT_VECTOR
            }
        captions = result.get("@search.captions")
        metadata = {
            **({FIELDS_ID: doc_id} if has_id else {}),
            **base_metadata,
            "captions": (
                {"text": captions[0].text, "highlights": captions[0].highlights}
                if captions
                else {}
            ),
            "answers": semantic_answers_dict.get(doc_id, ""),
        }
        docs.append(
            (
                Document(page_content=page_content, metadata=metadata),
                float(result["@search.score"]),
                float(result["@search.reranker_score"]),
            )
        )
    return docs
async def asemantic_hybrid_search_with_score_and_rerank(
    self, query: str, k: int = 4, *, filters: Optional[str] = None, **kwargs: Any
) -> List[Tuple[Document, float, float]]:
    """Asynchronously return docs most similar to query with a hybrid query.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filters: Filtering expression.
        **kwargs: Passed through to the async ``SearchClient.search``.

    Returns:
        List of (Document, score, reranker_score) tuples, most similar first.
    """
    from azure.search.documents.models import VectorizedQuery

    vector = await self._aembed_query(query)
    results = await self.async_client.search(
        search_text=query,
        vector_queries=[
            VectorizedQuery(
                vector=np.array(vector, dtype=np.float32).tolist(),
                k_nearest_neighbors=k,
                fields=FIELDS_CONTENT_VECTOR,
            )
        ],
        filter=filters,
        query_type="semantic",
        semantic_configuration_name=self.semantic_configuration_name,
        query_caption="extractive",
        query_answer="extractive",
        top=k,
        **kwargs,
    )
    # Map document key -> semantic answer so it can be attached to metadata.
    semantic_answers = (await results.get_answers()) or []
    semantic_answers_dict: Dict = {}
    for semantic_answer in semantic_answers:
        semantic_answers_dict[semantic_answer.key] = {
            "text": semantic_answer.text,
            "highlights": semantic_answer.highlights,
        }
    # Convert results to Document objects.
    docs: List[Tuple[Document, float, float]] = []
    async for result in results:
        page_content = result.pop(FIELDS_CONTENT)
        # BUG FIX: capture the document id *before* it is removed from
        # ``result``. The previous implementation popped the id into the
        # metadata first and then re-read it with ``result.get(FIELDS_ID, "")``
        # for the answers lookup, which always yielded "" — so semantic
        # answers were never attached.
        has_id = FIELDS_ID in result
        doc_id = result.pop(FIELDS_ID) if has_id else ""
        if FIELDS_METADATA in result:
            base_metadata = json.loads(result[FIELDS_METADATA])
        else:
            # No metadata field: treat all remaining fields (except the
            # embedding vector) as metadata.
            base_metadata = {
                key: value
                for key, value in result.items()
                if key != FIELDS_CONTENT_VECTOR
            }
        captions = result.get("@search.captions")
        metadata = {
            **({FIELDS_ID: doc_id} if has_id else {}),
            **base_metadata,
            "captions": (
                {"text": captions[0].text, "highlights": captions[0].highlights}
                if captions
                else {}
            ),
            "answers": semantic_answers_dict.get(doc_id, ""),
        }
        docs.append(
            (
                Document(page_content=page_content, metadata=metadata),
                float(result["@search.score"]),
                float(result["@search.reranker_score"]),
            )
        )
    return docs
@classmethod
def from_texts(
    cls: Type[AzureSearch],
    texts: List[str],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    azure_search_endpoint: str = "",
    azure_search_key: str = "",
    azure_ad_access_token: Optional[str] = None,
    index_name: str = "langchain-index",
    fields: Optional[List[SearchField]] = None,
    **kwargs: Any,
) -> AzureSearch:
    """Create an ``AzureSearch`` store and index the given texts.

    Args:
        texts: Texts to embed and index.
        embedding: Embedding function to use.
        metadatas: Optional per-text metadata dicts.
        azure_search_endpoint: Endpoint of the Azure Search service.
        azure_search_key: API key for the service.
        azure_ad_access_token: Optional Azure AD access token.
        index_name: Name of the index to create/use.
        fields: Optional custom index fields.
        **kwargs: Forwarded to the constructor and to ``add_texts``.

    Returns:
        The populated ``AzureSearch`` instance.
    """
    # Build the vector store, then upload the texts into its index.
    store = cls(
        azure_search_endpoint,
        azure_search_key,
        index_name,
        embedding,
        fields=fields,
        azure_ad_access_token=azure_ad_access_token,
        **kwargs,
    )
    store.add_texts(texts, metadatas, **kwargs)
    return store
@classmethod
async def afrom_texts(
    cls: Type[AzureSearch],
    texts: List[str],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    azure_search_endpoint: str = "",
    azure_search_key: str = "",
    azure_ad_access_token: Optional[str] = None,
    index_name: str = "langchain-index",
    fields: Optional[List[SearchField]] = None,
    **kwargs: Any,
) -> AzureSearch:
    """Asynchronously create an ``AzureSearch`` store and index the texts.

    Args:
        texts: Texts to embed and index.
        embedding: Embedding function to use.
        metadatas: Optional per-text metadata dicts.
        azure_search_endpoint: Endpoint of the Azure Search service.
        azure_search_key: API key for the service.
        azure_ad_access_token: Optional Azure AD access token.
        index_name: Name of the index to create/use.
        fields: Optional custom index fields.
        **kwargs: Forwarded to the constructor and to ``aadd_texts``.

    Returns:
        The populated ``AzureSearch`` instance.
    """
    # Build the vector store, then upload the texts asynchronously.
    store = cls(
        azure_search_endpoint,
        azure_search_key,
        index_name,
        embedding,
        fields=fields,
        azure_ad_access_token=azure_ad_access_token,
        **kwargs,
    )
    await store.aadd_texts(texts, metadatas, **kwargs)
    return store
@classmethod
async def afrom_embeddings(
    cls: Type[AzureSearch],
    text_embeddings: Iterable[Tuple[str, List[float]]],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    *,
    azure_search_endpoint: str = "",
    azure_search_key: str = "",
    index_name: str = "langchain-index",
    fields: Optional[List[SearchField]] = None,
    **kwargs: Any,
) -> AzureSearch:
    """Asynchronously create an ``AzureSearch`` store from precomputed embeddings.

    Args:
        text_embeddings: Iterable of (text, embedding-vector) pairs.
        embedding: Embedding function used for future queries.
        metadatas: Optional per-text metadata dicts.
        azure_search_endpoint: Endpoint of the Azure Search service.
        azure_search_key: API key for the service.
        index_name: Name of the index to create/use.
        fields: Optional custom index fields.
        **kwargs: Forwarded to the constructor and to ``aadd_embeddings``.

    Returns:
        The populated ``AzureSearch`` instance.

    Raises:
        ValueError: If ``text_embeddings`` is empty.
    """
    # Peek at the first pair without consuming the iterable so the vector
    # dimensionality can be read from it.
    text_embeddings, sample = _peek(text_embeddings)
    if sample is None:
        raise ValueError("Cannot create AzureSearch from empty embeddings.")
    dimensions = len(sample[1])
    store = cls(
        azure_search_endpoint=azure_search_endpoint,
        azure_search_key=azure_search_key,
        index_name=index_name,
        embedding_function=embedding,
        fields=fields,
        vector_search_dimensions=dimensions,
        **kwargs,
    )
    await store.aadd_embeddings(text_embeddings, metadatas, **kwargs)
    return store
@classmethod
def from_embeddings(
    cls: Type[AzureSearch],
    text_embeddings: Iterable[Tuple[str, List[float]]],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    *,
    azure_search_endpoint: str = "",
    azure_search_key: str = "",
    index_name: str = "langchain-index",
    fields: Optional[List[SearchField]] = None,
    **kwargs: Any,
) -> AzureSearch:
    """Create an ``AzureSearch`` store from precomputed embeddings.

    Args:
        text_embeddings: Iterable of (text, embedding-vector) pairs.
        embedding: Embedding function used for future queries.
        metadatas: Optional per-text metadata dicts.
        azure_search_endpoint: Endpoint of the Azure Search service.
        azure_search_key: API key for the service.
        index_name: Name of the index to create/use.
        fields: Optional custom index fields.
        **kwargs: Forwarded to the constructor and to ``add_embeddings``.

    Returns:
        The populated ``AzureSearch`` instance.

    Raises:
        ValueError: If ``text_embeddings`` is empty.
    """
    # Peek at the first pair without consuming the iterable so the vector
    # dimensionality can be read from it.
    text_embeddings, sample = _peek(text_embeddings)
    if sample is None:
        raise ValueError("Cannot create AzureSearch from empty embeddings.")
    dimensions = len(sample[1])
    store = cls(
        azure_search_endpoint=azure_search_endpoint,
        azure_search_key=azure_search_key,
        index_name=index_name,
        embedding_function=embedding,
        fields=fields,
        vector_search_dimensions=dimensions,
        **kwargs,
    )
    store.add_embeddings(text_embeddings, metadatas, **kwargs)
    return store
def as_retriever(self, **kwargs: Any) -> AzureSearchVectorStoreRetriever:  # type: ignore
    """Return AzureSearchVectorStoreRetriever initialized from this VectorStore.

    Args:
        search_type (Optional[str]): Overrides the type of search that
            the Retriever should perform. Defaults to `self.search_type`.
            Can be "similarity", "hybrid", or "semantic_hybrid".
        search_kwargs (Optional[Dict]): Keyword arguments to pass to the
            search function. Can include things like:
                score_threshold: Minimum relevance threshold
                    for similarity_score_threshold
                fetch_k: Amount of documents to pass to MMR algorithm
                    (Default: 20)
                lambda_mult: Diversity of results returned by MMR;
                    1 for minimum diversity and 0 for maximum. (Default: 0.5)
                filter: Filter by document metadata

    Returns:
        AzureSearchVectorStoreRetriever: Retriever class for VectorStore.
    """
    # Fall back to the store's configured search type when none was given.
    kwargs.setdefault("search_type", self.search_type)
    tags = kwargs.pop("tags", None)
    if not tags:
        tags = []
    tags.extend(self._get_retriever_tags())
    return AzureSearchVectorStoreRetriever(vectorstore=self, **kwargs, tags=tags)
class AzureSearchVectorStoreRetriever(BaseRetriever):
    """Retriever that uses `Azure Cognitive Search`."""

    vectorstore: AzureSearch
    """Azure Search instance used to find similar documents."""
    search_type: str = "hybrid"
    """Type of search to perform. Options are "similarity", "hybrid",
    "semantic_hybrid", "similarity_score_threshold",
    "hybrid_score_threshold", or "semantic_hybrid_score_threshold"."""
    k: int = 4
    """Number of documents to return."""
    search_kwargs: dict = {}
    """Search params.
        score_threshold: Minimum relevance threshold
            for similarity_score_threshold
        fetch_k: Amount of documents to pass to MMR algorithm (Default: 20)
        lambda_mult: Diversity of results returned by MMR;
            1 for minimum diversity and 0 for maximum. (Default: 0.5)
        filter: Filter by document metadata
    """

    # Closed set of valid ``search_type`` values, enforced by the
    # ``validate_search_type`` validator below.
    allowed_search_types: ClassVar[Collection[str]] = (
        "similarity",
        "similarity_score_threshold",
        "hybrid",
        "hybrid_score_threshold",
        "semantic_hybrid",
        "semantic_hybrid_score_threshold",
    )

    # ``vectorstore`` is not a pydantic model, so arbitrary types must be allowed.
    model_config = ConfigDict(
        arbitrary_types_allowed=True,
    )

    @model_validator(mode="before")
    @classmethod
    def validate_search_type(cls, values: Dict) -> Any:
        """Validate search type.

        Rejects any ``search_type`` not listed in ``allowed_search_types``
        before the model is constructed.
        """
        if "search_type" in values:
            search_type = values["search_type"]
            if search_type not in cls.allowed_search_types:
                raise ValueError(
                    f"search_type of {search_type} not allowed. Valid values are: "
                    f"{cls.allowed_search_types}"
                )
        return values

    def _get_relevant_documents(
        self,
        query: str,
        run_manager: CallbackManagerForRetrieverRun,
        **kwargs: Any,
    ) -> List[Document]:
        """Dispatch to the vector store's search method for ``self.search_type``.

        Call-time ``kwargs`` override the configured ``search_kwargs``.
        The "*_score_threshold" variants drop the scores and return only
        the documents. ``run_manager`` is accepted but unused here —
        presumably part of the BaseRetriever callback contract; confirm
        against langchain_core.
        """
        params = {**self.search_kwargs, **kwargs}
        if self.search_type == "similarity":
            docs = self.vectorstore.vector_search(query, k=self.k, **params)
        elif self.search_type == "similarity_score_threshold":
            docs = [
                doc
                for doc, _ in self.vectorstore.similarity_search_with_relevance_scores(
                    query, k=self.k, **params
                )
            ]
        elif self.search_type == "hybrid":
            docs = self.vectorstore.hybrid_search(query, k=self.k, **params)
        elif self.search_type == "hybrid_score_threshold":
            docs = [
                doc
                for doc, _ in self.vectorstore.hybrid_search_with_relevance_scores(
                    query, k=self.k, **params
                )
            ]
        elif self.search_type == "semantic_hybrid":
            docs = self.vectorstore.semantic_hybrid_search(query, k=self.k, **params)
        elif self.search_type == "semantic_hybrid_score_threshold":
            docs = [
                doc
                for doc, _ in self.vectorstore.semantic_hybrid_search_with_score(
                    query, k=self.k, **params
                )
            ]
        else:
            # Unreachable when the model was built normally (validator above),
            # but guards against direct attribute mutation.
            raise ValueError(f"search_type of {self.search_type} not allowed.")
        return docs

    async def _aget_relevant_documents(
        self,
        query: str,
        *,
        run_manager: AsyncCallbackManagerForRetrieverRun,
        **kwargs: Any,
    ) -> List[Document]:
        """Async counterpart of ``_get_relevant_documents``.

        Dispatches to the vector store's async search methods; otherwise
        identical semantics to the sync version.
        """
        params = {**self.search_kwargs, **kwargs}
        if self.search_type == "similarity":
            docs = await self.vectorstore.avector_search(query, k=self.k, **params)
        elif self.search_type == "similarity_score_threshold":
            docs_and_scores = (
                await self.vectorstore.asimilarity_search_with_relevance_scores(
                    query, k=self.k, **params
                )
            )
            docs = [doc for doc, _ in docs_and_scores]
        elif self.search_type == "hybrid":
            docs = await self.vectorstore.ahybrid_search(query, k=self.k, **params)
        elif self.search_type == "hybrid_score_threshold":
            docs_and_scores = (
                await self.vectorstore.ahybrid_search_with_relevance_scores(
                    query, k=self.k, **params
                )
            )
            docs = [doc for doc, _ in docs_and_scores]
        elif self.search_type == "semantic_hybrid":
            docs = await self.vectorstore.asemantic_hybrid_search(
                query, k=self.k, **params
            )
        elif self.search_type == "semantic_hybrid_score_threshold":
            docs = [
                doc
                for doc, _ in await self.vectorstore.asemantic_hybrid_search_with_score(
                    query, k=self.k, **params
                )
            ]
        else:
            # Unreachable when the model was built normally (validator above),
            # but guards against direct attribute mutation.
            raise ValueError(f"search_type of {self.search_type} not allowed.")
        return docs
def_results_to_documents(results:SearchItemPaged[Dict],)->List[Tuple[Document,float]]:docs=[(_result_to_document(result),float(result["@search.score"]),)forresultinresults]returndocsasyncdef_aresults_to_documents(results:AsyncSearchItemPaged[Dict],)->List[Tuple[Document,float]]:docs=[(_result_to_document(result),float(result["@search.score"]),)asyncforresultinresults]returndocsasyncdef_areorder_results_with_maximal_marginal_relevance(results:SearchItemPaged[Dict],query_embedding:np.ndarray,lambda_mult:float=0.5,k:int=4,)->List[Tuple[Document,float]]:# Convert results to Document objectsdocs=[(_result_to_document(result),float(result["@search.score"]),result[FIELDS_CONTENT_VECTOR],)asyncforresultinresults]documents,scores,vectors=map(list,zip(*docs))# Get the new order of results.new_ordering=maximal_marginal_relevance(query_embedding,vectors,k=k,lambda_mult=lambda_mult)# Reorder the values and return.ret:List[Tuple[Document,float]]=[]forxinnew_ordering:# Function can return -1 indexifx==-1:breakret.append((documents[x],scores[x]))# type: ignorereturnretdef_reorder_results_with_maximal_marginal_relevance(results:SearchItemPaged[Dict],query_embedding:np.ndarray,lambda_mult:float=0.5,k:int=4,)->List[Tuple[Document,float]]:# Convert results to Document objectsdocs=[(_result_to_document(result),float(result["@search.score"]),result[FIELDS_CONTENT_VECTOR],)forresultinresults]ifnotdocs:return[]documents,scores,vectors=map(list,zip(*docs))# Get the new order of results.new_ordering=maximal_marginal_relevance(query_embedding,vectors,k=k,lambda_mult=lambda_mult)# Reorder the values and return.ret:List[Tuple[Document,float]]=[]forxinnew_ordering:# Function can return -1 indexifx==-1:breakret.append((documents[x],scores[x]))# type: ignorereturnretdef_result_to_document(result:Dict)->Document:# Fields 
metadataifFIELDS_METADATAinresult:ifisinstance(result[FIELDS_METADATA],dict):fields_metadata=result[FIELDS_METADATA]else:fields_metadata=json.loads(result[FIELDS_METADATA])else:fields_metadata={key:valueforkey,valueinresult.items()ifkeynotin[FIELDS_CONTENT_VECTOR,FIELDS_CONTENT]}# IDsifFIELDS_IDinresult:fields_id={FIELDS_ID:result.pop(FIELDS_ID)}else:fields_id={}returnDocument(page_content=result[FIELDS_CONTENT],metadata={**fields_id,**fields_metadata,},)def_peek(iterable:Iterable,default:Optional[Any]=None)->Tuple[Iterable,Any]:try:iterator=iter(iterable)value=next(iterator)iterable=itertools.chain([value],iterator)returniterable,valueexceptStopIteration:returniterable,default