class AzureCosmosDBNoSqlVectorSearch(VectorStore):
    """`Azure Cosmos DB for NoSQL` vector store.

    To use, you should have the ``azure-cosmos`` python package installed.

    You can read more about vector search, full text search
    and hybrid search using AzureCosmosDBNoSQL here:
    https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/vector-search
    https://learn.microsoft.com/en-us/azure/cosmos-db/gen-ai/full-text-search
    https://learn.microsoft.com/en-us/azure/cosmos-db/gen-ai/hybrid-search
    """
def __init__(
    self,
    *,
    cosmos_client: CosmosClient,
    embedding: Embeddings,
    vector_embedding_policy: Dict[str, Any],
    indexing_policy: Dict[str, Any],
    cosmos_container_properties: Dict[str, Any],
    cosmos_database_properties: Dict[str, Any],
    full_text_policy: Optional[Dict[str, Any]] = None,
    database_name: str = "vectorSearchDB",
    container_name: str = "vectorSearchContainer",
    text_key: str = "text",
    embedding_key: str = "embedding",
    metadata_key: str = "metadata",
    create_container: bool = True,
    full_text_search_enabled: bool = False,
):
    """Constructor for AzureCosmosDBNoSqlVectorSearch.

    Args:
        cosmos_client: Client used to connect to azure cosmosdb no sql account.
        database_name: Name of the database to be created.
        container_name: Name of the container to be created.
        embedding: Text embedding model to use.
        vector_embedding_policy: Vector Embedding Policy for the container.
        full_text_policy: Full Text Policy for the container.
        indexing_policy: Indexing Policy for the container.
        cosmos_container_properties: Container Properties for the container.
        cosmos_database_properties: Database Properties for the container.
        text_key: Text key to use for text property which will be
            embedded in the data schema.
        embedding_key: Embedding key to use for vector embedding.
        metadata_key: Metadata key to use for data schema.
        create_container: Set to true if the container does not exist.
        full_text_search_enabled: Set to true if the full text search
            is enabled.

    Raises:
        ValueError: If ``create_container`` is true and a required policy
            entry (``vectorIndexes``, ``vectorEmbeddings``,
            ``partition_key`` and, with full text search enabled,
            ``fullTextIndexes`` / ``fullTextPaths``) is missing, ``None``
            or empty.
    """
    self._cosmos_client = cosmos_client
    self._database_name = database_name
    self._container_name = container_name
    self._embedding = embedding
    self._vector_embedding_policy = vector_embedding_policy
    self._full_text_policy = full_text_policy
    self._indexing_policy = indexing_policy
    self._cosmos_container_properties = cosmos_container_properties
    self._cosmos_database_properties = cosmos_database_properties
    self._text_key = text_key
    self._embedding_key = embedding_key
    self._metadata_key = metadata_key
    self._create_container = create_container
    self._full_text_search_enabled = full_text_search_enabled

    if self._create_container:
        # `.get()` + truthiness: a missing key, an explicit None and an
        # empty collection must all surface as the documented ValueError
        # rather than as a KeyError / TypeError from the lookup itself.
        if not (self._indexing_policy or {}).get("vectorIndexes"):
            raise ValueError(
                "vectorIndexes cannot be null or empty in the indexing_policy."
            )
        if not (self._vector_embedding_policy or {}).get("vectorEmbeddings"):
            raise ValueError(
                "vectorEmbeddings cannot be null "
                "or empty in the vector_embedding_policy."
            )
        if (self._cosmos_container_properties or {}).get("partition_key") is None:
            raise ValueError(
                "partition_key cannot be null or empty for a container."
            )
        if self._full_text_search_enabled:
            if not (self._indexing_policy or {}).get("fullTextIndexes"):
                raise ValueError(
                    "fullTextIndexes cannot be null or empty in the "
                    "indexing_policy if full text search is enabled."
                )
            if not (self._full_text_policy or {}).get("fullTextPaths"):
                raise ValueError(
                    "fullTextPaths cannot be null or empty in the "
                    "full_text_policy if full text search is enabled."
                )

    # The two calls below run regardless of `create_container`: every other
    # method relies on `self._container` being set.

    # Create the database if it already doesn't exist
    database_props = self._cosmos_database_properties
    self._database = self._cosmos_client.create_database_if_not_exists(
        id=self._database_name,
        offer_throughput=database_props.get("offer_throughput"),
        session_token=database_props.get("session_token"),
        initial_headers=database_props.get("initial_headers"),
        etag=database_props.get("etag"),
        match_condition=database_props.get("match_condition"),
    )

    # Create the collection if it already doesn't exist
    container_props = self._cosmos_container_properties
    self._container = self._database.create_container_if_not_exists(
        id=self._container_name,
        partition_key=container_props["partition_key"],
        indexing_policy=self._indexing_policy,
        default_ttl=container_props.get("default_ttl"),
        offer_throughput=container_props.get("offer_throughput"),
        unique_key_policy=container_props.get("unique_key_policy"),
        conflict_resolution_policy=container_props.get(
            "conflict_resolution_policy"
        ),
        analytical_storage_ttl=container_props.get("analytical_storage_ttl"),
        computed_properties=container_props.get("computed_properties"),
        etag=container_props.get("etag"),
        match_condition=container_props.get("match_condition"),
        session_token=container_props.get("session_token"),
        initial_headers=container_props.get("initial_headers"),
        vector_embedding_policy=self._vector_embedding_policy,
        full_text_policy=self._full_text_policy,
    )
def add_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[dict]] = None,
    **kwargs: Any,
) -> List[str]:
    """Run more texts through the embeddings and add to the vectorstore.

    Args:
        texts: Iterable of strings to add to the vectorstore.
        metadatas: Optional list of metadatas associated with the texts.
            When omitted, every text gets an empty metadata dict.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        List of ids from adding the texts into the vectorstore.
    """
    # Materialise first: `texts` may be a one-shot iterator (e.g. a
    # generator). Deriving the default metadatas from it before copying it
    # would exhaust it and leave nothing to insert.
    text_list = list(texts)
    if metadatas is None:
        metadata_list: List[dict] = [{} for _ in text_list]
    else:
        metadata_list = list(metadatas)
    return self._insert_texts(text_list, metadata_list)
def _insert_texts(
    self, texts: List[str], metadatas: List[Dict[str, Any]]
) -> List[str]:
    """Embed the given texts and store them as items in the container.

    Each item is written with a fresh UUID as ``id`` and with the text,
    embedding and metadata stored under the keys configured on the store
    (``text_key``, ``embedding_key``, ``metadata_key``), which are the same
    keys the query side reads back.

    Args:
        texts: The list of documents strings to load
        metadatas: The list of metadata objects associated with each document

    Returns:
        List of ids from adding the texts into the vectorstore.

    Raises:
        ValueError: If ``texts`` is empty.
    """
    if not texts:
        raise ValueError("Texts can not be null or empty")

    # Embed and create the documents
    embeddings = self._embedding.embed_documents(texts)

    # insert the documents in CosmosDB No Sql
    doc_ids: List[str] = []
    for text, metadata, vector in zip(texts, metadatas, embeddings):
        created_doc = self._container.create_item(
            {
                "id": str(uuid.uuid4()),
                self._text_key: text,
                self._embedding_key: vector,
                self._metadata_key: metadata,
            }
        )
        doc_ids.append(created_doc["id"])
    return doc_ids


@classmethod
def _from_kwargs(
    cls,
    embedding: Embeddings,
    *,
    cosmos_client: CosmosClient,
    vector_embedding_policy: Dict[str, Any],
    indexing_policy: Dict[str, Any],
    cosmos_container_properties: Dict[str, Any],
    cosmos_database_properties: Dict[str, Any],
    full_text_policy: Optional[Dict[str, Any]] = None,
    database_name: str = "vectorSearchDB",
    container_name: str = "vectorSearchContainer",
    text_key: str = "text",
    embedding_key: str = "embedding",
    metadata_key: str = "metadata",
    create_container: bool = True,
    full_text_search_enabled: bool = False,
    **kwargs: Any,
) -> AzureCosmosDBNoSqlVectorSearch:
    """Build a store from keyword arguments, warning about unknown ones.

    The named parameters are forwarded unchanged to the constructor. Any
    additional keyword arguments are reported through ``warnings.warn`` and
    otherwise ignored.

    Returns:
        A new instance of ``cls``.
    """
    if kwargs:
        warnings.warn(
            "Method 'from_texts' of AzureCosmosDBNoSql vector "
            "store invoked with "
            f"unsupported arguments "
            f"({', '.join(sorted(kwargs))}), "
            "which will be ignored."
        )

    return cls(
        embedding=embedding,
        cosmos_client=cosmos_client,
        vector_embedding_policy=vector_embedding_policy,
        full_text_policy=full_text_policy,
        indexing_policy=indexing_policy,
        cosmos_container_properties=cosmos_container_properties,
        cosmos_database_properties=cosmos_database_properties,
        database_name=database_name,
        container_name=container_name,
        text_key=text_key,
        embedding_key=embedding_key,
        metadata_key=metadata_key,
        create_container=create_container,
        full_text_search_enabled=full_text_search_enabled,
    )
@classmethod
def from_texts(
    cls,
    texts: List[str],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    **kwargs: Any,
) -> AzureCosmosDBNoSqlVectorSearch:
    """Create an AzureCosmosDBNoSqlVectorSearch vectorstore from raw texts.

    Args:
        texts: the texts to insert.
        embedding: the embedding function to use in the store.
        metadatas: metadata dicts for the texts.
        **kwargs: forwarded to ``_from_kwargs``, which passes the arguments
            it recognises on to the constructor and warns about the rest.

    Returns:
        an `AzureCosmosDBNoSqlVectorSearch` vectorstore.
    """
    # Go through `cls` rather than the hard-coded class name so that calling
    # `from_texts` on a subclass builds an instance of that subclass.
    vectorstore = cls._from_kwargs(embedding, **kwargs)
    vectorstore.add_texts(
        texts=texts,
        metadatas=metadatas,
    )
    return vectorstore
def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
    """Delete the documents with the given ids.

    Args:
        ids: Ids of the documents to remove.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        ``True`` once every id has been handed to ``delete_document_by_id``.

    Raises:
        ValueError: If ``ids`` is ``None``.
    """
    if ids is not None:
        remove_one = self.delete_document_by_id
        for doc_id in ids:
            remove_one(doc_id)
        return True
    raise ValueError("No document ids provided to delete.")
def delete_document_by_id(self, document_id: Optional[str] = None) -> None:
    """Remove a single document from the container by its id.

    The id is also passed as the partition key value of the delete call.
    NOTE(review): this presumably expects the container to be partitioned
    on the document id -- confirm against the container configuration.

    Args:
        document_id: The document identifier

    Raises:
        ValueError: If ``document_id`` is ``None``.
    """
    if document_id is not None:
        self._container.delete_item(document_id, partition_key=document_id)
        return
    raise ValueError("No document ids provided to delete.")
def similarity_search(
    self,
    query: str,
    k: int = 4,
    pre_filter: Optional[PreFilter] = None,
    with_embedding: bool = False,
    query_type: CosmosDBQueryType = CosmosDBQueryType.VECTOR,
    offset_limit: Optional[str] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return the documents matching ``query``, without their scores.

    Validates ``query_type`` and then delegates to
    ``similarity_search_with_score``, dropping the score of each result.

    Args:
        query: Text to search for.
        k: Number of documents requested.
        pre_filter: Optional filter forwarded to the scored search.
        with_embedding: Forwarded to the scored search.
        query_type: Kind of query to run; must be a ``CosmosDBQueryType``
            member.
        offset_limit: Optional clause forwarded to the scored search.
        **kwargs: Extra arguments, forwarded as a single ``kwargs`` keyword.

    Returns:
        The matching documents.

    Raises:
        ValueError: If ``query_type`` is not a ``CosmosDBQueryType`` member.
    """
    known_types = CosmosDBQueryType.__members__.values()
    if query_type not in known_types:
        raise ValueError(
            f"Invalid query_type: {query_type}. "
            f"Expected one of: {', '.join(t.value for t in CosmosDBQueryType)}."
        )

    # NOTE(review): the extras are passed as one keyword literally named
    # `kwargs` rather than being unpacked -- confirm that this is what
    # `similarity_search_with_score` expects.
    scored_docs = self.similarity_search_with_score(
        query,
        k=k,
        pre_filter=pre_filter,
        with_embedding=with_embedding,
        query_type=query_type,
        offset_limit=offset_limit,
        kwargs=kwargs,
    )
    return [document for document, _score in scored_docs]
def max_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    query_type: CosmosDBQueryType = CosmosDBQueryType.VECTOR,
    pre_filter: Optional[PreFilter] = None,
    with_embedding: bool = False,
    **kwargs: Any,
) -> List[Document]:
    """Return documents selected by maximal marginal relevance.

    Fetches ``fetch_k`` candidates for ``embedding`` and re-ranks them with
    ``maximal_marginal_relevance``, keeping ``k`` of them.

    Args:
        embedding: Query vector.
        k: Number of documents to return.
        fetch_k: Number of candidates fetched before re-ranking.
        lambda_mult: Forwarded to ``maximal_marginal_relevance``.
        query_type: Kind of query used to fetch the candidates.
        pre_filter: Optional filter applied when fetching the candidates.
        with_embedding: Whether the returned documents keep their embedding
            in ``metadata`` (under the store's embedding key).
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        The re-ranked documents.
    """
    # The re-ranking step reads each candidate's vector from its metadata,
    # so the candidates are always fetched with their embeddings -- even
    # when the caller did not ask for them. Fetching with the caller's
    # `with_embedding=False` would make the metadata lookup below fail.
    candidates = self._similarity_search_with_score(
        embeddings=embedding,
        k=fetch_k,
        query_type=query_type,
        pre_filter=pre_filter,
        with_embedding=True,
    )
    if not candidates:
        return []

    # Re-ranks the docs using MMR
    mmr_doc_indexes = maximal_marginal_relevance(
        np.array(embedding),
        [doc.metadata[self._embedding_key] for doc, _ in candidates],
        k=k,
        lambda_mult=lambda_mult,
    )
    mmr_docs = [candidates[i][0] for i in mmr_doc_indexes]

    if not with_embedding:
        # Honour the caller's choice: the vectors were only needed for the
        # re-ranking above.
        for doc in mmr_docs:
            doc.metadata.pop(self._embedding_key, None)
    return mmr_docs
def max_marginal_relevance_search(
    self,
    query: str,
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    query_type: CosmosDBQueryType = CosmosDBQueryType.VECTOR,
    pre_filter: Optional[PreFilter] = None,
    with_embedding: bool = False,
    **kwargs: Any,
) -> List[Document]:
    """Return documents selected by maximal marginal relevance for a text.

    Embeds ``query`` with the store's embedding model and delegates to
    ``max_marginal_relevance_search_by_vector`` with the remaining named
    arguments.

    Args:
        query: Text to search for.
        k: Number of documents to return.
        fetch_k: Number of candidates fetched before re-ranking.
        lambda_mult: Forwarded to the by-vector search.
        query_type: Kind of query used to fetch the candidates.
        pre_filter: Optional filter applied when fetching the candidates.
        with_embedding: Forwarded to the by-vector search.
        **kwargs: Accepted for interface compatibility; not forwarded.

    Returns:
        The re-ranked documents.
    """
    # compute the embeddings vector from the query string
    query_vector = self._embedding.embed_query(query)
    return self.max_marginal_relevance_search_by_vector(
        query_vector,
        k=k,
        fetch_k=fetch_k,
        lambda_mult=lambda_mult,
        pre_filter=pre_filter,
        query_type=query_type,
        with_embedding=with_embedding,
    )
def _construct_query(
    self,
    k: int,
    query_type: CosmosDBQueryType,
    embeddings: Optional[List[float]] = None,
    search_text: Optional[str] = None,
    pre_filter: Optional[PreFilter] = None,
    offset_limit: Optional[str] = None,
    projection_mapping: Optional[Dict[str, Any]] = None,
) -> Tuple[str, List[Dict[str, Any]]]:
    """Build the query text and its parameter list for ``query_type``.

    FULL_TEXT_RANK and HYBRID queries are built with inlined literals and an
    empty parameter list; FULL_TEXT_SEARCH and VECTOR queries use the
    ``@``-parameters produced by ``_build_parameters``.

    Args:
        k: Number of results (``TOP``), used when no ``offset_limit`` is set.
        query_type: Kind of query to build.
        embeddings: Query vector for VECTOR / HYBRID queries.
        search_text: Whitespace-separated terms for FULL_TEXT_RANK / HYBRID.
        pre_filter: Optional filter turned into a ``WHERE`` clause.
        offset_limit: Optional clause appended verbatim at the end.
        projection_mapping: Optional ``{property: alias}`` projection.

    Returns:
        ``(query, parameters)``.

    Raises:
        ValueError: If ``search_text`` is ``None`` for a FULL_TEXT_RANK or
            HYBRID query.
    """
    # TODO: Update the code to use parameters once parametrized queries
    # are allowed for these query functions
    inline_literals = query_type in (
        CosmosDBQueryType.FULL_TEXT_RANK,
        CosmosDBQueryType.HYBRID,
    )

    if offset_limit:
        top_clause = ""
    elif inline_literals:
        top_clause = f"TOP {k} "
    else:
        top_clause = "TOP @limit "

    query = "SELECT " + top_clause
    query += self._generate_projection_fields(
        projection_mapping, query_type, embeddings
    )
    query += " FROM c "

    # Add where_clause if specified
    if pre_filter:
        query += self._build_where_clause(pre_filter)

    if query_type == CosmosDBQueryType.FULL_TEXT_RANK:
        if search_text is None:
            raise ValueError(
                "search text cannot be None for FULL_TEXT_RANK queries."
            )
        # Terms are inlined, so each one is escaped as a string literal.
        terms = ", ".join(self._to_sql_literal(t) for t in search_text.split())
        query += f" ORDER BY RANK FullTextScore(c.{self._text_key}, [{terms}])"
    elif query_type == CosmosDBQueryType.VECTOR:
        query += " ORDER BY VectorDistance(c[@embeddingKey], @embeddings)"
    elif query_type == CosmosDBQueryType.HYBRID:
        if search_text is None:
            raise ValueError("search text cannot be None for HYBRID queries.")
        terms = ", ".join(self._to_sql_literal(t) for t in search_text.split())
        query += (
            f" ORDER BY RANK RRF(FullTextScore(c.{self._text_key}, [{terms}]), "
            f"VectorDistance(c.{self._embedding_key}, {embeddings}))"
        )

    # Add limit_offset_clause if specified
    if offset_limit is not None:
        query += f" {offset_limit}"

    # TODO: Remove this if check once parametrized queries
    # are allowed for these query functions
    parameters: List[Dict[str, Any]] = []
    if query_type in (
        CosmosDBQueryType.FULL_TEXT_SEARCH,
        CosmosDBQueryType.VECTOR,
    ):
        parameters = self._build_parameters(
            k=k,
            query_type=query_type,
            embeddings=embeddings,
            projection_mapping=projection_mapping,
        )
    return query, parameters


def _generate_projection_fields(
    self,
    projection_mapping: Optional[Dict[str, Any]],
    query_type: CosmosDBQueryType,
    embeddings: Optional[List[float]] = None,
) -> str:
    """Build the ``SELECT`` projection list for ``query_type``.

    Without a ``projection_mapping`` the text, metadata and embedding are
    aliased to the store's configured key names, which are the names
    ``_execute_query`` reads from each returned item.

    Args:
        projection_mapping: Optional ``{property: alias}`` projection.
        query_type: Kind of query being built.
        embeddings: Query vector, inlined for HYBRID queries.

    Returns:
        The comma-separated projection.
    """
    # TODO: Remove this if check once parametrized queries
    # are allowed for these query functions
    if query_type in (
        CosmosDBQueryType.FULL_TEXT_RANK,
        CosmosDBQueryType.HYBRID,
    ):
        if projection_mapping:
            projection = ", ".join(
                f"c.{key} as {alias}" for key, alias in projection_mapping.items()
            )
        else:
            projection = (
                f"c.id, c.{self._text_key} as {self._text_key}, "
                f"c.{self._metadata_key} as {self._metadata_key}"
            )
        if query_type == CosmosDBQueryType.HYBRID:
            projection += (
                f", c.{self._embedding_key} as {self._embedding_key}, "
                f"VectorDistance(c.{self._embedding_key}, "
                f"{embeddings}) as SimilarityScore"
            )
    else:
        if projection_mapping:
            # Bracket access is `c[...]`; there is no dot before the bracket.
            projection = ", ".join(
                f"c[@{key}] as {alias}" for key, alias in projection_mapping.items()
            )
        else:
            projection = (
                f"c.id, c[@textKey] as {self._text_key}, "
                f"c[@metadataKey] as {self._metadata_key}"
            )
        # HYBRID never reaches this branch, so only VECTOR is checked.
        if query_type == CosmosDBQueryType.VECTOR:
            projection += (
                f", c[@embeddingKey] as {self._embedding_key}, "
                "VectorDistance(c[@embeddingKey], "
                "@embeddings) as SimilarityScore"
            )
    return projection


def _build_parameters(
    self,
    k: int,
    query_type: CosmosDBQueryType,
    embeddings: Optional[List[float]],
    search_terms: Optional[List[str]] = None,
    projection_mapping: Optional[Dict[str, Any]] = None,
) -> List[Dict[str, Any]]:
    """Build the ``@``-parameter list for a parameterized query.

    Args:
        k: Value of ``@limit``.
        query_type: Kind of query the parameters are for.
        embeddings: Value of ``@embeddings`` (VECTOR / HYBRID).
        search_terms: Value of ``@searchTerms`` (FULL_TEXT_RANK / HYBRID).
        projection_mapping: When given, one ``@<key>`` parameter is added per
            key instead of ``@metadataKey``.

    Returns:
        A list of ``{"name": ..., "value": ...}`` dicts.
    """
    parameters: List[Dict[str, Any]] = [
        {"name": "@limit", "value": k},
        {"name": "@textKey", "value": self._text_key},
    ]

    if projection_mapping:
        parameters.extend(
            {"name": f"@{key}", "value": key} for key in projection_mapping
        )
    else:
        parameters.append({"name": "@metadataKey", "value": self._metadata_key})

    # Two independent checks: HYBRID needs both the search terms and the
    # embedding parameters, which an if/elif chain can never provide.
    if query_type in (
        CosmosDBQueryType.FULL_TEXT_RANK,
        CosmosDBQueryType.HYBRID,
    ):
        parameters.append({"name": "@searchTerms", "value": search_terms})
    if query_type in (CosmosDBQueryType.VECTOR, CosmosDBQueryType.HYBRID):
        parameters.append({"name": "@embeddingKey", "value": self._embedding_key})
        parameters.append({"name": "@embeddings", "value": embeddings})

    return parameters


def _to_sql_literal(self, value: Any) -> str:
    """Render a Python scalar as a literal for the query text.

    Strings are single-quoted with backslashes and apostrophes escaped,
    booleans become ``true`` / ``false``, numbers use ``str()`` and ``None``
    becomes ``NULL``.

    Raises:
        ValueError: For any other type.
    """
    if value is None:
        return "NULL"
    # bool is checked before int because bool is a subclass of int.
    if isinstance(value, bool):
        return "true" if value else "false"
    if isinstance(value, (int, float)):
        return str(value)
    if isinstance(value, str):
        escaped = value.replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"
    raise ValueError(f"Unsupported value type: {type(value)}")


def _build_where_clause(self, pre_filter: PreFilter) -> str:
    """Build a ``WHERE`` clause from the given pre_filter.

    Args:
        pre_filter: Object exposing ``conditions`` (each with ``property``,
            ``operator`` and ``value``) and an optional ``logical_operator``
            used to join the conditions.

    Returns:
        The clause, starting with ``WHERE``, or an empty string when the
        filter has no conditions.

    Raises:
        ValueError: For an unknown operator or logical operator, a
            non-string value for a full text operator, an unsupported value
            type, or several conditions without a logical operator.
    """
    operator_map = self._where_clause_operator_map()
    logical_operator = pre_filter.logical_operator
    if logical_operator and logical_operator not in operator_map:
        raise ValueError(
            f"unsupported logical_operator: {pre_filter.logical_operator}"
        )
    sql_logical_operator = operator_map.get(logical_operator or "", "")

    clauses = []
    for condition in pre_filter.conditions:
        if condition.operator not in operator_map:
            raise ValueError(f"Unsupported operator: {condition.operator}")

        sql_operator = operator_map[condition.operator]
        if "full_text" in condition.operator:
            if not isinstance(condition.value, str):
                raise ValueError(
                    f"Expected a string for {condition.operator}, "
                    f"got {type(condition.value)}"
                )
            search_terms = ", ".join(
                self._to_sql_literal(term) for term in condition.value.split()
            )
            clauses.append(
                f"{sql_operator}(c.{condition.property}, {search_terms})"
            )
            continue

        if isinstance(condition.value, list):
            # e.g., for IN clauses; every element is rendered as a literal
            # so that string members are quoted.
            members = ", ".join(self._to_sql_literal(v) for v in condition.value)
            value = f"({members})"
        else:
            value = self._to_sql_literal(condition.value)
        # Spaces around the operator are required for word operators (IN).
        clauses.append(f"c.{condition.property} {sql_operator} {value}")

    if not clauses:
        # Nothing to filter on: a bare "WHERE" would not be a valid clause.
        return ""
    if len(clauses) > 1 and not sql_logical_operator:
        raise ValueError(
            "logical_operator is required when more than one condition is given"
        )
    return "WHERE " + f" {sql_logical_operator} ".join(clauses)


def _execute_query(
    self,
    query: str,
    query_type: CosmosDBQueryType,
    parameters: List[Dict[str, Any]],
    with_embedding: bool,
    projection_mapping: Optional[Dict[str, Any]],
) -> List[Tuple[Document, float]]:
    """Run ``query`` and convert the returned items to scored documents.

    Args:
        query: Query text.
        query_type: Kind of query; VECTOR and HYBRID results carry a
            ``SimilarityScore``, all others get a score of ``0.0``.
        parameters: Query parameters.
        with_embedding: Whether to copy each item's embedding into the
            document metadata.
        projection_mapping: Optional ``{property: alias}`` projection; when
            given, each alias (except the one for the text key) is copied
            into the metadata, otherwise the item ``id`` is.

    Returns:
        A list of ``(Document, score)`` tuples.
    """
    scored_types = (CosmosDBQueryType.VECTOR, CosmosDBQueryType.HYBRID)
    docs_and_scores = []

    items = self._container.query_items(
        query=query, parameters=parameters, enable_cross_partition_query=True
    )
    for item in items:
        text = item[self._text_key]
        # A stored null metadata is treated like a missing one.
        metadata = item.pop(self._metadata_key, None) or {}
        score = 0.0

        if projection_mapping:
            for key, alias in projection_mapping.items():
                if key == self._text_key:
                    continue
                metadata[alias] = item[alias]
        else:
            metadata["id"] = item["id"]

        if query_type in scored_types:
            score = item["SimilarityScore"]
            if with_embedding:
                metadata[self._embedding_key] = item[self._embedding_key]

        docs_and_scores.append(
            (Document(page_content=text, metadata=metadata), score)
        )
    return docs_and_scores


def _where_clause_operator_map(self) -> Dict[str, str]:
    """Return the mapping from filter operators to their query-text form."""
    return {
        "$eq": "=",
        "$ne": "!=",
        "$in": "IN",
        "$lt": "<",
        "$lte": "<=",
        "$gt": ">",
        "$gte": ">=",
        "$add": "+",
        "$sub": "-",
        "$mul": "*",
        "$div": "/",
        "$mod": "%",
        "$or": "OR",
        "$and": "AND",
        "$not": "NOT",
        "$concat": "||",
        "$bit_or": "|",
        "$bit_and": "&",
        "$bit_xor": "^",
        "$bit_lshift": "<<",
        "$bit_rshift": ">>",
        "$bit_zerofill_rshift": ">>>",
        "$full_text_contains": "FullTextContains",
        "$full_text_contains_all": "FullTextContainsAll",
        "$full_text_contains_any": "FullTextContainsAny",
    }