Source code for langchain_community.vectorstores.baiduvectordb
"""Wrapper around the Baidu vector database."""from__future__importannotationsimportjsonimportloggingimporttimefromtypingimportAny,Dict,Iterable,List,Optional,Tupleimportnumpyasnpfromlangchain_core.documentsimportDocumentfromlangchain_core.embeddingsimportEmbeddingsfromlangchain_core.utilsimportguard_importfromlangchain_core.vectorstoresimportVectorStorefromlangchain_community.vectorstores.utilsimportmaximal_marginal_relevancelogger=logging.getLogger(__name__)
class ConnectionParams:
    """Baidu VectorDB connection params.

    See the following documentation for details:
    https://cloud.baidu.com/doc/VDB/s/6lrsob0wy

    Attributes:
        endpoint (str): The access address of the vector database server
            that the client needs to connect to.
        api_key (str): API key for the client to access the vector database
            server, used for authentication.
        account (str): Account for the client to access the vector database
            server.
        connection_timeout_in_mills (int): Request timeout in milliseconds.
    """
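
    # NOTE: illustrative sketch, not part of the upstream module. The constructor
    # simply stores the attributes documented in the docstring above; the defaults
    # for `account` and `connection_timeout_in_mills` are assumptions.
    def __init__(
        self,
        endpoint: str,
        api_key: str,
        account: str = "root",
        connection_timeout_in_mills: int = 50 * 1000,
    ) -> None:
        self.endpoint = endpoint
        self.api_key = api_key
        self.account = account
        self.connection_timeout_in_mills = connection_timeout_in_mills


# NOTE: minimal sketch of the table parameters referenced by
# `BaiduVectorDB.from_texts` below, which only requires a mutable `dimension`
# attribute. The upstream TableParams class carries additional index settings
# that are omitted here.
class TableParams:
    """Baidu VectorDB table params (minimal sketch: vector dimension only)."""

    def __init__(self, dimension: int) -> None:
        self.dimension = dimension
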

class BaiduVectorDB(VectorStore):
    """Baidu VectorDB as a vector store.

    In order to use this you need to have a database instance.
    See the following documentation for details:
    https://cloud.baidu.com/doc/VDB/index.html
    """

    field_id: str = "id"
    field_vector: str = "vector"
    field_text: str = "text"
    field_metadata: str = "metadata"

    index_vector: str = "vector_idx"

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        connection_params: Optional[ConnectionParams] = None,
        table_params: Optional[TableParams] = None,
        database_name: str = "LangChainDatabase",
        table_name: str = "LangChainTable",
        drop_old: Optional[bool] = False,
        **kwargs: Any,
    ) -> BaiduVectorDB:
        """Create a table, index it with HNSW, and insert data."""
        if len(texts) == 0:
            raise ValueError("texts is empty")
        if connection_params is None:
            raise ValueError("connection_params is empty")
        # Probe the embedding dimension with the first text; embed_query is the
        # fallback for embedding models that do not implement embed_documents.
        try:
            embeddings = embedding.embed_documents(texts[0:1])
        except NotImplementedError:
            embeddings = [embedding.embed_query(texts[0])]
        dimension = len(embeddings[0])
        if table_params is None:
            table_params = TableParams(dimension=dimension)
        else:
            table_params.dimension = dimension
        vector_db = cls(
            embedding=embedding,
            connection_params=connection_params,
            table_params=table_params,
            database_name=database_name,
            table_name=table_name,
            drop_old=drop_old,
        )
        vector_db.add_texts(texts=texts, metadatas=metadatas)
        return vector_db

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        batch_size: int = 1000,
        **kwargs: Any,
    ) -> List[str]:
        """Insert text data into Baidu VectorDB."""
        texts = list(texts)
        try:
            embeddings = self.embedding_func.embed_documents(texts)
        except NotImplementedError:
            embeddings = [self.embedding_func.embed_query(x) for x in texts]
        if len(embeddings) == 0:
            logger.debug("Nothing to insert, skipping.")
            return []

        pks: List[str] = []
        total_count = len(embeddings)
        for start in range(0, total_count, batch_size):
            # Grab the end index of the current batch.
            rows = []
            end = min(start + batch_size, total_count)
            for id in range(start, end, 1):
                metadata = "{}"
                if metadatas is not None:
                    metadata = json.dumps(metadatas[id])
                row = self.mochowtable.Row(
                    id="{}-{}-{}".format(time.time_ns(), hash(texts[id]), id),
                    vector=[float(num) for num in embeddings[id]],
                    text=texts[id],
                    metadata=metadata,
                )
                rows.append(row)
                pks.append(str(id))
            self.table.upsert(rows=rows)

        # The vector index must be rebuilt after an upsert; poll until it is ready.
        self.table.rebuild_index(self.index_vector)
        while True:
            time.sleep(2)
            index = self.table.describe_index(self.index_vector)
            if index.state == self.mochowenum.IndexState.NORMAL:
                break
        return pks

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        param: Optional[dict] = None,
        expr: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Perform a similarity search against the query string."""
        res = self.similarity_search_with_score(
            query=query, k=k, param=param, expr=expr, **kwargs
        )
        return [doc for doc, _ in res]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        param: Optional[dict] = None,
        expr: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Perform a search on a query string and return results with score."""
        # Embed the query text.
        embedding = self.embedding_func.embed_query(query)
        res = self._similarity_search_with_score(
            embedding=embedding, k=k, param=param, expr=expr, **kwargs
        )
        return res

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        param: Optional[dict] = None,
        expr: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Perform a similarity search against a query vector."""
        res = self._similarity_search_with_score(
            embedding=embedding, k=k, param=param, expr=expr, **kwargs
        )
        return [doc for doc, _ in res]

    def _similarity_search_with_score(
        self,
        embedding: List[float],
        k: int = 4,
        param: Optional[dict] = None,
        expr: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Perform a search on a query vector and return results with score."""
        ef = 10 if param is None else param.get("ef", 10)
        anns = self.mochowtable.AnnSearch(
            vector_field=self.field_vector,
            vector_floats=[float(num) for num in embedding],
            params=self.mochowtable.HNSWSearchParams(ef=ef, limit=k),
            filter=expr,
        )
        res = self.table.search(anns=anns)
        rows = [[item] for item in res.rows]
        # Organize results.
        ret: List[Tuple[Document, float]] = []
        if rows is None or len(rows) == 0:
            return ret
        for row in rows:
            for result in row:
                row_data = result.get("row", {})
                meta = row_data.get(self.field_metadata)
                if meta is not None:
                    meta = json.loads(meta)
                doc = Document(
                    page_content=row_data.get(self.field_text), metadata=meta
                )
                pair = (doc, result.get("score", 0.0))
                ret.append(pair)
        return ret

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        param: Optional[dict] = None,
        expr: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Perform a search and return results that are reordered by MMR."""
        embedding = self.embedding_func.embed_query(query)
        return self._max_marginal_relevance_search(
            embedding=embedding,
            k=k,
            fetch_k=fetch_k,
            lambda_mult=lambda_mult,
            param=param,
            expr=expr,
            **kwargs,
        )

    def _max_marginal_relevance_search(
        self,
        embedding: List[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        param: Optional[dict] = None,
        expr: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Perform a search and return results that are reordered by MMR."""
        ef = 10 if param is None else param.get("ef", 10)
        anns = self.mochowtable.AnnSearch(
            vector_field=self.field_vector,
            vector_floats=[float(num) for num in embedding],
            # Fetch fetch_k candidates so MMR can rerank them down to k.
            params=self.mochowtable.HNSWSearchParams(ef=ef, limit=fetch_k),
            filter=expr,
        )
        res = self.table.search(anns=anns, retrieve_vector=True)
        # Organize results.
        documents: List[Document] = []
        ordered_result_embeddings = []
        rows = [[item] for item in res.rows]
        if rows is None or len(rows) == 0:
            return documents
        for row in rows:
            for result in row:
                row_data = result.get("row", {})
                meta = row_data.get(self.field_metadata)
                if meta is not None:
                    meta = json.loads(meta)
                doc = Document(
                    page_content=row_data.get(self.field_text), metadata=meta
                )
                documents.append(doc)
                ordered_result_embeddings.append(row_data.get(self.field_vector))
        # Get the new order of results.
        new_ordering = maximal_marginal_relevance(
            np.array(embedding),
            ordered_result_embeddings,
            k=k,
            lambda_mult=lambda_mult,
        )
        # Reorder the values and return.
        ret = []
        for x in new_ordering:
            # Function can return a -1 index.
            if x == -1:
                break
            else:
                ret.append(documents[x])
        return ret
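
# ---------------------------------------------------------------------------
# Illustrative usage sketch, not part of the upstream module. It assumes a
# reachable Baidu VectorDB endpoint and a valid API key; FakeEmbeddings from
# langchain_community is used purely as a stand-in embedding model.
# ---------------------------------------------------------------------------
def _example_usage() -> None:
    from langchain_community.embeddings import FakeEmbeddings

    conn = ConnectionParams(
        endpoint="http://127.0.0.1:8287",  # placeholder endpoint
        api_key="your-api-key",  # placeholder credential
    )
    store = BaiduVectorDB.from_texts(
        texts=["doc one", "doc two", "doc three"],
        embedding=FakeEmbeddings(size=128),
        metadatas=[{"source": str(i)} for i in range(3)],
        connection_params=conn,
        drop_old=True,
    )
    # Plain similarity search returns the k most similar documents.
    docs = store.similarity_search("doc one", k=2)
    # MMR search trades off similarity against diversity among the results.
    diverse_docs = store.max_marginal_relevance_search("doc one", k=2, fetch_k=5)
    print(docs, diverse_docs)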