def __init__(
    self,
    embedding_function: Embeddings,
    path_or_url: Optional[str] = None,
    table_name: str = _DEFAULT_TABLE_NAME,
    db_name: str = _DEFAULT_CLUSTER_DB_NAME,
    flag: int = _DEFAULT_VERSION,
    **kwargs: Any,
) -> None:
    """Initialize the vearch vector store.

    Args:
        embedding_function: Embeddings object; stored as ``self.embedding_func``.
        path_or_url: Cluster URL when ``flag`` is truthy (required in that
            mode). In standalone mode it is the metadata directory and
            defaults to the current working directory.
        table_name: Table (space) name. A falsy value is replaced by the
            class default name plus a random suffix.
        db_name: Cluster database name, only used when ``flag`` is truthy.
            A falsy value is replaced by the class default name plus a
            random suffix.
        flag: 1 for cluster, 0 for standalone.
        **kwargs: Accepted for interface compatibility; not used here.

    Raises:
        ImportError: If the package required by the selected mode cannot be
            imported.
        ValueError: If ``flag`` is truthy and ``path_or_url`` is None.
    """
    try:
        if flag:
            import vearch_cluster
        else:
            import vearch
    except ImportError as exc:
        # Chain the original error so the real import failure stays visible.
        raise ImportError(
            "Could not import suitable python package. "
            "Please install it with `pip install vearch or vearch_cluster`."
        ) from exc

    if flag:
        if path_or_url is None:
            raise ValueError("Please input url of cluster")
        if not db_name:
            db_name = self._DEFAULT_CLUSTER_DB_NAME
            db_name += "_"
            db_name += str(uuid.uuid4()).split("-")[-1]
        self.using_db_name = db_name
        self.url = path_or_url
        self.vearch = vearch_cluster.VearchCluster(path_or_url)
    else:
        if path_or_url is None:
            metadata_path = os.getcwd().replace("\\", "/")
        else:
            metadata_path = path_or_url
        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(metadata_path, exist_ok=True)
        log_path = os.path.join(metadata_path, "log")
        os.makedirs(log_path, exist_ok=True)
        self.vearch = vearch.Engine(metadata_path, log_path)
        self.using_metapath = metadata_path

    if not table_name:
        table_name = self._DEFAULT_TABLE_NAME
        table_name += "_"
        table_name += str(uuid.uuid4()).split("-")[-1]
    self.using_table_name = table_name
    self.embedding_func = embedding_function
    self.flag = flag
def _create_table(
    self,
    dim: int = 1024,
    field_list: Optional[List[dict]] = None,
) -> int:
    """Create the standalone vearch table named ``self.using_table_name``.

    Args:
        dim: Dimension of the ``text_embedding`` vector field.
        field_list: Scalar fields to store, each a dict with a ``"field"``
            name and a ``"type"`` of ``"int"`` or ``"str"``. Defaults to the
            ``text`` and ``metadata`` string fields.

    Returns:
        The code returned by the engine's ``create_table``: 0 for success,
        1 for failure.
    """
    # ``__init__`` imports vearch only inside its own scope, so import it here
    # as well instead of relying on a module-level binding.
    import vearch

    if field_list is None:
        # Built per call so no list object is shared between invocations.
        field_list = [
            {"field": "text", "type": "str"},
            {"field": "metadata", "type": "str"},
        ]
    type_dict = {"int": vearch.dataType.INT, "str": vearch.dataType.STRING}
    engine_info = {
        "index_size": 10000,
        "retrieval_type": "IVFPQ",
        "retrieval_param": {"ncentroids": 2048, "nsubvector": 32},
    }
    fields = [
        vearch.GammaFieldInfo(fi["field"], type_dict[fi["type"]])
        for fi in field_list
    ]
    vector_field = vearch.GammaVectorInfo(
        name="text_embedding",
        type=vearch.dataType.VECTOR,
        is_index=True,
        dimension=dim,
        model_id="",
        store_type="MemoryOnly",
        store_param={"cache_size": 10000},
        has_source=False,
    )
    response_code = self.vearch.create_table(
        engine_info,
        name=self.using_table_name,
        fields=fields,
        vector_field=vector_field,
    )
    return response_code


def _create_space(
    self,
    dim: int = 1024,
) -> int:
    """Create the cluster space named ``self.using_table_name``.

    The space is created inside the database ``self.using_db_name``.

    Args:
        dim: Dimension of the ``text_embedding`` vector field.

    Returns:
        The code returned by the cluster client's ``create_space``:
        0 for failure, 1 for success.
    """
    space_config = {
        "name": self.using_table_name,
        "partition_num": 1,
        "replica_num": 1,
        "engine": {
            "name": "gamma",
            "index_size": 1,
            "retrieval_type": "FLAT",
            "retrieval_param": {
                "metric_type": "L2",
            },
        },
        "properties": {
            "text": {
                "type": "string",
            },
            "metadata": {
                "type": "string",
            },
            "text_embedding": {
                "type": "vector",
                "index": True,
                "dimension": dim,
                "store_type": "MemoryOnly",
            },
        },
    }
    response_code = self.vearch.create_space(self.using_db_name, space_config)
    return response_code
def add_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[dict]] = None,
    **kwargs: Any,
) -> List[str]:
    """Embed ``texts`` and insert them into the vector store.

    Each stored record holds the text, the ``"source"`` entry of its metadata
    (an empty string when it is absent) and the L2-normalized embedding.

    Args:
        texts: Texts to add. May be any iterable, including a one-shot
            generator.
        metadatas: Optional metadata dicts paired with ``texts`` by position.
            When omitted, every text is stored with an empty source.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        List of ids from adding the texts into the vectorstore. Empty when
        ``texts`` is empty.

    Raises:
        ValueError: If no embeddings could be computed, or if creating the
            cluster db/space or the standalone table fails.
    """
    # Materialize once: ``texts`` is needed both for embedding and for pairing
    # with the embeddings, and a generator would be exhausted by the first use.
    text_list = list(texts)

    embeddings = None
    if self.embedding_func is not None:
        embeddings = self.embedding_func.embed_documents(text_list)
    if embeddings is None:
        raise ValueError("embeddings is None")
    if not text_list:
        # Nothing to insert; also avoids indexing embeddings[0] below.
        return []
    if metadatas is None:
        metadatas = [{} for _ in text_list]

    if self.flag:
        dbs_list = self.vearch.list_dbs()
        if self.using_db_name not in dbs_list:
            create_db_code = self.vearch.create_db(self.using_db_name)
            if not create_db_code:
                raise ValueError("create db failed!!!")
        space_list = self.vearch.list_spaces(self.using_db_name)
        if self.using_table_name not in space_list:
            create_space_code = self._create_space(len(embeddings[0]))
            if not create_space_code:
                raise ValueError("create space failed!!!")
        docid = []
        for text, metadata, embed in zip(text_list, metadatas, embeddings):
            profiles: dict[str, Any] = {}
            profiles["text"] = text
            profiles["metadata"] = metadata.get("source", "") if metadata else ""
            embed_np = np.array(embed)
            profiles["text_embedding"] = {
                "feature": (embed_np / np.linalg.norm(embed_np)).tolist()
            }
            insert_res = self.vearch.insert_one(
                self.using_db_name, self.using_table_name, profiles
            )
            if insert_res["status"] != 200:
                # One retry; its id is recorded without re-checking the status.
                insert_res = self.vearch.insert_one(
                    self.using_db_name, self.using_table_name, profiles
                )
            docid.append(insert_res["_id"])
    else:
        table_path = os.path.join(
            self.using_metapath, self.using_table_name + ".schema"
        )
        if not os.path.exists(table_path):
            dim = len(embeddings[0])
            response_code = self._create_table(dim)
            if response_code:
                raise ValueError("create table failed!!!")
        doc_items = []
        for text, metadata, embed in zip(text_list, metadatas, embeddings):
            profiles_v: dict[str, Any] = {}
            profiles_v["text"] = text
            profiles_v["metadata"] = metadata.get("source", "") if metadata else ""
            embed_np = np.array(embed)
            profiles_v["text_embedding"] = embed_np / np.linalg.norm(embed_np)
            doc_items.append(profiles_v)

        docid = self.vearch.add(doc_items)
        # NOTE(review): ``docid`` is not refreshed inside this loop, so on a
        # count mismatch it acts as a bounded delay before dump() -- confirm
        # whether the engine is expected to update the list in place.
        t_time = 0
        while len(docid) != len(embeddings):
            time.sleep(0.5)
            if t_time > 6:
                break
            t_time += 1
        self.vearch.dump()
    return docid
def _load(self) -> None:
    """Load the standalone vearch engine by calling ``load()`` on it."""
    engine = self.vearch
    engine.load()
@classmethod
def load_local(
    cls,
    embedding: Embeddings,
    path_or_url: Optional[str] = None,
    table_name: str = _DEFAULT_TABLE_NAME,
    db_name: str = _DEFAULT_CLUSTER_DB_NAME,
    flag: int = _DEFAULT_VERSION,
    **kwargs: Any,
) -> Vearch:
    """Open an existing local table of standalone vearch.

    Args:
        embedding: Embeddings object handed to the constructor.
        path_or_url: Metadata directory that contains ``<table_name>.schema``.
        table_name: Name of the table to load.
        db_name: Forwarded to the constructor unchanged.
        flag: Forwarded to the constructor unchanged.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        A new store instance on which ``_load()`` has been called.

    Raises:
        ValueError: If the path or table name is empty, or if the schema file
            does not exist under ``path_or_url``.
    """
    if not path_or_url:
        raise ValueError("No metadata path!!!")
    if not table_name:
        raise ValueError("No table name!!!")

    schema_file = os.path.join(path_or_url, f"{table_name}.schema")
    if not os.path.exists(schema_file):
        raise ValueError("vearch vectorbase table not exist!!!")

    init_args = dict(
        embedding_function=embedding,
        path_or_url=path_or_url,
        table_name=table_name,
        db_name=db_name,
        flag=flag,
    )
    store = cls(**init_args)
    store._load()
    return store
def similarity_search(
    self,
    query: str,
    k: int = DEFAULT_TOPN,
    **kwargs: Any,
) -> List[Document]:
    """Return the documents most similar to ``query``.

    The query is embedded with ``self.embedding_func`` and the vector is
    passed to ``similarity_search_by_vector``.

    Args:
        query: Text to search for.
        k: Number of documents to request.
        **kwargs: Accepted for interface compatibility; not forwarded.

    Returns:
        The documents returned by ``similarity_search_by_vector``.

    Raises:
        ValueError: If no embedding function is configured.
    """
    embedder = self.embedding_func
    if embedder is None:
        raise ValueError("embedding_func is None!!!")
    query_vector = embedder.embed_query(query)
    return self.similarity_search_by_vector(query_vector, k)
def similarity_search_by_vector(
    self,
    embedding: List[float],
    k: int = DEFAULT_TOPN,
    **kwargs: Any,
) -> List[Document]:
    """Return the documents most similar to an embedding vector.

    The vector is divided by its L2 norm before it is sent to the engine.

    Args:
        embedding: Embedding vector of the query.
        k: Number of documents to request.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        One ``Document`` per hit, with the hit's ``text`` as page content and
        its ``metadata`` value stored under the ``"source"`` metadata key.
    """
    vector = np.array(embedding)
    unit_vector = vector / np.linalg.norm(vector)

    if self.flag:
        # Cluster mode: JSON-style query, so the vector is sent as a list.
        cluster_query = {
            "query": {
                "sum": [
                    {
                        "field": "text_embedding",
                        "feature": unit_vector.tolist(),
                    }
                ],
            },
            "size": k,
            "fields": ["text", "metadata"],
        }
        response = self.vearch.search(
            self.using_db_name, self.using_table_name, cluster_query
        )
        hits = response["hits"]["hits"]
    else:
        # Standalone mode: the engine receives the numpy array directly.
        local_query = {
            "vector": [
                {
                    "field": "text_embedding",
                    "feature": unit_vector,
                }
            ],
            "fields": [],
            "is_brute_search": 1,
            "retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20},
            "topn": k,
        }
        response = self.vearch.search(local_query)
        hits = response[0]["result_items"]

    documents = []
    for hit in hits:
        # Cluster hits wrap the stored fields in "_source".
        stored = hit["_source"] if self.flag else hit
        page_text = ""
        meta = {}
        for name in stored:
            if name == "text":
                page_text = stored[name]
            elif name == "metadata":
                meta["source"] = stored[name]
        documents.append(Document(page_content=page_text, metadata=meta))
    return documents
def similarity_search_with_score(
    self,
    query: str,
    k: int = DEFAULT_TOPN,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return the documents most similar to ``query`` together with scores.

    The query is embedded with ``self.embedding_func`` and the vector is
    divided by its L2 norm before it is sent to the engine.

    Args:
        query: Text to search for.
        k: Number of documents to request.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        A list of ``(Document, score)`` tuples. In cluster mode the score is
        the hit's ``"_score"``; in standalone mode it is the hit's ``"score"``
        field, or 0.0 when the hit carries none.

    Raises:
        ValueError: If no embedding function is configured.
    """
    if self.embedding_func is None:
        raise ValueError("embedding_func is None!!!")
    embeddings = self.embedding_func.embed_query(query)
    embed = np.array(embeddings)
    normalized = embed / np.linalg.norm(embed)

    if self.flag:
        query_data = {
            "query": {
                "sum": [
                    {
                        "field": "text_embedding",
                        "feature": normalized.tolist(),
                    }
                ],
            },
            "size": k,
            "fields": ["text_embedding", "text", "metadata"],
        }
        query_result = self.vearch.search(
            self.using_db_name, self.using_table_name, query_data
        )
        res = query_result["hits"]["hits"]
    else:
        query_data = {
            "vector": [
                {
                    "field": "text_embedding",
                    "feature": normalized,
                }
            ],
            "fields": [],
            "is_brute_search": 1,
            "retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20},
            "topn": k,
        }
        query_result = self.vearch.search(query_data)
        res = query_result[0]["result_items"]

    results: List[Tuple[Document, float]] = []
    for item in res:
        content = ""
        meta_data = {}
        # Reset for every hit so a hit without a score can neither reuse the
        # previous hit's value nor leave the name unbound.
        score = 0.0
        if self.flag:
            score = item["_score"]
            item = item["_source"]
        for item_key in item:
            if item_key == "text":
                content = item[item_key]
            elif item_key == "metadata":
                meta_data["source"] = item[item_key]
            elif not self.flag and item_key == "score":
                # Same truthiness test as the mode switch above.
                score = item[item_key]
        results.append((Document(page_content=content, metadata=meta_data), score))
    return results
def delete(
    self,
    ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> Optional[bool]:
    """Delete the documents which have the specified ids.

    Every id is attempted, even after an earlier deletion has failed.

    Args:
        ids: The ids of the embedding vectors.
        **kwargs: Other keyword arguments that subclasses might use.

    Returns:
        None when ``ids`` is None or empty. Otherwise True if every delete
        call returned 0, False if any returned something else.
    """
    if not ids:
        return None

    return_codes = []
    for doc_id in ids:
        if self.flag:
            code = self.vearch.delete(
                self.using_db_name, self.using_table_name, doc_id
            )
        else:
            code = self.vearch.del_doc(doc_id)
        return_codes.append(code)
    # NOTE(review): assumes both clients report success as 0 -- confirm for
    # the cluster client.
    return all(code == 0 for code in return_codes)
def get(
    self,
    ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> Dict[str, Document]:
    """Return docs according ids.

    Args:
        ids: The ids of the embedding vectors.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        Mapping from document id to ``Document`` for every id that was found.
        Empty when ``ids`` is None or empty.
    """

    def _to_document(stored: Any) -> Document:
        # Build a Document from a stored record's "text"/"metadata" fields.
        content = stored["text"] if "text" in stored else ""
        meta_info = {"source": stored["metadata"]} if "metadata" in stored else {}
        return Document(page_content=content, metadata=meta_info)

    results: Dict[str, Document] = {}
    if not ids:
        return results

    if self.flag:
        query_data = {"query": {"ids": ids}}
        docs_detail = self.vearch.mget_by_ids(
            self.using_db_name, self.using_table_name, query_data
        )
        for record in docs_detail:
            if record["found"] is False:
                continue
            results[record["_id"]] = _to_document(record["_source"])
    else:
        for doc_id in ids:
            doc_detail = self.vearch.get_doc_by_id(doc_id)
            if doc_detail == {}:
                # The code treats an empty dict as "id not found".
                continue
            results[doc_detail["_id"]] = _to_document(doc_detail)
    return results