def __init__(
    self,
    table_name: str = _DEFAULT_TABLE_NAME,
    embedding: Optional[Embeddings] = None,
    log_and_data_dir: Optional[str] = None,
    client: Optional[awadb.Client] = None,
    **kwargs: Any,
) -> None:
    """Initialize with AwaDB client.

    If table_name is left at its default, a random table name of
    `_DEFAULT_TABLE_NAME + "_" + last segment of a uuid4` is generated so
    that separate instances do not share a table by accident.

    Args:
        table_name: Name of the table to create, default _DEFAULT_TABLE_NAME.
        embedding: Optional Embeddings registered for the created table.
        log_and_data_dir: Optional root directory of log and data. Only
            used when no ``client`` is supplied.
        client: Optional, already constructed AwaDB client. Takes
            precedence over ``log_and_data_dir``.
        kwargs: Any possible extend parameters in the future (unused).

    Returns:
        None.

    Raises:
        ImportError: If the ``awadb`` package is not installed. The
            original import failure is attached as ``__cause__``.
    """
    try:
        import awadb
    except ImportError as exc:
        # Chain the original error so the real import failure stays visible.
        raise ImportError(
            "Could not import awadb python package. "
            "Please install it with `pip install awadb`."
        ) from exc

    if client is not None:
        self.awadb_client = client
    elif log_and_data_dir is not None:
        self.awadb_client = awadb.Client(log_and_data_dir)
    else:
        self.awadb_client = awadb.Client()

    if table_name == self._DEFAULT_TABLE_NAME:
        # Default name requested: append a random suffix to make it unique.
        table_name += "_"
        table_name += str(uuid.uuid4()).split("-")[-1]

    self.awadb_client.Create(table_name)

    # Maps a table name to the Embeddings object used for that table.
    self.table2embeddings: dict[str, Embeddings] = {}
    if embedding is not None:
        self.table2embeddings[table_name] = embedding
    self.using_table_name = table_name
def add_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[dict]] = None,
    is_duplicate_texts: Optional[bool] = None,
    **kwargs: Any,
) -> List[str]:
    """Run more texts through the embeddings and add to the vectorstore.

    Args:
        texts: Iterable of strings to add to the vectorstore. It is
            materialized into a list exactly once, so one-shot iterators
            (e.g. generators) are supported.
        metadatas: Optional list of metadatas associated with the texts.
        is_duplicate_texts: Optional flag forwarded unchanged to the AwaDB
            client. NOTE(review): earlier docs state the effective default
            is True; confirm against the AwaDB client.
        kwargs: Any possible extend parameters in the future (unused).

    Returns:
        List of ids from adding the texts into the vectorstore.

    Raises:
        ValueError: If the AwaDB client is None.
    """
    if self.awadb_client is None:
        raise ValueError("AwaDB client is None!!!")

    # Materialize once: the texts are needed both for embedding and for the
    # client call, and a one-shot iterator would be exhausted after the
    # first use.
    text_list = list(texts)

    embeddings = None
    if self.using_table_name in self.table2embeddings:
        embedder = self.table2embeddings[self.using_table_name]
        embeddings = embedder.embed_documents(text_list)

    return self.awadb_client.AddTexts(
        "embedding_text",
        "text_embedding",
        text_list,
        embeddings,
        metadatas,
        is_duplicate_texts,
    )
def load_local(
    self,
    table_name: str,
    **kwargs: Any,
) -> bool:
    """Load a locally stored table through the AwaDB client.

    Args:
        table_name: Name of the table to load.
        kwargs: Any possible extend parameters in the future (unused).

    Returns:
        Whatever the client's ``Load`` call reports (success or failure).

    Raises:
        ValueError: If the AwaDB client is None.
    """
    client = self.awadb_client
    if client is None:
        raise ValueError("AwaDB client is None!!!")
    return client.Load(table_name)
def similarity_search(
    self,
    query: str,
    k: int = DEFAULT_TOPN,
    text_in_page_content: Optional[str] = None,
    meta_filter: Optional[dict] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return docs most similar to query.

    The query is embedded with the Embeddings registered for the current
    table, or with awadb's ``AwaEmbedding`` when none is registered, and
    the search is delegated to ``similarity_search_by_vector``.

    Args:
        query: Text query.
        k: The maximum number of documents to return.
        text_in_page_content: Filter by the text in page_content of Document.
        meta_filter (Optional[dict]): Filter by metadata. Defaults to None.
            E.g. `{"color" : "red", "price": 4.20}`.
            E.g. `{"max_price" : 15.66, "min_price": 4.20}`:
            `price` is the metadata field, means range filter
            (4.20<'price'<15.66).
            E.g. `{"maxe_price" : 15.66, "mine_price": 4.20}`:
            `price` is the metadata field, means range filter
            (4.20<='price'<=15.66).
        kwargs: Any possible extend parameters in the future (unused).

    Returns:
        The k most similar documents to the specified text query. The
        fields "text_embedding", "_id" and "score" are left out of each
        document's metadata.

    Raises:
        ValueError: If the AwaDB client is None.
    """
    if self.awadb_client is None:
        raise ValueError("AwaDB client is None!!!")

    table = self.using_table_name
    if table in self.table2embeddings:
        query_vector = self.table2embeddings[table].embed_query(query)
    else:
        # No user-supplied Embeddings for this table: use awadb's embedder.
        from awadb import AwaEmbedding

        query_vector = AwaEmbedding().Embedding(query)

    hidden_fields: Set[str] = {"text_embedding", "_id", "score"}
    return self.similarity_search_by_vector(
        query_vector,
        k,
        text_in_page_content=text_in_page_content,
        meta_filter=meta_filter,
        not_include_fields_in_metadata=hidden_fields,
    )
def similarity_search_with_score(
    self,
    query: str,
    k: int = DEFAULT_TOPN,
    text_in_page_content: Optional[str] = None,
    meta_filter: Optional[dict] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return the k most similar documents together with their scores.

    Args:
        query: Text query.
        k: The k most similar documents to the text query.
        text_in_page_content: Filter by the text in page_content of Document.
        meta_filter: Filter by metadata. Defaults to None.
        kwargs: Any possible extend parameters in the future (unused).

    Returns:
        A list of ``(document, score)`` tuples for the k most similar
        documents. 0 is dissimilar, 1 is the most similar. The "score"
        entry is moved out of each document's metadata into the tuple.

    Raises:
        ValueError: If the AwaDB client is None.
    """
    if self.awadb_client is None:
        raise ValueError("AwaDB client is None!!!")

    table = self.using_table_name
    if table in self.table2embeddings:
        query_vector = self.table2embeddings[table].embed_query(query)
    else:
        # No user-supplied Embeddings for this table: use awadb's embedder.
        from awadb import AwaEmbedding

        query_vector = AwaEmbedding().Embedding(query)

    # "score" is deliberately kept in the metadata here so it can be
    # extracted below.
    hidden_fields: Set[str] = {"text_embedding", "_id"}
    matches = self.similarity_search_by_vector(
        query_vector,
        k,
        text_in_page_content=text_in_page_content,
        meta_filter=meta_filter,
        not_include_fields_in_metadata=hidden_fields,
    )

    scored: List[Tuple[Document, float]] = []
    for match in matches:
        # pop() reads and removes the score in one step.
        scored.append((match, match.metadata.pop("score")))
    return scored
def similarity_search_by_vector(
    self,
    embedding: Optional[List[float]] = None,
    k: int = DEFAULT_TOPN,
    text_in_page_content: Optional[str] = None,
    meta_filter: Optional[dict] = None,
    not_include_fields_in_metadata: Optional[Set[str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return docs most similar to embedding vector.

    Args:
        embedding: Embedding to look up documents similar to. When None,
            an empty list is returned without querying the client.
        k: Number of Documents to return. Defaults to DEFAULT_TOPN.
        text_in_page_content: Filter by the text in page_content of Document.
        meta_filter: Filter by metadata. Defaults to None.
        not_include_fields_in_metadata: Field names to leave out of each
            document's metadata.
        kwargs: Any possible extend parameters in the future (unused).

    Returns:
        List of Documents which are the most similar to the query vector.

    Raises:
        ValueError: If the AwaDB client is None.
    """
    if self.awadb_client is None:
        raise ValueError("AwaDB client is None!!!")
    if embedding is None:
        return []

    excluded = not_include_fields_in_metadata
    hits = self.awadb_client.Search(
        embedding,
        k,
        text_in_page_content=text_in_page_content,
        meta_filter=meta_filter,
        not_include_fields=excluded,
    )
    if len(hits) == 0:
        return []

    documents: List[Document] = []
    # Only the first entry of the search response is read; its
    # "ResultItems" list holds one field mapping per matched document.
    for item in hits[0]["ResultItems"]:
        page_content = ""
        metadata = {}
        for field in item:
            if field == "embedding_text":
                # The stored text becomes the page content, not metadata.
                page_content = item[field]
            elif excluded is None or field not in excluded:
                metadata[field] = item[field]
        documents.append(Document(page_content=page_content, metadata=metadata))
    return documents
def max_marginal_relevance_search(
    self,
    query: str,
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    text_in_page_content: Optional[str] = None,
    meta_filter: Optional[dict] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return docs selected using the maximal marginal relevance.

    Maximal marginal relevance optimizes for similarity to query AND
    diversity among selected documents. The query is embedded first and
    the work is delegated to ``max_marginal_relevance_search_by_vector``.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results with 0 corresponding
            to maximum diversity and 1 to minimum diversity.
            Defaults to 0.5.
        text_in_page_content: Filter by the text in page_content of Document.
        meta_filter (Optional[dict]): Filter by metadata. Defaults to None.
        kwargs: Any possible extend parameters in the future (unused).

    Returns:
        List of Documents selected by maximal marginal relevance. Empty
        when the query embedding has length zero.

    Raises:
        ValueError: If the AwaDB client is None.
    """
    if self.awadb_client is None:
        raise ValueError("AwaDB client is None!!!")

    table = self.using_table_name
    if table in self.table2embeddings:
        query_vector = self.table2embeddings[table].embed_query(query)
    else:
        # No user-supplied Embeddings for this table: use awadb's embedder.
        from awadb import AwaEmbedding

        query_vector = AwaEmbedding().Embedding(query)

    # Nothing to search with if the embedder produced an empty vector.
    if len(query_vector) == 0:
        return []

    return self.max_marginal_relevance_search_by_vector(
        query_vector,
        k,
        fetch_k,
        lambda_mult=lambda_mult,
        text_in_page_content=text_in_page_content,
        meta_filter=meta_filter,
    )
def max_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    text_in_page_content: Optional[str] = None,
    meta_filter: Optional[dict] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return docs selected using the maximal marginal relevance.

    Maximal marginal relevance optimizes for similarity to query AND
    diversity among selected documents.

    Args:
        embedding: Embedding to look up documents similar to. When None,
            an empty list is returned.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results with 0 corresponding
            to maximum diversity and 1 to minimum diversity.
            Defaults to 0.5.
        text_in_page_content: Filter by the text in page_content of Document.
        meta_filter (Optional[dict]): Filter by metadata. Defaults to None.
        kwargs: Any possible extend parameters in the future (unused).

    Returns:
        List of Documents selected by maximal marginal relevance, with
        "text_embedding" removed from each document's metadata.

    Raises:
        ValueError: If the AwaDB client is None.
    """
    if self.awadb_client is None:
        raise ValueError("AwaDB client is None!!!")
    if embedding is None:
        return []

    # "text_embedding" is intentionally NOT excluded here: the candidate
    # vectors are needed for the MMR computation below.
    not_include_fields: set = {"_id", "score"}
    retrieved_docs = self.similarity_search_by_vector(
        embedding,
        fetch_k,
        text_in_page_content=text_in_page_content,
        meta_filter=meta_filter,
        not_include_fields_in_metadata=not_include_fields,
    )

    top_embeddings = [doc.metadata["text_embedding"] for doc in retrieved_docs]

    # Forward the caller's k and lambda_mult; previously both were dropped,
    # so the helper always ran with its own defaults.
    selected_indices = maximal_marginal_relevance(
        np.array(embedding, dtype=np.float32),
        embedding_list=top_embeddings,
        lambda_mult=lambda_mult,
        k=k,
    )

    results: List[Document] = []
    for index in selected_indices:
        selected = retrieved_docs[index]
        # The raw vector was only needed for ranking; do not return it.
        selected.metadata.pop("text_embedding", None)
        results.append(selected)
    return results
def get(
    self,
    ids: Optional[List[str]] = None,
    text_in_page_content: Optional[str] = None,
    meta_filter: Optional[dict] = None,
    not_include_fields: Optional[Set[str]] = None,
    limit: Optional[int] = None,
    **kwargs: Any,
) -> Dict[str, Document]:
    """Return docs matching the given ids and/or filters.

    Args:
        ids: The ids of the embedding vectors.
        text_in_page_content: Filter by the text in page_content of Document.
        meta_filter: Filter by any metadata of the document.
        not_include_fields: Not pack the specified fields of each document.
        limit: The number of documents to return. Passed to the client
            as given (None by default); NOTE(review): earlier docs mention
            a client-side default of 5, confirm against AwaDB.
        kwargs: Any possible extend parameters in the future (unused).

    Returns:
        Mapping from each document's "_id" to the Document built from it.
        "text_embedding" and "_id" never appear in the metadata.

    Raises:
        ValueError: If the AwaDB client is None.
    """
    if self.awadb_client is None:
        raise ValueError("AwaDB client is None!!!")

    records = self.awadb_client.Get(
        ids=ids,
        text_in_page_content=text_in_page_content,
        meta_filter=meta_filter,
        not_include_fields=not_include_fields,
        limit=limit,
    )

    # Fields that are never copied into the metadata.
    skipped = ("text_embedding", "_id")

    documents: Dict[str, Document] = {}
    for record in records:
        page_content = ""
        metadata = {}
        for field in record:
            if field == "embedding_text":
                # The stored text becomes the page content, not metadata.
                page_content = record[field]
            elif field not in skipped:
                metadata[field] = record[field]
        documents[record["_id"]] = Document(
            page_content=page_content, metadata=metadata
        )
    return documents
def delete(
    self,
    ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> Optional[bool]:
    """Delete the documents which have the specified ids.

    Args:
        ids: The ids of the embedding vectors.
        **kwargs: Other keyword arguments that subclasses might use.

    Returns:
        Optional[bool]: The result of the client's ``Delete`` call (True if
        deletion is successful, False otherwise), or None when no ids
        were given and nothing was attempted.

    Raises:
        ValueError: If the AwaDB client is None.
    """
    if self.awadb_client is None:
        raise ValueError("AwaDB client is None!!!")

    # Nothing to delete: report "not attempted" rather than calling the client.
    if ids is None or len(ids) == 0:
        return None

    return self.awadb_client.Delete(ids)
def update(
    self,
    ids: List[str],
    texts: Iterable[str],
    metadatas: Optional[List[dict]] = None,
    **kwargs: Any,
) -> List[str]:
    """Update the documents which have the specified ids.

    Args:
        ids: The id list of the updating embedding vector.
        texts: The texts of the updating documents.
        metadatas: The metadatas of the updating documents.
        kwargs: Any possible extend parameters in the future (unused).

    Returns:
        The ids of the updated documents, as reported by the client.

    Raises:
        ValueError: If the AwaDB client is None.
    """
    client = self.awadb_client
    if client is None:
        raise ValueError("AwaDB client is None!!!")

    # The texts are written to the same field that add_texts populates.
    updated_ids = client.UpdateTexts(
        ids=ids,
        text_field_name="embedding_text",
        texts=texts,
        metadatas=metadatas,
    )
    return updated_ids
def create_table(
    self,
    table_name: str,
    **kwargs: Any,
) -> bool:
    """Create a new table and, on success, make it the current table.

    Args:
        table_name: Name of the table to create.
        kwargs: Any possible extend parameters in the future (unused).

    Returns:
        The result of the client's ``Create`` call, or False when the
        AwaDB client is None.
    """
    client = self.awadb_client
    if client is None:
        return False

    created = client.Create(table_name)
    if created:
        # Switch to the new table only when creation succeeded.
        self.using_table_name = table_name
    return created
def use(
    self,
    table_name: str,
    **kwargs: Any,
) -> bool:
    """Switch to the specified table.

    If you do not know which tables exist, call ``list_tables`` first.

    Args:
        table_name: Name of the table to switch to.
        kwargs: Any possible extend parameters in the future (unused).

    Returns:
        The result of the client's ``Use`` call, or False when the AwaDB
        client is None.
    """
    client = self.awadb_client
    if client is None:
        return False

    switched = client.Use(table_name)
    if switched:
        # Track the new current table only when the client accepted it.
        self.using_table_name = table_name
    return switched
def list_tables(
    self,
    **kwargs: Any,
) -> List[str]:
    """List all the tables created by the client.

    Args:
        kwargs: Any possible extend parameters in the future (unused).

    Returns:
        The table names reported by the client's ``ListAllTables`` call,
        or an empty list when the AwaDB client is None.
    """
    client = self.awadb_client
    return [] if client is None else client.ListAllTables()
def get_current_table(
    self,
    **kwargs: Any,
) -> str:
    """Return the name of the table currently in use.

    Args:
        kwargs: Any possible extend parameters in the future (unused).

    Returns:
        The value of ``self.using_table_name``.
    """
    current_table = self.using_table_name
    return current_table
@classmethod
def from_texts(
    cls: Type[AwaDB],
    texts: List[str],
    embedding: Optional[Embeddings] = None,
    metadatas: Optional[List[dict]] = None,
    table_name: str = _DEFAULT_TABLE_NAME,
    log_and_data_dir: Optional[str] = None,
    client: Optional[awadb.Client] = None,
    **kwargs: Any,
) -> AwaDB:
    """Create an AwaDB vectorstore from raw texts.

    Args:
        texts (List[str]): List of texts to add to the table.
        embedding (Optional[Embeddings]): Embedding function. Defaults to None.
        metadatas (Optional[List[dict]]): List of metadatas. Defaults to None.
        table_name (str): Name of the table to create.
        log_and_data_dir (Optional[str]): Directory of logging and persistence.
        client (Optional[awadb.Client]): AwaDB client.
        kwargs: Any possible extend parameters in the future (unused).

    Returns:
        AwaDB: AwaDB vectorstore with the texts already added.
    """
    # Build the store first, then load the texts into its table.
    store = cls(
        table_name=table_name,
        embedding=embedding,
        log_and_data_dir=log_and_data_dir,
        client=client,
    )
    store.add_texts(texts=texts, metadatas=metadatas)
    return store
@classmethod
def from_documents(
    cls: Type[AwaDB],
    documents: List[Document],
    embedding: Optional[Embeddings] = None,
    table_name: str = _DEFAULT_TABLE_NAME,
    log_and_data_dir: Optional[str] = None,
    client: Optional[awadb.Client] = None,
    **kwargs: Any,
) -> AwaDB:
    """Create an AwaDB vectorstore from a list of documents.

    If a log_and_data_dir is specified, the table will be persisted there.

    Args:
        documents (List[Document]): List of documents to add to the vectorstore.
        embedding (Optional[Embeddings]): Embedding function. Defaults to None.
        table_name (str): Name of the table to create.
        log_and_data_dir (Optional[str]): Directory to persist the table.
        client (Optional[awadb.Client]): AwaDB client.
        kwargs: Any possible extend parameters in the future (unused).

    Returns:
        AwaDB: AwaDB vectorstore.
    """
    # Split each document into its text and its metadata, kept in step.
    page_texts: List[str] = []
    doc_metadatas: List[dict] = []
    for document in documents:
        page_texts.append(document.page_content)
        doc_metadatas.append(document.metadata)

    return cls.from_texts(
        texts=page_texts,
        embedding=embedding,
        metadatas=doc_metadatas,
        table_name=table_name,
        log_and_data_dir=log_and_data_dir,
        client=client,
    )