[docs]classKDBAI(VectorStore):"""`KDB.AI` vector store. See https://kdb.ai. To use, you should have the `kdbai_client` python package installed. Args: table: kdbai_client.Table object to use as storage, embedding: Any embedding function implementing `langchain.embeddings.base.Embeddings` interface, distance_strategy: One option from DistanceStrategy.EUCLIDEAN_DISTANCE, DistanceStrategy.DOT_PRODUCT or DistanceStrategy.COSINE. See the example [notebook](https://github.com/KxSystems/langchain/blob/KDB.AI/docs/docs/integrations/vectorstores/kdbai.ipynb). """
[docs]def__init__(self,table:Any,embedding:Embeddings,distance_strategy:Optional[DistanceStrategy]=DistanceStrategy.EUCLIDEAN_DISTANCE,):try:importkdbai_client# noqaexceptImportError:raiseImportError("Could not import kdbai_client python package. ""Please install it with `pip install kdbai_client`.")self._table=tableself._embedding=embeddingself.distance_strategy=distance_strategy
@propertydefembeddings(self)->Optional[Embeddings]:ifisinstance(self._embedding,Embeddings):returnself._embeddingreturnNonedef_embed_documents(self,texts:Iterable[str])->List[List[float]]:ifisinstance(self._embedding,Embeddings):returnself._embedding.embed_documents(list(texts))return[self._embedding(t)fortintexts]def_embed_query(self,text:str)->List[float]:ifisinstance(self._embedding,Embeddings):returnself._embedding.embed_query(text)returnself._embedding(text)def_insert(self,texts:List[str],ids:Optional[List[str]],metadata:Optional[Any]=None,)->None:try:importnumpyasnpexceptImportError:raiseImportError("Could not import numpy python package. ""Please install it with `pip install numpy`.")try:importpandasaspdexceptImportError:raiseImportError("Could not import pandas python package. ""Please install it with `pip install pandas`.")embeds=self._embedding.embed_documents(texts)df=pd.DataFrame()df["id"]=idsdf["text"]=[t.encode("utf-8")fortintexts]df["embeddings"]=[np.array(e,dtype="float32")foreinembeds]ifmetadataisnotNone:df=pd.concat([df,metadata],axis=1)self._table.insert(df,warn=False)
[docs]defadd_texts(self,texts:Iterable[str],metadatas:Optional[List[dict]]=None,ids:Optional[List[str]]=None,batch_size:int=32,**kwargs:Any,)->List[str]:"""Run more texts through the embeddings and add to the vectorstore. Args: texts (Iterable[str]): Texts to add to the vectorstore. metadatas (Optional[List[dict]]): List of metadata corresponding to each chunk of text. ids (Optional[List[str]]): List of IDs corresponding to each chunk of text. batch_size (Optional[int]): Size of batch of chunks of text to insert at once. Returns: List[str]: List of IDs of the added texts. """try:importpandasaspdexceptImportError:raiseImportError("Could not import pandas python package. ""Please install it with `pip install pandas`.")texts=list(texts)metadf:pd.DataFrame=NoneifmetadatasisnotNone:ifisinstance(metadatas,pd.DataFrame):metadf=metadataselse:metadf=pd.DataFrame(metadatas)out_ids:List[str]=[]nbatches=(len(texts)-1)//batch_size+1foriinrange(nbatches):istart=i*batch_sizeiend=(i+1)*batch_sizebatch=texts[istart:iend]ifids:batch_ids=ids[istart:iend]else:batch_ids=[str(uuid.uuid4())for_inrange(len(batch))]ifmetadfisnotNone:batch_meta=metadf.iloc[istart:iend].reset_index(drop=True)else:batch_meta=Noneself._insert(batch,batch_ids,batch_meta)out_ids=out_ids+batch_idsreturnout_ids
[docs]defadd_documents(self,documents:List[Document],batch_size:int=32,**kwargs:Any)->List[str]:"""Run more documents through the embeddings and add to the vectorstore. Args: documents (List[Document]: Documents to add to the vectorstore. batch_size (Optional[int]): Size of batch of documents to insert at once. Returns: List[str]: List of IDs of the added texts. """try:importpandasaspdexceptImportError:raiseImportError("Could not import pandas python package. ""Please install it with `pip install pandas`.")texts=[x.page_contentforxindocuments]metadata=pd.DataFrame([x.metadataforxindocuments])returnself.add_texts(texts,metadata=metadata,batch_size=batch_size)
[docs]defsimilarity_search_with_score(self,query:str,k:int=1,filter:Optional[List]=[],**kwargs:Any,)->List[Tuple[Document,float]]:"""Run similarity search with distance from a query string. Args: query (str): Query string. k (Optional[int]): number of neighbors to retrieve. filter (Optional[List]): KDB.AI metadata filter clause: https://code.kx.com/kdbai/use/filter.html Returns: List[Document]: List of similar documents. """returnself.similarity_search_by_vector_with_score(self._embed_query(query),k=k,filter=filter,**kwargs)
[docs]defsimilarity_search_by_vector_with_score(self,embedding:List[float],*,k:int=1,filter:Optional[List]=[],**kwargs:Any,)->List[Tuple[Document,float]]:"""Return documents most similar to embedding, along with scores. Args: embedding (List[float]): query vector. k (Optional[int]): number of neighbors to retrieve. filter (Optional[List]): KDB.AI metadata filter clause: https://code.kx.com/kdbai/use/filter.html Returns: List[Document]: List of similar documents. """if"n"inkwargs:k=kwargs.pop("n")matches=self._table.search(vectors=[embedding],n=k,filter=filter,**kwargs)docs:list=[]ifisinstance(matches,list):matches=matches[0]else:returndocsforrowinmatches.to_dict(orient="records"):text=row.pop("text")score=row.pop("__nn_distance")docs.append((Document(page_content=text,metadata={k:vfork,vinrow.items()ifk!="text"},),score,))returndocs
[docs]defsimilarity_search(self,query:str,k:int=1,filter:Optional[List]=[],**kwargs:Any,)->List[Document]:"""Run similarity search from a query string. Args: query (str): Query string. k (Optional[int]): number of neighbors to retrieve. filter (Optional[List]): KDB.AI metadata filter clause: https://code.kx.com/kdbai/use/filter.html Returns: List[Document]: List of similar documents. """docs_and_scores=self.similarity_search_with_score(query,k=k,filter=filter,**kwargs)return[docfordoc,_indocs_and_scores]