Source code for langchain_community.vectorstores.docarray.base
from abc import ABC
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Type

import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore
from pydantic import Field

from langchain_community.vectorstores.utils import maximal_marginal_relevance

if TYPE_CHECKING:
    from docarray import BaseDoc
    from docarray.index.abstract import BaseDocIndex


def _check_docarray_import() -> None:
    """Verify that a supported version of `docarray` is installed.

    Raises:
        ImportError: If `docarray` cannot be imported, or if the installed
            version is older than 0.32.0. The two situations produce
            different messages.
    """
    # Only the import itself lives inside the `try`. If the version check were
    # inside it too, the "version too old" ImportError would be caught by the
    # handler below and replaced with the misleading "could not import" message.
    try:
        import docarray
    except ImportError as exc:
        raise ImportError(
            "Could not import docarray python package. "
            "Please install it with `pip install docarray`."
        ) from exc

    da_version = docarray.__version__.split(".")
    if int(da_version[0]) == 0 and int(da_version[1]) <= 31:
        raise ImportError(
            f"To use the DocArrayHnswSearch VectorStore the docarray "
            f"version >=0.32.0 is expected, received: {docarray.__version__}. "
            f"To upgrade, please run: `pip install -U docarray`."
        )
class DocArrayIndex(VectorStore, ABC):
    """Base class for `DocArray` based vector stores."""

    def __init__(
        self,
        doc_index: "BaseDocIndex",
        embedding: Embeddings,
    ):
        """Initialize a vector store from DocArray's DocIndex.

        Args:
            doc_index: DocArray document index used to store and search documents.
            embedding: Embedding model used to embed texts and queries.
        """
        self.doc_index = doc_index
        self.embedding = embedding

    @staticmethod
    def _get_doc_cls(**embeddings_params: Any) -> Type["BaseDoc"]:
        """Get docarray Document class describing the schema of DocIndex.

        Args:
            **embeddings_params: Keyword arguments forwarded to the pydantic
                ``Field`` that declares the ``embedding`` attribute.

        Returns:
            A ``BaseDoc`` subclass with optional ``text``, ``embedding`` and
            ``metadata`` fields.
        """
        # Imported lazily so that docarray stays an optional dependency.
        from docarray import BaseDoc
        from docarray.typing import NdArray

        class DocArrayDoc(BaseDoc):
            text: Optional[str] = Field(default=None)
            embedding: Optional[NdArray] = Field(**embeddings_params)
            metadata: Optional[dict] = Field(default=None)

        return DocArrayDoc

    @property
    def doc_cls(self) -> Type["BaseDoc"]:
        """Document class (schema) of the wrapped index.

        Raises:
            ValueError: If the index has no schema (``_schema`` is ``None``).
        """
        if self.doc_index._schema is None:
            raise ValueError("doc_index expected to have non-null _schema attribute.")
        return self.doc_index._schema

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed texts and add to the vector store.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        # Materialize once: `texts` may be a one-shot iterator (e.g. a
        # generator). It is needed twice, for embedding and for building the
        # documents, and a second pass over an exhausted iterator would
        # silently add nothing.
        text_list = list(texts)
        embeddings = self.embedding.embed_documents(text_list)

        ids: List[str] = []
        for i, (text, vector) in enumerate(zip(text_list, embeddings)):
            metadata = metadatas[i] if metadatas else {}
            doc = self.doc_cls(text=text, embedding=vector, metadata=metadata)
            self.doc_index.index([doc])
            ids.append(str(doc.id))
        return ids

    def similarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of documents most similar to the query text, each paired with
            the score reported by the index. The original documentation
            describes the score as a cosine distance where lower means more
            similar; the actual metric presumably depends on how ``doc_index``
            is configured (TODO confirm).
        """
        query_embedding = self.embedding.embed_query(query)
        query_doc = self.doc_cls(embedding=query_embedding)  # type: ignore
        docs, scores = self.doc_index.find(
            query_doc, search_field="embedding", limit=k
        )
        return [
            (Document(page_content=doc.text, metadata=doc.metadata), score)
            for doc, score in zip(docs, scores)
        ]

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query.
        """
        results = self.similarity_search_with_score(query, k=k, **kwargs)
        return [doc for doc, _ in results]

    def _similarity_search_with_relevance_scores(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return docs and relevance scores, normalized on a scale from 0 to 1.

        0 is dissimilar, 1 is most similar.

        Raises:
            NotImplementedError: Always; this base class does not implement
                relevance scores.
        """
        raise NotImplementedError()

    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query vector.
        """
        query_doc = self.doc_cls(embedding=embedding)  # type: ignore
        docs = self.doc_index.find(
            query_doc, search_field="embedding", limit=k
        ).documents
        return [
            Document(page_content=doc.text, metadata=doc.metadata) for doc in docs
        ]

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND
        diversity among selected documents.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                of diversity among the results with 0 corresponding
                to maximum diversity and 1 to minimum diversity.
                Defaults to 0.5.
                NOTE(review): this value is accepted but not forwarded to
                ``maximal_marginal_relevance`` in the call below, so the
                helper's own default applies - confirm whether intended.

        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        query_embedding = self.embedding.embed_query(query)
        query_doc = self.doc_cls(embedding=query_embedding)  # type: ignore
        # Fetch a wider candidate pool (fetch_k) than is finally returned (k).
        docs = self.doc_index.find(
            query_doc, search_field="embedding", limit=fetch_k
        ).documents

        mmr_selected = maximal_marginal_relevance(
            np.array(query_embedding), docs.embedding, k=k
        )
        return [
            Document(page_content=docs[idx].text, metadata=docs[idx].metadata)
            for idx in mmr_selected
        ]