def dependable_annoy_import() -> Any:
    """Import and return the ``annoy`` package.

    The import is delegated to ``guard_import``; per this function's
    contract an error is raised when ``annoy`` is not available.

    Returns:
        Whatever ``guard_import("annoy")`` returns.
    """
    annoy_module = guard_import("annoy")
    return annoy_module
class Annoy(VectorStore):
    """`Annoy` vector store.

    To use, you should have the ``annoy`` python package installed.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import Annoy
            db = Annoy(
                embedding_function, index, metric, docstore, index_to_docstore_id
            )

    """
def __init__(
    self,
    embedding_function: Callable,
    index: Any,
    metric: str,
    docstore: Docstore,
    index_to_docstore_id: Dict[int, str],
):
    """Store the components the vector store is built from.

    Args:
        embedding_function: Callable applied to a query string to obtain
            the embedding that is searched for.
        index: Index object that the search methods query.
        metric: Name of the distance metric kept alongside ``index``.
        docstore: Store whose ``search`` method resolves a docstore id.
        index_to_docstore_id: Mapping from an index item number to the
            docstore id of the corresponding document.
    """
    # Document lookup side.
    self.index_to_docstore_id = index_to_docstore_id
    self.docstore = docstore
    # Search side.
    self.metric = metric
    self.index = index
    self.embedding_function = embedding_function
def add_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[dict]] = None,
    **kwargs: Any,
) -> List[str]:
    """Reject any attempt to add texts to the store.

    None of the arguments are inspected; the method raises unconditionally.

    Args:
        texts: Unused.
        metadatas: Unused.
        **kwargs: Unused.

    Raises:
        NotImplementedError: Always.
    """
    message = "Annoy does not allow to add new data once the index is build."
    raise NotImplementedError(message)
def process_index_results(
    self, idxs: List[int], dists: List[float]
) -> List[Tuple[Document, float]]:
    """Pair each index hit with its document and distance.

    Args:
        idxs: Item numbers returned by the index.
        dists: Distances matching ``idxs`` position by position.

    Returns:
        A list of ``(document, distance)`` tuples in the order of ``idxs``.

    Raises:
        ValueError: If the docstore lookup for a hit does not return a
            ``Document``.
    """
    results = []
    for item_no, distance in zip(idxs, dists):
        docstore_id = self.index_to_docstore_id[item_no]
        found = self.docstore.search(docstore_id)
        if isinstance(found, Document):
            results.append((found, distance))
            continue
        # Anything other than a Document means the lookup failed.
        raise ValueError(f"Could not find document for id {docstore_id}, got {found}")
    return results
def similarity_search_with_score_by_vector(
    self, embedding: List[float], k: int = 4, search_k: int = -1
) -> List[Tuple[Document, float]]:
    """Return the docs nearest to an embedding vector, with distances.

    Args:
        embedding: Embedding vector to look up neighbours for.
        k: Number of Documents to return. Defaults to 4.
        search_k: inspect up to search_k nodes which defaults
            to n_trees * n if not provided

    Returns:
        List of ``(Document, distance)`` tuples produced by
        ``process_index_results`` from the index hits.
    """
    neighbours = self.index.get_nns_by_vector(
        embedding, k, search_k=search_k, include_distances=True
    )
    # The index returns a pair: (item numbers, distances).
    item_numbers, distances = neighbours
    return self.process_index_results(item_numbers, distances)
def similarity_search_with_score_by_index(
    self, docstore_index: int, k: int = 4, search_k: int = -1
) -> List[Tuple[Document, float]]:
    """Return the docs nearest to an already indexed item, with distances.

    Args:
        docstore_index: Item number in the index whose neighbours are
            looked up.
        k: Number of Documents to return. Defaults to 4.
        search_k: inspect up to search_k nodes which defaults
            to n_trees * n if not provided

    Returns:
        List of ``(Document, distance)`` tuples produced by
        ``process_index_results`` from the index hits.
    """
    neighbours = self.index.get_nns_by_item(
        docstore_index, k, search_k=search_k, include_distances=True
    )
    # The index returns a pair: (item numbers, distances).
    item_numbers, distances = neighbours
    return self.process_index_results(item_numbers, distances)
def similarity_search_with_score(
    self, query: str, k: int = 4, search_k: int = -1
) -> List[Tuple[Document, float]]:
    """Return the docs nearest to a text query, with distances.

    The query is embedded with ``self.embedding_function`` and the search is
    delegated to ``similarity_search_with_score_by_vector``.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        search_k: inspect up to search_k nodes which defaults
            to n_trees * n if not provided

    Returns:
        List of Documents most similar to the query and score for each
    """
    query_vector = self.embedding_function(query)
    return self.similarity_search_with_score_by_vector(query_vector, k, search_k)
def similarity_search_by_vector(
    self, embedding: List[float], k: int = 4, search_k: int = -1, **kwargs: Any
) -> List[Document]:
    """Return the docs nearest to an embedding vector, without distances.

    Args:
        embedding: Embedding to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        search_k: inspect up to search_k nodes which defaults
            to n_trees * n if not provided
        **kwargs: Accepted but not used by this method.

    Returns:
        List of Documents most similar to the embedding.
    """
    scored = self.similarity_search_with_score_by_vector(embedding, k, search_k)
    # Keep only the document of each (document, distance) pair.
    return [document for document, _distance in scored]
def similarity_search_by_index(
    self, docstore_index: int, k: int = 4, search_k: int = -1, **kwargs: Any
) -> List[Document]:
    """Return the docs nearest to an already indexed item, without distances.

    Args:
        docstore_index: Index of document in docstore
        k: Number of Documents to return. Defaults to 4.
        search_k: inspect up to search_k nodes which defaults
            to n_trees * n if not provided
        **kwargs: Accepted but not used by this method.

    Returns:
        List of Documents most similar to the item at ``docstore_index``.
    """
    scored = self.similarity_search_with_score_by_index(docstore_index, k, search_k)
    # Keep only the document of each (document, distance) pair.
    return [document for document, _distance in scored]
def similarity_search(
    self, query: str, k: int = 4, search_k: int = -1, **kwargs: Any
) -> List[Document]:
    """Return the docs nearest to a text query, without distances.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        search_k: inspect up to search_k nodes which defaults
            to n_trees * n if not provided
        **kwargs: Accepted but not used by this method.

    Returns:
        List of Documents most similar to the query.
    """
    scored = self.similarity_search_with_score(query, k, search_k)
    # Keep only the document of each (document, distance) pair.
    return [document for document, _distance in scored]
def max_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    search_k: int = -1,
    **kwargs: Any,
) -> List[Document]:
    """Return docs selected using the maximal marginal relevance.

    Maximal marginal relevance optimizes for similarity to query AND diversity
    among selected documents.

    Args:
        embedding: Embedding to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results with 0 corresponding
            to maximum diversity and 1 to minimum diversity.
            Defaults to 0.5.
        search_k: Keyword-only. Passed to the index when fetching the
            ``fetch_k`` candidates, like the ``search_k`` argument of the
            similarity search methods. Defaults to -1, the value that was
            previously hard-coded here.
        **kwargs: Accepted but not used by this method.

    Returns:
        List of Documents selected by maximal marginal relevance.

    Raises:
        ValueError: If the docstore lookup for a selected item does not
            return a ``Document``.
    """
    candidate_idxs = self.index.get_nns_by_vector(
        embedding, fetch_k, search_k=search_k, include_distances=False
    )
    candidate_vectors = [self.index.get_item_vector(i) for i in candidate_idxs]
    mmr_selected = maximal_marginal_relevance(
        np.array([embedding], dtype=np.float32),
        candidate_vectors,
        k=k,
        lambda_mult=lambda_mult,
    )
    # ignore the -1's if not enough docs are returned/indexed
    selected_indices = [candidate_idxs[i] for i in mmr_selected if i != -1]

    docs = []
    for item_no in selected_indices:
        _id = self.index_to_docstore_id[item_no]
        doc = self.docstore.search(_id)
        if not isinstance(doc, Document):
            raise ValueError(f"Could not find document for id {_id}, got {doc}")
        docs.append(doc)
    return docs
def max_marginal_relevance_search(
    self,
    query: str,
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    **kwargs: Any,
) -> List[Document]:
    """Return docs selected using the maximal marginal relevance.

    Maximal marginal relevance optimizes for similarity to query AND diversity
    among selected documents.

    The query is embedded with ``self.embedding_function`` and the search is
    delegated to ``max_marginal_relevance_search_by_vector``.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results with 0 corresponding
            to maximum diversity and 1 to minimum diversity.
            Defaults to 0.5.
        **kwargs: Extra keyword arguments, forwarded unchanged to
            ``max_marginal_relevance_search_by_vector`` instead of being
            silently dropped.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    embedding = self.embedding_function(query)
    return self.max_marginal_relevance_search_by_vector(
        embedding, k, fetch_k, lambda_mult=lambda_mult, **kwargs
    )
@classmethod
def __from(
    cls,
    texts: List[str],
    embeddings: List[List[float]],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    metric: str = DEFAULT_METRIC,
    trees: int = 100,
    n_jobs: int = -1,
    **kwargs: Any,
) -> Annoy:
    """Build the index and in-memory docstore shared by the constructors.

    Args:
        texts: Texts to wrap in documents, one per index item.
        embeddings: Vectors added to the index under their list position.
        embedding: Embeddings object whose ``embed_query`` becomes the
            store's embedding function.
        metadatas: Optional per-text metadata; an empty dict is used for
            every document when this is falsy.
        metric: Distance metric; must be a member of ``INDEX_METRICS``.
        trees: Passed to ``index.build``.
        n_jobs: Passed to ``index.build``.
        **kwargs: Accepted but not used by this method.

    Returns:
        A new instance wrapping the built index and docstore.

    Raises:
        ValueError: If ``metric`` is unsupported or ``embeddings`` is empty.
    """
    if metric not in INDEX_METRICS:
        supported = list(INDEX_METRICS)
        raise ValueError(
            f"Unsupported distance metric: {metric}. "
            f"Expected one of {supported}"
        )
    annoy_module = guard_import("annoy")
    if not embeddings:
        raise ValueError("embeddings must be provided to build AnnoyIndex")

    # The dimensionality is taken from the first vector.
    dimension = len(embeddings[0])
    annoy_index = annoy_module.AnnoyIndex(dimension, metric=metric)
    for item_no, vector in enumerate(embeddings):
        annoy_index.add_item(item_no, vector)
    annoy_index.build(trees, n_jobs=n_jobs)

    documents = [
        Document(page_content=text, metadata=metadatas[pos] if metadatas else {})
        for pos, text in enumerate(texts)
    ]

    # Give every document a fresh uuid and record it under its position.
    index_to_id = {}
    docs_by_id = {}
    for pos, document in enumerate(documents):
        doc_id = str(uuid.uuid4())
        index_to_id[pos] = doc_id
        docs_by_id[doc_id] = document
    docstore = InMemoryDocstore(docs_by_id)
    return cls(embedding.embed_query, annoy_index, metric, docstore, index_to_id)
@classmethod
def from_texts(
    cls,
    texts: List[str],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    metric: str = DEFAULT_METRIC,
    trees: int = 100,
    n_jobs: int = -1,
    **kwargs: Any,
) -> Annoy:
    """Construct Annoy wrapper from raw documents.

    This is a user friendly interface that:
        1. Embeds documents.
        2. Creates an in memory docstore
        3. Initializes the Annoy database

    This is intended to be a quick way to get started.

    Args:
        texts: List of documents to index.
        embedding: Embedding function to use.
        metadatas: List of metadata dictionaries to associate with documents.
        metric: Metric to use for indexing. Defaults to "angular".
        trees: Number of trees to use for indexing. Defaults to 100.
        n_jobs: Number of jobs to use for indexing. Defaults to -1.
        **kwargs: Passed through to the shared builder.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import Annoy
            from langchain_community.embeddings import OpenAIEmbeddings
            embeddings = OpenAIEmbeddings()
            index = Annoy.from_texts(texts, embeddings)
    """
    vectors = embedding.embed_documents(texts)
    return cls.__from(
        texts,
        vectors,
        embedding,
        metadatas,
        metric,
        trees,
        n_jobs,
        **kwargs,
    )
@classmethod
def from_embeddings(
    cls,
    text_embeddings: List[Tuple[str, List[float]]],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    metric: str = DEFAULT_METRIC,
    trees: int = 100,
    n_jobs: int = -1,
    **kwargs: Any,
) -> Annoy:
    """Construct Annoy wrapper from embeddings.

    This is a user friendly interface that:
        1. Creates an in memory docstore with provided embeddings
        2. Initializes the Annoy database

    This is intended to be a quick way to get started.

    Args:
        text_embeddings: List of tuples of (text, embedding)
        embedding: Embedding function to use.
        metadatas: List of metadata dictionaries to associate with documents.
        metric: Metric to use for indexing. Defaults to "angular".
        trees: Number of trees to use for indexing. Defaults to 100.
        n_jobs: Number of jobs to use for indexing. Defaults to -1
        **kwargs: Passed through to the shared builder.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import Annoy
            from langchain_community.embeddings import OpenAIEmbeddings
            embeddings = OpenAIEmbeddings()
            text_embeddings = embeddings.embed_documents(texts)
            text_embedding_pairs = list(zip(texts, text_embeddings))
            db = Annoy.from_embeddings(text_embedding_pairs, embeddings)
    """
    # Split the pairs into two parallel lists in a single pass.
    texts = []
    vectors = []
    for pair in text_embeddings:
        texts.append(pair[0])
        vectors.append(pair[1])
    return cls.__from(
        texts, vectors, embedding, metadatas, metric, trees, n_jobs, **kwargs
    )
def save_local(self, folder_path: str, prefault: bool = False) -> None:
    """Save Annoy index, docstore, and index_to_docstore_id to disk.

    Two files are written into ``folder_path``: ``index.annoy`` (via the
    index's own ``save``) and ``index.pkl`` (a pickled tuple of the docstore,
    the id mapping and a ``ConfigParser`` holding ``f`` and ``metric``).

    Args:
        folder_path: folder path to save index, docstore,
            and index_to_docstore_id to.
        prefault: Whether to pre-load the index into memory.
    """
    target_dir = Path(folder_path)
    os.makedirs(target_dir, exist_ok=True)

    # The index itself is written by annoy, separately from the pickle.
    index_file = str(target_dir / "index.annoy")
    self.index.save(index_file, prefault=prefault)

    # Index settings travel with the pickle so the index can be re-created.
    settings = ConfigParser()
    settings["ANNOY"] = {"f": self.index.f, "metric": self.metric}

    payload = (self.docstore, self.index_to_docstore_id, settings)
    with open(target_dir / "index.pkl", "wb") as pickle_file:
        pickle.dump(payload, pickle_file)
@classmethod
def load_local(
    cls,
    folder_path: str,
    embeddings: Embeddings,
    *,
    allow_dangerous_deserialization: bool = False,
) -> Annoy:
    """Load Annoy index, docstore, and index_to_docstore_id from disk.

    Args:
        folder_path: folder path to load index, docstore,
            and index_to_docstore_id from.
        embeddings: Embeddings to use when generating queries.
        allow_dangerous_deserialization: whether to allow deserialization
            of the data which involves loading a pickle file.
            Pickle files can be modified by malicious actors to deliver a
            malicious payload that results in execution of
            arbitrary code on your machine.

    Returns:
        A new instance built from ``index.annoy`` and ``index.pkl`` found in
        ``folder_path``.

    Raises:
        ValueError: If ``allow_dangerous_deserialization`` is not set.
    """
    if not allow_dangerous_deserialization:
        opt_in_message = (
            "The de-serialization relies loading a pickle file. "
            "Pickle files can be modified to deliver a malicious payload that "
            "results in execution of arbitrary code on your machine."
            "You will need to set `allow_dangerous_deserialization` to `True` to "
            "enable deserialization. If you do this, make sure that you "
            "trust the source of the data. For example, if you are loading a "
            "file that you created, and know that no one else has modified the "
            "file, then this is safe to do. Do not set this to `True` if you are "
            "loading a file from an untrusted source (e.g., some random site on "
            "the internet.)."
        )
        raise ValueError(opt_in_message)

    source_dir = Path(folder_path)
    # The index is loaded separately since it is not picklable.
    annoy_module = guard_import("annoy")

    # Docstore, id mapping and index settings all come from the pickle.
    with open(source_dir / "index.pkl", "rb") as pickle_file:
        # Only reachable when allow_dangerous_deserialization is True.
        payload = pickle.load(  # ignore[pickle]: explicit-opt-in
            pickle_file
        )
    docstore, index_to_docstore_id, config_object = payload

    annoy_settings = config_object["ANNOY"]
    dimension = int(annoy_settings["f"])
    metric = annoy_settings["metric"]

    index = annoy_module.AnnoyIndex(dimension, metric=metric)
    index.load(str(source_dir / "index.annoy"))
    return cls(embeddings.embed_query, index, metric, docstore, index_to_docstore_id)