def normalize(x: np.ndarray) -> np.ndarray:
    """Normalize vectors to unit length.

    The L2 norm is taken along the last axis, so every row of a 2-D array is
    scaled independently.

    Args:
        x: Array whose last axis holds the vector components.

    Returns:
        A new array of the same shape as ``x`` with each vector divided by its
        L2 norm. The input array is left untouched.
    """
    x = np.asarray(x)
    norms = np.linalg.norm(x, axis=-1, keepdims=True)
    # Clamp tiny norms so all-zero vectors stay zero instead of dividing by 0.
    # Out-of-place division avoids mutating the caller's array and also works
    # for integer inputs, where in-place true division would raise.
    return x / np.clip(norms, 1e-12, None)
def dependable_scann_import() -> Any:
    """Import ``scann`` if available, otherwise raise an error.

    Returns:
        Whatever ``guard_import("scann")`` returns.
    """
    scann_module = guard_import("scann")
    return scann_module
class ScaNN(VectorStore):
    """`ScaNN` vector store.

    To use, you should have the ``scann`` python package installed.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import HuggingFaceEmbeddings
            from langchain_community.vectorstores import ScaNN

            model_name = "sentence-transformers/all-mpnet-base-v2"
            db = ScaNN.from_texts(
                ['foo', 'bar', 'barz', 'qux'],
                HuggingFaceEmbeddings(model_name=model_name))
            db.similarity_search('foo?', k=1)
    """
def __init__(
    self,
    embedding: Embeddings,
    index: Any,
    docstore: Docstore,
    index_to_docstore_id: Dict[int, str],
    relevance_score_fn: Optional[Callable[[float], float]] = None,
    normalize_L2: bool = False,
    distance_strategy: DistanceStrategy = DistanceStrategy.EUCLIDEAN_DISTANCE,
    scann_config: Optional[str] = None,
):
    """Initialize with necessary components.

    Args:
        embedding: Embeddings object kept on the instance as ``embedding``.
        index: Searcher object kept on the instance as ``index``.
        docstore: Document store kept on the instance as ``docstore``.
        index_to_docstore_id: Mapping from index position to docstore id.
        relevance_score_fn: Optional callable stored as
            ``override_relevance_score_fn``.
        normalize_L2: Flag stored as ``_normalize_L2``.
        distance_strategy: Distance strategy stored on the instance.
        scann_config: Optional configuration string stored as
            ``_scann_config``.
    """
    # Search components.
    self.index = index
    self.embedding = embedding
    self.docstore = docstore
    self.index_to_docstore_id = index_to_docstore_id

    # Scoring configuration.
    self.override_relevance_score_fn = relevance_score_fn
    self.distance_strategy = distance_strategy

    # Private settings.
    self._scann_config = scann_config
    self._normalize_L2 = normalize_L2
def __add(
    self,
    texts: Iterable[str],
    embeddings: Iterable[List[float]],
    metadatas: Optional[List[dict]] = None,
    ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> List[str]:
    """Reject every attempt to add items; updates are not implemented.

    Raises:
        ValueError: If the docstore is not an ``AddableMixin``.
        NotImplementedError: Otherwise, on every call.
    """
    if isinstance(self.docstore, AddableMixin):
        # The docstore could take new items, but the index cannot be updated.
        raise NotImplementedError("Updates are not available in ScaNN, yet.")
    raise ValueError(
        "If trying to add texts, the underlying docstore should support "
        f"adding items, which {self.docstore} does not"
    )
def add_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[dict]] = None,
    ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> List[str]:
    """Run more texts through the embeddings and add to the vectorstore.

    Args:
        texts: Iterable of strings to add to the vectorstore.
        metadatas: Optional list of metadatas associated with the texts.
        ids: Optional list of unique IDs.

    Returns:
        List of ids from adding the texts into the vectorstore.
    """
    # Materialise once: ``texts`` may be a one-shot iterator, and it is needed
    # both for embedding and for the add step that follows.
    texts = list(texts)
    embeddings = self.embedding.embed_documents(texts)
    return self.__add(texts, embeddings, metadatas=metadatas, ids=ids, **kwargs)
def add_embeddings(
    self,
    text_embeddings: Iterable[Tuple[str, List[float]]],
    metadatas: Optional[List[dict]] = None,
    ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> List[str]:
    """Add pre-computed text/embedding pairs to the vectorstore.

    Args:
        text_embeddings: Iterable pairs of string and embedding to add to
            the vectorstore.
        metadatas: Optional list of metadatas associated with the texts.
        ids: Optional list of unique IDs.

    Returns:
        List of ids from adding the texts into the vectorstore.

    Raises:
        ValueError: If the docstore is not an ``AddableMixin``.
    """
    docstore_is_addable = isinstance(self.docstore, AddableMixin)
    if not docstore_is_addable:
        raise ValueError(
            "If trying to add texts, the underlying docstore should support "
            f"adding items, which {self.docstore} does not"
        )
    # Split the pairs into parallel sequences of texts and vectors.
    unzipped = zip(*text_embeddings)
    texts, embeddings = unzipped
    return self.__add(texts, embeddings, metadatas=metadatas, ids=ids, **kwargs)
def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
    """Delete by vector ID or other criteria.

    Args:
        ids: List of ids to delete.
        **kwargs: Other keyword arguments that subclasses might use.

    Returns:
        Optional[bool]: True if deletion is successful, False otherwise,
        None if not implemented.

    Raises:
        NotImplementedError: Always.
    """
    reason = "Deletions are not available in ScaNN, yet."
    raise NotImplementedError(reason)
def similarity_search_with_score_by_vector(
    self,
    embedding: List[float],
    k: int = 4,
    filter: Optional[Dict[str, Any]] = None,
    fetch_k: int = 20,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return docs most similar to query.

    Args:
        embedding: Embedding vector to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter (Optional[Dict[str, Any]]): Filter by metadata. A document
            matches when, for every key, its metadata value is one of the
            given values (a non-list value is treated as a one-element
            list). Defaults to None.
        fetch_k: (Optional[int]) Number of Documents to fetch before
            filtering. Only used when ``filter`` is given. Defaults to 20.
        **kwargs: kwargs to be passed to similarity search. Can include:
            score_threshold: Optional raw-score cut-off. Scores must be
                ``>=`` the threshold for MAX_INNER_PRODUCT / JACCARD and
                ``<=`` the threshold for every other distance strategy.

    Returns:
        List of (document, raw index score) pairs, at most ``k`` long, in
        the order the index returned them.

    Raises:
        ValueError: If the docstore has no Document for a returned id.
    """
    vector = np.array([embedding], dtype=np.float32)
    if self._normalize_L2:
        vector = normalize(vector)
    indices, scores = self.index.search_batched(
        vector, k if filter is None else fetch_k
    )

    # Normalise the filter once, up front, instead of rebuilding it (and
    # rebinding the ``filter`` parameter) for every hit inside the loop.
    allowed_values = None
    if filter is not None:
        allowed_values = {
            key: value if isinstance(value, list) else [value]
            for key, value in filter.items()
        }

    docs = []
    for j, i in enumerate(indices[0]):
        if i == -1:
            # This happens when not enough docs are returned.
            continue
        _id = self.index_to_docstore_id[i]
        doc = self.docstore.search(_id)
        if not isinstance(doc, Document):
            raise ValueError(f"Could not find document for id {_id}, got {doc}")
        if allowed_values is None or all(
            doc.metadata.get(key) in values
            for key, values in allowed_values.items()
        ):
            docs.append((doc, scores[0][j]))

    score_threshold = kwargs.get("score_threshold")
    if score_threshold is not None:
        # Inner product and Jaccard are similarities (keep higher scores);
        # everything else is treated as a distance (keep lower scores).
        cmp = (
            operator.ge
            if self.distance_strategy
            in (DistanceStrategy.MAX_INNER_PRODUCT, DistanceStrategy.JACCARD)
            else operator.le
        )
        docs = [
            (doc, similarity)
            for doc, similarity in docs
            if cmp(similarity, score_threshold)
        ]
    return docs[:k]
def similarity_search_with_score(
    self,
    query: str,
    k: int = 4,
    filter: Optional[Dict[str, Any]] = None,
    fetch_k: int = 20,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return docs most similar to query, together with their scores.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None.
        fetch_k: (Optional[int]) Number of Documents to fetch before
            filtering. Defaults to 20.

    Returns:
        Whatever ``similarity_search_with_score_by_vector`` returns for the
        embedded query: a list of (document, score) pairs.
    """
    query_vector = self.embedding.embed_query(query)
    return self.similarity_search_with_score_by_vector(
        query_vector, k, filter=filter, fetch_k=fetch_k, **kwargs
    )
def similarity_search_by_vector(
    self,
    embedding: List[float],
    k: int = 4,
    filter: Optional[Dict[str, Any]] = None,
    fetch_k: int = 20,
    **kwargs: Any,
) -> List[Document]:
    """Return docs most similar to embedding vector.

    Args:
        embedding: Embedding to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None.
        fetch_k: (Optional[int]) Number of Documents to fetch before
            filtering. Defaults to 20.

    Returns:
        List of Documents most similar to the embedding, scores dropped.
    """
    scored = self.similarity_search_with_score_by_vector(
        embedding, k, filter=filter, fetch_k=fetch_k, **kwargs
    )
    documents = []
    for document, _score in scored:
        documents.append(document)
    return documents
def similarity_search(
    self,
    query: str,
    k: int = 4,
    filter: Optional[Dict[str, Any]] = None,
    fetch_k: int = 20,
    **kwargs: Any,
) -> List[Document]:
    """Return docs most similar to query.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter: (Optional[Dict[str, str]]): Filter by metadata.
            Defaults to None.
        fetch_k: (Optional[int]) Number of Documents to fetch before
            filtering. Defaults to 20.

    Returns:
        List of Documents most similar to the query, scores dropped.
    """
    scored = self.similarity_search_with_score(
        query, k, filter=filter, fetch_k=fetch_k, **kwargs
    )
    documents = []
    for document, _score in scored:
        documents.append(document)
    return documents
@classmethod
def __from(
    cls,
    texts: List[str],
    embeddings: List[List[float]],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    ids: Optional[List[str]] = None,
    normalize_L2: bool = False,
    **kwargs: Any,
) -> ScaNN:
    """Build a store from texts and their pre-computed embeddings.

    Args:
        texts: Texts to store, one per embedding.
        embeddings: Embedding vectors, in the same order as ``texts``.
        embedding: Embeddings object handed to the constructor.
        metadatas: Optional per-text metadata; ``{}`` is used when omitted.
        ids: Optional docstore ids, one per text. Random UUID4 strings are
            generated when omitted.
        normalize_L2: Whether to L2-normalize the vectors before indexing.
        **kwargs: Forwarded to the constructor. ``distance_strategy`` and
            ``scann_config`` are also read here to choose how the index is
            built.

    Returns:
        A new instance backed by an in-memory docstore.

    Raises:
        ValueError: If the number of ids differs from the number of texts.
    """
    texts = list(texts)
    ids = [str(uuid.uuid4()) for _ in texts] if ids is None else list(ids)
    # Fail fast: check the ids before paying for the index build.
    if len(ids) != len(texts):
        raise ValueError(
            f"{len(ids)} ids provided for {len(texts)} documents."
            " Each document should have an id."
        )

    scann = guard_import("scann")
    distance_strategy = kwargs.get(
        "distance_strategy", DistanceStrategy.EUCLIDEAN_DISTANCE
    )
    scann_config = kwargs.get("scann_config", None)

    vector = np.array(embeddings, dtype=np.float32)
    if normalize_L2:
        vector = normalize(vector)

    if scann_config is not None:
        index = scann.scann_ops_pybind.create_searcher(vector, scann_config)
    else:
        # Default to L2, currently other metric types not initialized.
        metric = (
            "dot_product"
            if distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT
            else "squared_l2"
        )
        index = (
            scann.scann_ops_pybind.builder(vector, 1, metric)
            .score_brute_force()
            .build()
        )

    documents = [
        Document(page_content=text, metadata=metadatas[i] if metadatas else {})
        for i, text in enumerate(texts)
    ]
    index_to_id = dict(enumerate(ids))
    docstore = InMemoryDocstore(dict(zip(ids, documents)))
    return cls(
        embedding,
        index,
        docstore,
        index_to_id,
        normalize_L2=normalize_L2,
        **kwargs,
    )
@classmethod
def from_texts(
    cls,
    texts: List[str],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> ScaNN:
    """Construct ScaNN wrapper from raw documents.

    This is a user friendly interface that:
        1. Embeds documents.
        2. Creates an in memory docstore
        3. Initializes the ScaNN database

    This is intended to be a quick way to get started.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import ScaNN
            from langchain_community.embeddings import OpenAIEmbeddings
            embeddings = OpenAIEmbeddings()
            scann = ScaNN.from_texts(texts, embeddings)
    """
    vectors = embedding.embed_documents(texts)
    store = cls.__from(
        texts, vectors, embedding, metadatas=metadatas, ids=ids, **kwargs
    )
    return store
@classmethod
def from_embeddings(
    cls,
    text_embeddings: List[Tuple[str, List[float]]],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> ScaNN:
    """Construct ScaNN wrapper from pre-computed text/embedding pairs.

    This is a user friendly interface that:
        1. Takes texts together with their already-computed embeddings.
        2. Creates an in memory docstore
        3. Initializes the ScaNN database

    This is intended to be a quick way to get started.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import ScaNN
            from langchain_community.embeddings import OpenAIEmbeddings
            embeddings = OpenAIEmbeddings()
            text_embeddings = embeddings.embed_documents(texts)
            text_embedding_pairs = list(zip(texts, text_embeddings))
            scann = ScaNN.from_embeddings(text_embedding_pairs, embeddings)
    """
    # Each pair is (text, vector); split them into two parallel lists.
    only_texts = [pair[0] for pair in text_embeddings]
    only_vectors = [pair[1] for pair in text_embeddings]
    return cls.__from(
        only_texts,
        only_vectors,
        embedding,
        metadatas=metadatas,
        ids=ids,
        **kwargs,
    )
def save_local(self, folder_path: str, index_name: str = "index") -> None:
    """Save ScaNN index, docstore, and index_to_docstore_id to disk.

    The index is serialized into the directory
    ``<folder_path>/<index_name>.scann``; the docstore and id mapping are
    pickled together into ``<folder_path>/<index_name>.pkl``.

    Args:
        folder_path: folder path to save index, docstore,
            and index_to_docstore_id to.
        index_name: base name used for the index directory and pickle file.
    """
    root = Path(folder_path)
    index_dir = root / f"{index_name}.scann"
    pickle_file = root / f"{index_name}.pkl"

    # Creating the index directory with parents also creates ``root``.
    index_dir.mkdir(exist_ok=True, parents=True)

    # save index separately since it is not picklable
    self.index.serialize(str(index_dir))

    # save docstore and index_to_docstore_id
    payload = (self.docstore, self.index_to_docstore_id)
    with open(pickle_file, "wb") as f:
        pickle.dump(payload, f)
@classmethod
def load_local(
    cls,
    folder_path: str,
    embedding: Embeddings,
    index_name: str = "index",
    *,
    allow_dangerous_deserialization: bool = False,
    **kwargs: Any,
) -> ScaNN:
    """Load ScaNN index, docstore, and index_to_docstore_id from disk.

    Args:
        folder_path: folder path to load index, docstore,
            and index_to_docstore_id from.
        embedding: Embeddings to use when generating queries
        index_name: for saving with a specific index file name
        allow_dangerous_deserialization: whether to allow deserialization
            of the data which involves loading a pickle file.
            Pickle files can be modified by malicious actors to deliver a
            malicious payload that results in execution of
            arbitrary code on your machine.
        **kwargs: Forwarded to the constructor.

    Returns:
        A new instance built from the loaded index, docstore and id mapping.

    Raises:
        ValueError: If ``allow_dangerous_deserialization`` is not True.
        FileNotFoundError: If ``<folder_path>/<index_name>.scann`` is not an
            existing directory.
    """
    if not allow_dangerous_deserialization:
        raise ValueError(
            "The de-serialization relies on loading a pickle file. "
            "Pickle files can be modified to deliver a malicious payload that "
            "results in execution of arbitrary code on your machine. "
            "You will need to set `allow_dangerous_deserialization` to `True` to "
            "enable deserialization. If you do this, make sure that you "
            "trust the source of the data. For example, if you are loading a "
            "file that you created, and know that no one else has modified the "
            "file, then this is safe to do. Do not set this to `True` if you are "
            "loading a file from an untrusted source (e.g., some random site on "
            "the internet.)."
        )
    path = Path(folder_path)
    scann_path = path / f"{index_name}.scann"
    # Loading must not create anything on disk: a missing index directory is
    # reported directly instead of being silently created empty.
    if not scann_path.is_dir():
        raise FileNotFoundError(f"ScaNN index directory not found: {scann_path}")

    # load index separately since it is not picklable
    scann = guard_import("scann")
    index = scann.scann_ops_pybind.load_searcher(str(scann_path))

    # load docstore and index_to_docstore_id
    # SECURITY: unpickling executes arbitrary code from the file; it is only
    # reached after the explicit opt-in check above.
    with open(path / f"{index_name}.pkl", "rb") as f:
        (
            docstore,
            index_to_docstore_id,
        ) = pickle.load(  # ignore[pickle]: explicit-opt-in
            f
        )
    return cls(embedding, index, docstore, index_to_docstore_id, **kwargs)
def _select_relevance_score_fn(self) -> Callable[[float], float]:
    """Pick the function that maps a raw score to a relevance score.

    The 'correct' relevance function may differ depending on a few things,
    including:
    - the distance / similarity metric used by the VectorStore
    - the scale of your embeddings (OpenAI's are unit normed. Many others
      are not!)
    - embedding dimensionality
    - etc.

    Returns:
        The override given to the constructor when there is one, otherwise
        the function matching ``self.distance_strategy``.

    Raises:
        ValueError: If the strategy is neither MAX_INNER_PRODUCT nor
            EUCLIDEAN_DISTANCE and no override was given.
    """
    override = self.override_relevance_score_fn
    if override is not None:
        return override

    # Default strategy is to rely on distance strategy provided in
    # vectorstore constructor
    strategy = self.distance_strategy
    if strategy == DistanceStrategy.MAX_INNER_PRODUCT:
        return self._max_inner_product_relevance_score_fn
    if strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
        # Default behavior is to use euclidean distance relevancy
        return self._euclidean_relevance_score_fn
    raise ValueError(
        "Unknown distance strategy, must be cosine, max_inner_product,"
        " or euclidean"
    )


def _similarity_search_with_relevance_scores(
    self,
    query: str,
    k: int = 4,
    filter: Optional[Dict[str, Any]] = None,
    fetch_k: int = 20,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return docs and their similarity scores on a scale from 0 to 1.

    A ``score_threshold`` keyword, when given, is applied to the converted
    relevance scores (kept when ``>=`` the threshold), not to raw scores.
    """
    # Pop score threshold so that only relevancy scores, not raw scores, are
    # filtered.
    threshold = kwargs.pop("score_threshold", None)

    to_relevance = self._select_relevance_score_fn()
    if to_relevance is None:
        raise ValueError(
            "normalize_score_fn must be provided to"
            " ScaNN constructor to normalize scores"
        )

    raw_results = self.similarity_search_with_score(
        query, k=k, filter=filter, fetch_k=fetch_k, **kwargs
    )
    scored = [(doc, to_relevance(score)) for doc, score in raw_results]
    if threshold is None:
        return scored
    return [(doc, relevance) for doc, relevance in scored if relevance >= threshold]