"""Module contains code for a cache backed embedder.The cache backed embedder is a wrapper around an embedder that cachesembeddings in a key-value store. The cache is used to avoid recomputingembeddings for the same text.The text is hashed and the hash is used as the key in the cache."""from__future__importannotationsimporthashlibimportjsonimportuuidfromfunctoolsimportpartialfromtypingimportCallable,List,Optional,Sequence,Union,castfromlangchain_core.embeddingsimportEmbeddingsfromlangchain_core.storesimportBaseStore,ByteStorefromlangchain_core.utils.iterimportbatch_iteratefromlangchain.storage.encoder_backedimportEncoderBackedStoreNAMESPACE_UUID=uuid.UUID(int=1985)def_hash_string_to_uuid(input_string:str)->uuid.UUID:"""Hash a string and returns the corresponding UUID."""hash_value=hashlib.sha1(input_string.encode("utf-8")).hexdigest()returnuuid.uuid5(NAMESPACE_UUID,hash_value)def_key_encoder(key:str,namespace:str)->str:"""Encode a key."""returnnamespace+str(_hash_string_to_uuid(key))def_create_key_encoder(namespace:str)->Callable[[str],str]:"""Create an encoder for a key."""returnpartial(_key_encoder,namespace=namespace)def_value_serializer(value:Sequence[float])->bytes:"""Serialize a value."""returnjson.dumps(value).encode()def_value_deserializer(serialized_value:bytes)->List[float]:"""Deserialize a value."""returncast(List[float],json.loads(serialized_value.decode()))
class CacheBackedEmbeddings(Embeddings):
    """Interface for caching results from embedding models.

    The interface works with any store that implements the abstract store
    interface, accepting keys of type str and values of type list of floats.

    If need be, the interface can be extended to accept other implementations
    of the value serializer and deserializer, as well as the key encoder.

    Note that by default only document embeddings are cached. To cache query
    embeddings too, pass a `query_embedding_store` to the constructor.

    Examples:

        .. code-block:: python

            from langchain.embeddings import CacheBackedEmbeddings
            from langchain.storage import LocalFileStore
            from langchain_community.embeddings import OpenAIEmbeddings

            store = LocalFileStore('./my_cache')

            underlying_embedder = OpenAIEmbeddings()
            embedder = CacheBackedEmbeddings.from_bytes_store(
                underlying_embedder, store, namespace=underlying_embedder.model
            )

            # Embedding is computed and cached
            embeddings = embedder.embed_documents(["hello", "goodbye"])

            # Embeddings are retrieved from the cache, no computation is done
            embeddings = embedder.embed_documents(["hello", "goodbye"])
    """

    def __init__(
        self,
        underlying_embeddings: Embeddings,
        document_embedding_store: BaseStore[str, List[float]],
        *,
        batch_size: Optional[int] = None,
        query_embedding_store: Optional[BaseStore[str, List[float]]] = None,
    ) -> None:
        """Initialize the embedder.

        Args:
            underlying_embeddings: The embedder to use for computing embeddings.
            document_embedding_store: The store to use for caching document embeddings.
            batch_size: The number of documents to embed between store updates.
            query_embedding_store: The store to use for caching query embeddings.
                If None, query embeddings are not cached.
        """
        super().__init__()
        self.document_embedding_store = document_embedding_store
        self.query_embedding_store = query_embedding_store
        self.underlying_embeddings = underlying_embeddings
        self.batch_size = batch_size

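    # A minimal construction sketch (assumes langchain_core's InMemoryStore;
    # `my_embedder` is a hypothetical Embeddings instance):
    #
    #     from langchain_core.stores import InMemoryStore
    #
    #     embedder = CacheBackedEmbeddings(
    #         underlying_embeddings=my_embedder,
    #         document_embedding_store=InMemoryStore(),
    #     )
    #
    # Most callers should prefer `from_bytes_store`, which wires up key encoding
    # and value (de)serialization for byte-oriented stores automatically.
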
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of texts.

        The method first checks the cache for the embeddings.
        If the embeddings are not found, the method uses the underlying embedder
        to embed the documents and stores the results in the cache.

        Args:
            texts: A list of texts to embed.

        Returns:
            A list of embeddings for the given texts.
        """
        vectors: List[Union[List[float], None]] = self.document_embedding_store.mget(
            texts
        )
        all_missing_indices: List[int] = [
            i for i, vector in enumerate(vectors) if vector is None
        ]

        for missing_indices in batch_iterate(self.batch_size, all_missing_indices):
            missing_texts = [texts[i] for i in missing_indices]
            missing_vectors = self.underlying_embeddings.embed_documents(missing_texts)
            self.document_embedding_store.mset(
                list(zip(missing_texts, missing_vectors))
            )
            for index, updated_vector in zip(missing_indices, missing_vectors):
                vectors[index] = updated_vector

        return cast(
            List[List[float]], vectors
        )  # Nones should have been resolved by now

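    # Note on batching (illustrative): with batch_size=2 and three cache misses,
    # the underlying embedder is called twice (once for the first two missing
    # texts, once for the last), and the store is updated after each call, so
    # embeddings computed in earlier batches survive a failure in a later one.
    # A batch_size of None embeds all missing texts in a single call.
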
    async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of texts.

        The method first checks the cache for the embeddings.
        If the embeddings are not found, the method uses the underlying embedder
        to embed the documents and stores the results in the cache.

        Args:
            texts: A list of texts to embed.

        Returns:
            A list of embeddings for the given texts.
        """
        vectors: List[Union[List[float], None]] = (
            await self.document_embedding_store.amget(texts)
        )
        all_missing_indices: List[int] = [
            i for i, vector in enumerate(vectors) if vector is None
        ]

        # batch_iterate supports None batch_size which returns all elements at once
        # as a single batch.
        for missing_indices in batch_iterate(self.batch_size, all_missing_indices):
            missing_texts = [texts[i] for i in missing_indices]
            missing_vectors = await self.underlying_embeddings.aembed_documents(
                missing_texts
            )
            await self.document_embedding_store.amset(
                list(zip(missing_texts, missing_vectors))
            )
            for index, updated_vector in zip(missing_indices, missing_vectors):
                vectors[index] = updated_vector

        return cast(
            List[List[float]], vectors
        )  # Nones should have been resolved by now

    def embed_query(self, text: str) -> List[float]:
        """Embed query text.

        By default, this method does not cache queries. To enable caching, pass a
        `query_embedding_store` to the constructor, or set `query_embedding_cache`
        when using `from_bytes_store`.

        Args:
            text: The text to embed.

        Returns:
            The embedding for the given text.
        """
        if not self.query_embedding_store:
            return self.underlying_embeddings.embed_query(text)

        (cached,) = self.query_embedding_store.mget([text])
        if cached is not None:
            return cached

        vector = self.underlying_embeddings.embed_query(text)
        self.query_embedding_store.mset([(text, vector)])
        return vector

    async def aembed_query(self, text: str) -> List[float]:
        """Embed query text.

        By default, this method does not cache queries. To enable caching, pass a
        `query_embedding_store` to the constructor, or set `query_embedding_cache`
        when using `from_bytes_store`.

        Args:
            text: The text to embed.

        Returns:
            The embedding for the given text.
        """
        if not self.query_embedding_store:
            return await self.underlying_embeddings.aembed_query(text)

        (cached,) = await self.query_embedding_store.amget([text])
        if cached is not None:
            return cached

        vector = await self.underlying_embeddings.aembed_query(text)
        await self.query_embedding_store.amset([(text, vector)])
        return vector

    @classmethod
    def from_bytes_store(
        cls,
        underlying_embeddings: Embeddings,
        document_embedding_cache: ByteStore,
        *,
        namespace: str = "",
        batch_size: Optional[int] = None,
        query_embedding_cache: Union[bool, ByteStore] = False,
    ) -> CacheBackedEmbeddings:
        """On-ramp that adds the necessary serialization and encoding to the store.

        Args:
            underlying_embeddings: The embedder to use for embedding.
            document_embedding_cache: The cache to use for storing document embeddings.
            namespace: The namespace to use for the document cache.
                This namespace is used to avoid collisions with other caches.
                For example, set it to the name of the embedding model used.
            batch_size: The number of documents to embed between store updates.
            query_embedding_cache: The cache to use for storing query embeddings.
                True to use the same cache as document embeddings.
                False to not cache query embeddings.
        """
        key_encoder = _create_key_encoder(namespace)
        document_embedding_store = EncoderBackedStore[str, List[float]](
            document_embedding_cache,
            key_encoder,
            _value_serializer,
            _value_deserializer,
        )
        if query_embedding_cache is True:
            query_embedding_store = document_embedding_store
        elif query_embedding_cache is False:
            query_embedding_store = None
        else:
            query_embedding_store = EncoderBackedStore[str, List[float]](
                query_embedding_cache,
                key_encoder,
                _value_serializer,
                _value_deserializer,
            )

        return cls(
            underlying_embeddings,
            document_embedding_store,
            batch_size=batch_size,
            query_embedding_store=query_embedding_store,
        )
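
# A usage sketch for query caching (illustrative; it mirrors the class docstring
# example and assumes the same OpenAIEmbeddings / LocalFileStore setup, which
# should be adapted to your own embedder and store):
#
#     from langchain.storage import LocalFileStore
#     from langchain_community.embeddings import OpenAIEmbeddings
#
#     store = LocalFileStore("./my_cache")
#     underlying_embedder = OpenAIEmbeddings()
#     embedder = CacheBackedEmbeddings.from_bytes_store(
#         underlying_embedder,
#         store,
#         namespace=underlying_embedder.model,
#         query_embedding_cache=True,  # cache embed_query results alongside documents
#     )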