class InMemoryVectorStore(VectorStore):
    """In-memory vector store implementation.

    Uses a dictionary, and computes cosine similarity for search using numpy.

    Setup:
        Install ``langchain-core``.

        .. code-block:: bash

            pip install -U langchain-core

    Key init args — indexing params:
        embedding_function: Embeddings
            Embedding function to use.

    Instantiate:
        .. code-block:: python

            from langchain_core.vectorstores import InMemoryVectorStore
            from langchain_openai import OpenAIEmbeddings

            vector_store = InMemoryVectorStore(OpenAIEmbeddings())

    Add Documents:
        .. code-block:: python

            from langchain_core.documents import Document

            document_1 = Document(id="1", page_content="foo", metadata={"baz": "bar"})
            document_2 = Document(id="2", page_content="thud", metadata={"bar": "baz"})
            document_3 = Document(id="3", page_content="i will be deleted :(")

            documents = [document_1, document_2, document_3]
            vector_store.add_documents(documents=documents)

    Inspect documents:
        .. code-block:: python

            top_n = 10
            for index, (id, doc) in enumerate(vector_store.store.items()):
                if index < top_n:
                    # docs have keys 'id', 'vector', 'text', 'metadata'
                    print(f"{id}: {doc['text']}")
                else:
                    break

    Delete Documents:
        .. code-block:: python

            vector_store.delete(ids=["3"])

    Search:
        .. code-block:: python

            results = vector_store.similarity_search(query="thud", k=1)
            for doc in results:
                print(f"* {doc.page_content} [{doc.metadata}]")

        .. code-block:: none

            * thud [{'bar': 'baz'}]

    Search with filter:
        .. code-block:: python

            def _filter_function(doc: Document) -> bool:
                return doc.metadata.get("bar") == "baz"


            results = vector_store.similarity_search(
                query="thud", k=1, filter=_filter_function
            )
            for doc in results:
                print(f"* {doc.page_content} [{doc.metadata}]")

        .. code-block:: none

            * thud [{'bar': 'baz'}]

    Search with score:
        .. code-block:: python

            results = vector_store.similarity_search_with_score(query="qux", k=1)
            for doc, score in results:
                print(f"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]")

        .. code-block:: none

            * [SIM=0.832268] foo [{'baz': 'bar'}]

    Async:
        .. code-block:: python

            # add documents
            # await vector_store.aadd_documents(documents=documents)

            # delete documents
            # await vector_store.adelete(ids=["3"])

            # search
            # results = await vector_store.asimilarity_search(query="thud", k=1)

            # search with score
            results = await vector_store.asimilarity_search_with_score(query="qux", k=1)
            for doc, score in results:
                print(f"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]")

        .. code-block:: none

            * [SIM=0.832268] foo [{'baz': 'bar'}]

    Use as Retriever:
        .. code-block:: python

            retriever = vector_store.as_retriever(
                search_type="mmr",
                search_kwargs={"k": 1, "fetch_k": 2, "lambda_mult": 0.5},
            )
            retriever.invoke("thud")

        .. code-block:: none

            [Document(id='2', metadata={'bar': 'baz'}, page_content='thud')]

    """
[docs]def__init__(self,embedding:Embeddings)->None:"""Initialize with the given embedding function. Args: embedding: embedding function to use. """# TODO: would be nice to change to# dict[str, Document] at some point (will be a breaking change)self.store:dict[str,dict[str,Any]]={}self.embedding=embedding
[docs]@overridedefadd_documents(self,documents:list[Document],ids:Optional[list[str]]=None,**kwargs:Any,)->list[str]:"""Add documents to the store."""texts=[doc.page_contentfordocindocuments]vectors=self.embedding.embed_documents(texts)ifidsandlen(ids)!=len(texts):msg=(f"ids must be the same length as texts. "f"Got {len(ids)} ids and {len(texts)} texts.")raiseValueError(msg)id_iterator:Iterator[Optional[str]]=(iter(ids)ifidselseiter(doc.idfordocindocuments))ids_=[]fordoc,vectorinzip(documents,vectors):doc_id=next(id_iterator)doc_id_=doc_idorstr(uuid.uuid4())ids_.append(doc_id_)self.store[doc_id_]={"id":doc_id_,"vector":vector,"text":doc.page_content,"metadata":doc.metadata,}returnids_
[docs]@overrideasyncdefaadd_documents(self,documents:list[Document],ids:Optional[list[str]]=None,**kwargs:Any)->list[str]:"""Add documents to the store."""texts=[doc.page_contentfordocindocuments]vectors=awaitself.embedding.aembed_documents(texts)ifidsandlen(ids)!=len(texts):msg=(f"ids must be the same length as texts. "f"Got {len(ids)} ids and {len(texts)} texts.")raiseValueError(msg)id_iterator:Iterator[Optional[str]]=(iter(ids)ifidselseiter(doc.idfordocindocuments))ids_:list[str]=[]fordoc,vectorinzip(documents,vectors):doc_id=next(id_iterator)doc_id_=doc_idorstr(uuid.uuid4())ids_.append(doc_id_)self.store[doc_id_]={"id":doc_id_,"vector":vector,"text":doc.page_content,"metadata":doc.metadata,}returnids_
[docs]@overridedefget_by_ids(self,ids:Sequence[str],/)->list[Document]:"""Get documents by their ids. Args: ids: The ids of the documents to get. Returns: A list of Document objects. """documents=[]fordoc_idinids:doc=self.store.get(doc_id)ifdoc:documents.append(Document(id=doc["id"],page_content=doc["text"],metadata=doc["metadata"],))returndocuments
[docs]@deprecated(alternative="VectorStore.add_documents",message=("This was a beta API that was added in 0.2.11. It'll be removed in 0.3.0."),since="0.2.29",removal="1.0",)defupsert(self,items:Sequence[Document],/,**_kwargs:Any)->UpsertResponse:"""[DEPRECATED] Upsert documents into the store. Args: items: The documents to upsert. Returns: The upsert response. """vectors=self.embedding.embed_documents([item.page_contentforiteminitems])ids=[]foritem,vectorinzip(items,vectors):doc_id=item.idorstr(uuid.uuid4())ids.append(doc_id)self.store[doc_id]={"id":doc_id,"vector":vector,"text":item.page_content,"metadata":item.metadata,}return{"succeeded":ids,"failed":[],}
[docs]@deprecated(alternative="VectorStore.aadd_documents",message=("This was a beta API that was added in 0.2.11. It'll be removed in 0.3.0."),since="0.2.29",removal="1.0",)asyncdefaupsert(self,items:Sequence[Document],/,**_kwargs:Any)->UpsertResponse:"""[DEPRECATED] Upsert documents into the store. Args: items: The documents to upsert. Returns: The upsert response. """vectors=awaitself.embedding.aembed_documents([item.page_contentforiteminitems])ids=[]foritem,vectorinzip(items,vectors):doc_id=item.idorstr(uuid.uuid4())ids.append(doc_id)self.store[doc_id]={"id":doc_id,"vector":vector,"text":item.page_content,"metadata":item.metadata,}return{"succeeded":ids,"failed":[],}
    @override
    async def aget_by_ids(self, ids: Sequence[str], /) -> list[Document]:
        """Async get documents by their ids.

        Args:
            ids: The ids of the documents to get.

        Returns:
            A list of Document objects.
        """
        # The store is a plain in-process dict, so the sync lookup does not
        # block; delegating directly avoids an unnecessary executor hop.
        return self.get_by_ids(ids)
    def _similarity_search_with_score_by_vector(
        self,
        embedding: list[float],
        k: int = 4,
        filter: Optional[Callable[[Document], bool]] = None,  # noqa: A002
    ) -> list[tuple[Document, float, list[float]]]:
        # Core similarity routine shared by the public search methods.
        # Returns up to ``k`` (document, cosine-similarity score, stored
        # vector) triples, best match first; the raw vector is included so
        # callers such as MMR search can reuse it without a second lookup.
        # get all docs with fixed order in list
        docs = list(self.store.values())

        if filter is not None:
            # NOTE(review): the Document handed to the filter carries no id —
            # filters must rely on page_content/metadata only.
            docs = [
                doc
                for doc in docs
                if filter(Document(page_content=doc["text"], metadata=doc["metadata"]))
            ]

        if not docs:
            return []

        similarity = cosine_similarity([embedding], [doc["vector"] for doc in docs])[0]

        # get the indices ordered by similarity score
        top_k_idx = similarity.argsort()[::-1][:k]

        return [
            (
                Document(
                    id=doc_dict["id"],
                    page_content=doc_dict["text"],
                    metadata=doc_dict["metadata"],
                ),
                float(similarity[idx].item()),
                doc_dict["vector"],
            )
            for idx in top_k_idx
            # Assign using walrus operator to avoid multiple lookups
            if (doc_dict := docs[idx])
        ]
[docs]defsimilarity_search_with_score_by_vector(self,embedding:list[float],k:int=4,filter:Optional[Callable[[Document],bool]]=None,# noqa: A002**_kwargs:Any,)->list[tuple[Document,float]]:"""Search for the most similar documents to the given embedding. Args: embedding: The embedding to search for. k: The number of documents to return. filter: A function to filter the documents. Returns: A list of tuples of Document objects and their similarity scores. """return[(doc,similarity)fordoc,similarity,_inself._similarity_search_with_score_by_vector(embedding=embedding,k=k,filter=filter)]
[docs]@overridedefmax_marginal_relevance_search_by_vector(self,embedding:list[float],k:int=4,fetch_k:int=20,lambda_mult:float=0.5,*,filter:Optional[Callable[[Document],bool]]=None,**kwargs:Any,)->list[Document]:prefetch_hits=self._similarity_search_with_score_by_vector(embedding=embedding,k=fetch_k,filter=filter,)try:importnumpyasnpexceptImportErrorase:msg=("numpy must be installed to use max_marginal_relevance_search ""pip install numpy")raiseImportError(msg)fromemmr_chosen_indices=maximal_marginal_relevance(np.array(embedding,dtype=np.float32),[vectorfor_,_,vectorinprefetch_hits],k=k,lambda_mult=lambda_mult,)return[prefetch_hits[idx][0]foridxinmmr_chosen_indices]
[docs]@classmethoddefload(cls,path:str,embedding:Embeddings,**kwargs:Any)->InMemoryVectorStore:"""Load a vector store from a file. Args: path: The path to load the vector store from. embedding: The embedding to use. kwargs: Additional arguments to pass to the constructor. Returns: A VectorStore object. """_path:Path=Path(path)with_path.open("r")asf:store=load(json.load(f))vectorstore=cls(embedding=embedding,**kwargs)vectorstore.store=storereturnvectorstore
[docs]defdump(self,path:str)->None:"""Dump the vector store to a file. Args: path: The path to dump the vector store to. """_path:Path=Path(path)_path.parent.mkdir(exist_ok=True,parents=True)with_path.open("w")asf:json.dump(dumpd(self.store),f,indent=2)