Source code for langchain_google_genai.google_vector_store
"""Google Generative AI Vector Store.The GenAI Semantic Retriever API is a managed end-to-end service that allowsdevelopers to create a corpus of documents to perform semantic search onrelated passages given a user query. For more information visit:https://developers.generativeai.google/guide"""importasynciofromfunctoolsimportpartialfromtypingimport(Any,Callable,Dict,Iterable,List,Optional,Tuple,)importgoogle.ai.generativelanguageasgenaifromlangchain_core.documentsimportDocumentfromlangchain_core.embeddingsimportEmbeddingsfromlangchain_core.runnablesimportRunnable,RunnableLambda,RunnablePassthroughfromlangchain_core.vectorstoresimportVectorStorefrompydanticimportBaseModel,PrivateAttrfrom.import_genai_extensionasgenaixfrom.genai_aqaimport(AqaInput,AqaOutput,GenAIAqa,)
class ServerSideEmbedding(Embeddings):
    """Do-nothing embedding model for when the embedding is done by the server."""

    # NOTE(review): no overrides of the `Embeddings` interface are visible in
    # this view of the class — confirm whether any are defined before relying
    # on instances of it.
class DoesNotExistsException(Exception):
    """Signal that a corpus, or a document under a corpus, does not exist.

    The message names the missing document when ``document_id`` is given and
    the missing corpus otherwise.

    Attributes:
        corpus_id: ID of the corpus that was looked up.
        document_id: ID of the document that was looked up, or ``None`` when
            the error is about the corpus itself.
    """

    def __init__(self, *, corpus_id: str, document_id: Optional[str] = None) -> None:
        """Build the error message from the given IDs.

        Args:
            corpus_id: ID of the corpus that was looked up.
            document_id: ID of the missing document, if the lookup was for a
                document rather than a corpus.
        """
        if document_id is None:
            message = f"No such corpus {corpus_id}"
        else:
            message = f"No such document {document_id} under corpus {corpus_id}"
        super().__init__(message)
        # Keep the IDs on the instance so handlers can branch on them without
        # having to parse the human-readable message.
        self.corpus_id = corpus_id
        self.document_id = document_id
class _SemanticRetriever(BaseModel):
    """Wrapper around Google's semantic retriever service.

    An instance is bound to one entity name (`name`), which identifies either
    a corpus or a document, and holds the service client used for all calls.
    """

    name: genaix.EntityName
    _client: genai.RetrieverServiceClient = PrivateAttr()

    def __init__(self, *, client: genai.RetrieverServiceClient, **kwargs: Any) -> None:
        """Initialise the model fields from `kwargs` and store the client."""
        super().__init__(**kwargs)
        self._client = client

    @classmethod
    def from_ids(
        cls, corpus_id: str, document_id: Optional[str]
    ) -> "_SemanticRetriever":
        """Build a retriever for an entity that already exists.

        Args:
            corpus_id: ID of the corpus.
            document_id: ID of the document, or `None` to target the corpus.

        Returns:
            A retriever bound to the named corpus or document.

        Raises:
            DoesNotExistsException: If the lookup of the corpus or document
                returns `None`.
        """
        entity = genaix.EntityName(corpus_id=corpus_id, document_id=document_id)
        service = genaix.build_semantic_retriever()

        # Make sure the entity can actually be fetched before wrapping it.
        if entity.is_corpus():
            corpus = genaix.get_corpus(corpus_id=corpus_id, client=service)
            if corpus is None:
                raise DoesNotExistsException(corpus_id=corpus_id)
        elif entity.is_document():
            assert document_id is not None
            document = genaix.get_document(
                corpus_id=corpus_id, document_id=document_id, client=service
            )
            if document is None:
                raise DoesNotExistsException(
                    corpus_id=corpus_id, document_id=document_id
                )

        return cls(name=entity, client=service)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[Dict[str, Any]]] = None,
        document_id: Optional[str] = None,
    ) -> List[str]:
        """Create one chunk per text under a document.

        Args:
            texts: Texts to store.
            metadatas: One metadata dict per text; defaults to empty dicts.
            document_id: Target document. Needed when this retriever is bound
                to a corpus; if the retriever is bound to a document it must
                be omitted or equal to that document's ID.

        Returns:
            The names of the created chunks (chunks with a falsy name are
            left out).

        Raises:
            NotImplementedError: If no document ID is available, or if
                `document_id` differs from the retriever's own document ID.
            ValueError: If `texts` and `metadatas` differ in length.
        """
        bound_document_id = self.name.document_id

        if bound_document_id is None and document_id is None:
            raise NotImplementedError(
                "Adding texts to a corpus directly is not supported. "
                "Please provide a document ID under the corpus first. "
                "Then add the texts to the document."
            )

        both_given = bound_document_id is not None and document_id is not None
        if both_given and bound_document_id != document_id:
            raise NotImplementedError(
                f"Parameter `document_id` {document_id} does not match the "
                f"vector store's `document_id` {bound_document_id}"
            )

        assert bound_document_id or document_id is not None
        target_document_id = bound_document_id or document_id or ""

        text_list = list(texts)
        if metadatas is None:
            metadatas = [{} for _ in text_list]
        if len(text_list) != len(metadatas):
            raise ValueError(
                f"metadatas's length {len(metadatas)} and "
                f"texts's length {len(text_list)} are mismatched"
            )

        created = genaix.batch_create_chunk(
            corpus_id=self.name.corpus_id,
            document_id=target_document_id,
            texts=text_list,
            metadatas=metadatas,
            client=self._client,
        )

        chunk_names: List[str] = []
        for created_chunk in created:
            if created_chunk.name:
                chunk_names.append(created_chunk.name)
        return chunk_names

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        filter: Optional[Dict[str, Any]] = None,
    ) -> List[Tuple[str, float]]:
        """Query the bound corpus or document.

        Args:
            query: Query text.
            k: Forwarded as `k` to the query call.
            filter: Forwarded as `filter` to the query call.

        Returns:
            `(text, relevance score)` pairs, one per returned chunk.
        """
        if not self.name.is_corpus():
            # Anything that is not a corpus must be a document here.
            assert self.name.is_document()
            assert self.name.document_id is not None
            hits = genaix.query_document(
                corpus_id=self.name.corpus_id,
                document_id=self.name.document_id,
                query=query,
                k=k,
                filter=filter,
                client=self._client,
            )
        else:
            hits = genaix.query_corpus(
                corpus_id=self.name.corpus_id,
                query=query,
                k=k,
                filter=filter,
                client=self._client,
            )

        return [(hit.chunk.data.string_value, hit.chunk_relevance_score) for hit in hits]

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
        """Delete the entities named by `ids`.

        Each ID is parsed as an entity name. Depending on which parts the
        name carries, the chunk, the document, or the corpus is deleted.
        `kwargs` is accepted but not used.

        Returns:
            Always `True`; a `None` or empty `ids` deletes nothing.
        """
        for entity_name in ids or []:
            parsed = genaix.EntityName.from_str(entity_name)
            _delete_chunk(
                corpus_id=parsed.corpus_id,
                document_id=parsed.document_id,
                chunk_id=parsed.chunk_id,
                client=self._client,
            )
        return True


def _delete_chunk(
    *,
    corpus_id: str,
    document_id: Optional[str],
    chunk_id: Optional[str],
    client: genai.RetrieverServiceClient,
) -> None:
    """Delete the most specific entity identified by the given IDs.

    With a chunk ID the chunk is deleted; otherwise with a document ID the
    document is deleted; otherwise the corpus is deleted.

    Raises:
        ValueError: If a chunk ID is given without a document ID.
    """
    if chunk_id is None:
        if document_id is None:
            genaix.delete_corpus(corpus_id=corpus_id, client=client)
        else:
            genaix.delete_document(
                corpus_id=corpus_id, document_id=document_id, client=client
            )
        return

    if document_id is None:
        raise ValueError(f"Chunk {chunk_id} requires a document ID")
    genaix.delete_chunk(
        corpus_id=corpus_id,
        document_id=document_id,
        chunk_id=chunk_id,
        client=client,
    )
class GoogleVectorStore(VectorStore):
    """Google GenerativeAI Vector Store.

    Currently, it computes the embedding vectors on the server side.

    Example: Add texts to an existing corpus.

        store = GoogleVectorStore(corpus_id="123")
        store.add_documents(documents, document_id="456")

    Example: Create a new corpus.

        store = GoogleVectorStore.create_corpus(
            corpus_id="123", display_name="My Google corpus")

    Example: Query the corpus for relevant passages.

        retriever = store.as_retriever()
        retriever.get_relevant_documents("Who caught the gingerbread man?")

    Example: Ask the corpus for grounded responses!

        aqa = store.as_aqa()
        response = aqa.invoke("Who caught the gingerbread man?")
        print(response.answer)
        print(response.attributed_passages)
        print(response.answerability_probability)

    You can also operate at Google's Document level.

    Example: Add texts to an existing Google Vector Store Document.

        doc_store = GoogleVectorStore(corpus_id="123", document_id="456")
        doc_store.add_documents(documents)

    Example: Create a new Google Vector Store Document.

        doc_store = GoogleVectorStore.create_document(
            corpus_id="123", document_id="456",
            display_name="My Google document")

    Example: Query the Google document.

        retriever = doc_store.as_retriever()
        retriever.get_relevant_documents("Who caught the gingerbread man?")

    For more details, see the class's methods.
    """

    # Retriever bound to the corpus or document this store operates on.
    _retriever: _SemanticRetriever
def __init__(
    self,
    *,
    corpus_id: str,
    document_id: Optional[str] = None,
    **kwargs: Any,
):
    """Bind the store to an existing Google Semantic Retriever entity.

    With only `corpus_id`, the store operates over every document in that
    corpus. With `document_id` as well, it operates over that one document.

    Args:
        corpus_id: ID of an existing corpus.
        document_id: ID of an existing document under the corpus, if the
            store should be limited to it.
        **kwargs: Forwarded to the parent class initialiser.

    Raises:
        DoesNotExistsException: If the IDs do not match anything on the
            Google server. In that case, consider `create_corpus` or
            `create_document` to create one.
    """
    super().__init__(**kwargs)
    retriever = _SemanticRetriever.from_ids(corpus_id, document_id)
    self._retriever = retriever
@classmethod
def create_corpus(
    cls,
    corpus_id: Optional[str] = None,
    display_name: Optional[str] = None,
) -> "GoogleVectorStore":
    """Create a Google Semantic Retriever corpus.

    Args:
        corpus_id: The ID to use for the new corpus. If not provided,
            Google server will provide one.
        display_name: The title of the new corpus. If not provided,
            Google server will provide one.

    Returns:
        A vector store instance pointing at the newly created corpus.
    """
    service = genaix.build_semantic_retriever()
    created = genaix.create_corpus(
        corpus_id=corpus_id,
        display_name=display_name,
        client=service,
    )
    # The store is keyed by the corpus ID parsed out of the returned name.
    entity = genaix.EntityName.from_str(created.name)
    return cls(corpus_id=entity.corpus_id)
@classmethod
def create_document(
    cls,
    corpus_id: str,
    document_id: Optional[str] = None,
    display_name: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> "GoogleVectorStore":
    """Create a Google Semantic Retriever document.

    Args:
        corpus_id: ID of an existing corpus.
        document_id: The ID to use for the new Google Semantic Retriever
            document. If not provided, Google server will provide one.
        display_name: The title of the new document. If not provided,
            Google server will provide one.
        metadata: Forwarded as-is to `genaix.create_document`.

    Returns:
        A vector store instance pointing at the newly created document.
    """
    service = genaix.build_semantic_retriever()
    created = genaix.create_document(
        corpus_id=corpus_id,
        document_id=document_id,
        display_name=display_name,
        metadata=metadata,
        client=service,
    )

    assert created.name is not None
    # Both IDs are parsed out of the name of the created document.
    entity = genaix.EntityName.from_str(created.name)
    return cls(corpus_id=entity.corpus_id, document_id=entity.document_id)
@classmethod
def from_texts(
    cls,
    texts: List[str],
    embedding: Optional[Embeddings] = None,
    metadatas: Optional[List[dict[str, Any]]] = None,
    *,
    corpus_id: Optional[str] = None,  # str required
    document_id: Optional[str] = None,  # str required
    **kwargs: Any,
) -> "GoogleVectorStore":
    """Load texts into an existing document and return a store for it.

    Args:
        texts: Texts to be loaded into the vector store.
        embedding: Not used by this method.
        metadatas: Optional metadata passed along with the texts.
        corpus_id: REQUIRED. Must be an existing corpus.
        document_id: REQUIRED. Must be an existing document.
        **kwargs: Forwarded to the class constructor.

    Returns:
        A vector store pointing to the specified Google Semantic Retriever
        Document.

    Raises:
        NotImplementedError: If `corpus_id` or `document_id` is missing.
        DoesNotExistsException: If the IDs do not match anything on the
            Google server.
    """
    have_both_ids = corpus_id is not None and document_id is not None
    if not have_both_ids:
        raise NotImplementedError(
            "Must provide an existing corpus ID and document ID"
        )

    store = cls(corpus_id=corpus_id, document_id=document_id, **kwargs)
    store.add_texts(texts, metadatas)
    return store
@property
def name(self) -> str:
    """The name of the Google entity behind this store, as a string.

    You shouldn't need to care about this unless you want to access your
    corpus or document via Google Generative AI API.
    """
    entity = self._retriever.name
    return str(entity)


@property
def corpus_id(self) -> str:
    """The corpus ID managed by this vector store."""
    entity = self._retriever.name
    return entity.corpus_id


@property
def document_id(self) -> Optional[str]:
    """The document ID managed by this vector store, if any."""
    entity = self._retriever.name
    return entity.document_id
def add_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[Dict[str, Any]]] = None,
    *,
    document_id: Optional[str] = None,
    **kwargs: Any,
) -> List[str]:
    """Add texts to the vector store.

    If the vector store points to a corpus (instead of a document), you must
    also provide a `document_id`. Extra keyword arguments are accepted but
    not used.

    Args:
        texts: Texts to add.
        metadatas: Optional metadata passed along with the texts.
        document_id: Document to add the texts to.

    Returns:
        Chunk's names created on Google servers.
    """
    chunk_names = self._retriever.add_texts(texts, metadatas, document_id)
    return chunk_names
def similarity_search(
    self,
    query: str,
    k: int = 4,
    filter: Optional[Dict[str, Any]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Search the vector store for relevant texts.

    Delegates to `similarity_search_with_score` and drops the scores.
    """
    scored = self.similarity_search_with_score(query, k, filter, **kwargs)
    documents = []
    for document, _score in scored:
        documents.append(document)
    return documents
def similarity_search_with_score(
    self,
    query: str,
    k: int = 4,
    filter: Optional[Dict[str, Any]] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Run similarity search and return each result with its score.

    Every text returned by the underlying retriever is wrapped in a
    `Document` and paired with the score the retriever reported for it.
    Extra keyword arguments are accepted but not used.
    """
    scored_documents = []
    for text, score in self._retriever.similarity_search(query, k, filter):
        scored_documents.append((Document(page_content=text), score))
    return scored_documents
def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
    """Delete chunks.

    Note that the "ids" are not corpus ID or document ID. Rather, these are
    the entity names returned by `add_texts`. Extra keyword arguments are
    accepted but not used.

    Returns:
        True if successful. Otherwise, you should get an exception anyway.
    """
    outcome = self._retriever.delete(ids)
    return outcome
def _select_relevance_score_fn(self) -> Callable[[float], float]:
    """Return the function that maps a raw score to a relevance score.

    TODO: Check with the team about this!

    The underlying vector store already returns a "score proper", i.e. one
    in [0, 1] where higher means more *similar*, so the score is passed
    through unchanged.
    """

    def _identity(score: float) -> float:
        return score

    return _identity
def as_aqa(self, **kwargs: Any) -> Runnable[str, AqaOutput]:
    """Construct a Google Generative AI AQA engine.

    The input string is fanned out as both the prompt and the query to this
    store's retriever. The result is converted by `_toAqaInput` and passed
    to `GenAIAqa`.

    All arguments are optional.

    Args:
        answer_style: See
            `google.ai.generativelanguage.GenerateAnswerRequest.AnswerStyle`.
        safety_settings: See `google.ai.generativelanguage.SafetySetting`.
        temperature: 0.0 to 1.0.
    """
    entry = RunnablePassthrough[str]()
    fan_out = {
        "prompt": RunnablePassthrough(),  # type: ignore[dict-item]
        "passages": self.as_retriever(),
    }
    # Compose left to right, one stage at a time.
    chain = entry | fan_out
    chain = chain | RunnableLambda(_toAqaInput)
    return chain | GenAIAqa(**kwargs)