Source code for langchain_community.retrievers.google_cloud_documentai_warehouse
"""Retriever wrapper for Google Cloud Document AI Warehouse."""fromtypingimportTYPE_CHECKING,Any,Dict,List,Optionalfromlangchain_core._api.deprecationimportdeprecatedfromlangchain_core.callbacksimportCallbackManagerForRetrieverRunfromlangchain_core.documentsimportDocumentfromlangchain_core.retrieversimportBaseRetrieverfromlangchain_core.utilsimportget_from_dict_or_env,pre_initfromlangchain_community.utilities.vertexaiimportget_client_infoifTYPE_CHECKING:fromgoogle.cloud.contentwarehouse_v1import(DocumentServiceClient,RequestMetadata,SearchDocumentsRequest,)fromgoogle.cloud.contentwarehouse_v1.services.document_service.pagersimport(SearchDocumentsPager,)
@deprecated(
    since="0.0.32",
    removal="1.0",
    alternative_import="langchain_google_community.DocumentAIWarehouseRetriever",
)
class GoogleDocumentAIWarehouseRetriever(BaseRetriever):
    """A retriever based on Document AI Warehouse.

    Documents must be created and uploaded in a separate flow; this
    retriever only uses the provided Document AI ``schema_id`` to search
    for relevant documents.

    Every retrieval call must pass a ``user_ldap`` keyword argument, which
    is sent to the service as the requesting user's identity.

    More info: https://cloud.google.com/document-ai-warehouse.
    """

    location: str = "us"
    """Google Cloud location where Document AI Warehouse is placed."""
    project_number: str
    """Google Cloud project number, should contain digits only."""
    schema_id: Optional[str] = None
    """Document AI Warehouse schema to query against.
    If nothing is provided, all documents in the project will be searched."""
    qa_size_limit: int = 5
    """The limit on the number of documents returned."""
    client: "DocumentServiceClient" = None  #: :meta private:

    @pre_init
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate the environment and build the Document AI Warehouse client.

        Args:
            values: Field values supplied to the constructor.

        Returns:
            The same mapping with ``project_number`` resolved (from the
            supplied values or the ``PROJECT_NUMBER`` environment variable)
            and ``client`` set to a new ``DocumentServiceClient``.

        Raises:
            ImportError: If ``google-cloud-contentwarehouse`` is not installed.
        """
        try:
            from google.cloud.contentwarehouse_v1 import DocumentServiceClient
        except ImportError as exc:
            # The two literals are concatenated; the trailing space keeps the
            # sentences from running together in the rendered message.
            raise ImportError(
                "google.cloud.contentwarehouse is not installed. "
                "Please install it with pip install google-cloud-contentwarehouse"
            ) from exc

        values["project_number"] = get_from_dict_or_env(
            values, "project_number", "PROJECT_NUMBER"
        )
        values["client"] = DocumentServiceClient(
            client_info=get_client_info(module="document-ai-warehouse")
        )
        return values

    def _prepare_request_metadata(self, user_ldap: str) -> "RequestMetadata":
        """Build the request metadata identifying the calling user.

        Args:
            user_ldap: User identifier; sent as ``user:<user_ldap>``.

        Returns:
            A ``RequestMetadata`` wrapping the corresponding ``UserInfo``.
        """
        from google.cloud.contentwarehouse_v1 import RequestMetadata, UserInfo

        return RequestMetadata(user_info=UserInfo(id=f"user:{user_ldap}"))

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun, **kwargs: Any
    ) -> List[Document]:
        """Search Document AI Warehouse and return the matches as documents.

        Args:
            query: Text to search for.
            run_manager: Callback manager for this retriever run (not used
                by this implementation).
            **kwargs: Forwarded to ``_prepare_search_request``; must include
                ``user_ldap``.

        Returns:
            One ``Document`` per matching document in the search response.

        Raises:
            ValueError: If ``user_ldap`` is not supplied in ``kwargs``.
        """
        request = self._prepare_search_request(query, **kwargs)
        response = self.client.search_documents(request=request)
        return self._parse_search_response(response=response)

    def _prepare_search_request(
        self, query: str, **kwargs: Any
    ) -> "SearchDocumentsRequest":
        """Assemble the ``SearchDocumentsRequest`` for ``query``.

        Args:
            query: Text to search for.
            **kwargs: Must contain ``user_ldap``, the identifier of the user
                on whose behalf the search is issued.

        Returns:
            A request scoped to this retriever's project and location,
            restricted to ``schema_id`` when one is configured.

        Raises:
            ValueError: If ``user_ldap`` is missing from ``kwargs``.
        """
        from google.cloud.contentwarehouse_v1 import (
            DocumentQuery,
            SearchDocumentsRequest,
        )

        try:
            user_ldap = kwargs["user_ldap"]
        except KeyError:
            # ``from None`` hides the internal KeyError, which would otherwise
            # show up as a misleading "during handling of the above
            # exception" section in the caller's traceback.
            raise ValueError("Argument user_ldap should be provided!") from None

        request_metadata = self._prepare_request_metadata(user_ldap=user_ldap)

        # An empty schema list places no schema restriction on the query.
        schemas = (
            [
                self.client.document_schema_path(
                    project=self.project_number,
                    location=self.location,
                    document_schema=self.schema_id,
                )
            ]
            if self.schema_id
            else []
        )

        return SearchDocumentsRequest(
            parent=self.client.common_location_path(
                self.project_number, self.location
            ),
            request_metadata=request_metadata,
            document_query=DocumentQuery(
                query=query, is_nl_query=True, document_schema_names=schemas
            ),
            qa_size_limit=self.qa_size_limit,
        )

    def _parse_search_response(
        self, response: "SearchDocumentsPager"
    ) -> List[Document]:
        """Convert a search response into LangChain documents.

        Args:
            response: Result of ``client.search_documents``.

        Returns:
            One ``Document`` per entry in ``response.matching_documents``,
            with the search text snippet as ``page_content`` and the
            document title and raw document path as ``title`` / ``source``
            metadata.
        """
        return [
            Document(
                page_content=match.search_text_snippet,
                metadata={
                    "title": match.document.title,
                    "source": match.document.raw_document_path,
                },
            )
            for match in response.matching_documents
        ]