# Prerequisites:
# 1. Create a Google Cloud project
# 2. Enable the Google Drive API:
#   https://console.cloud.google.com/flows/enableapi?apiid=drive.googleapis.com
# 3. Authorize credentials for desktop app:
#   https://developers.google.com/drive/api/quickstart/python#authorize_credentials_for_a_desktop_application # noqa: E501
# 4. For service accounts visit
#   https://cloud.google.com/iam/docs/service-accounts-create

import os
from pathlib import Path
from typing import Any, ClassVar, Dict, List, Optional, Sequence, Tuple, Union

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from pydantic import BaseModel, field_validator, model_validator


class GoogleDriveLoader(BaseLoader, BaseModel):
    """Load Google Docs from `Google Drive`.

    Exactly one source must be configured: ``folder_id``, or ``document_ids``
    and/or ``file_ids`` (``folder_id`` cannot be combined with either list).
    Google Docs and Slides are exported as plain text, Sheets are turned into
    one document per data row, and PDFs (or any file when ``file_loader_cls``
    is set) are downloaded and parsed.
    """

    # Generated from https://developers.google.com/drive/api/guides/api-specific-auth
    # limiting to the scopes that are required to read the files
    VALID_SCOPES: ClassVar[Tuple[str, ...]] = (
        "https://www.googleapis.com/auth/drive.file",
        "https://www.googleapis.com/auth/drive.readonly",
        "https://www.googleapis.com/auth/drive.meet.readonly",
        "https://www.googleapis.com/auth/drive.metadata.readonly",
        "https://www.googleapis.com/auth/drive.metadata",
    )

    service_account_key: Path = Path.home() / ".credentials" / "keys.json"
    """Path to the service account key file."""
    credentials_path: Path = Path.home() / ".credentials" / "credentials.json"
    """Path to the credentials file."""
    token_path: Path = Path.home() / ".credentials" / "token.json"
    """Path to the token file."""
    credentials: Any = None
    """Your own google credentials created via your own mechanism"""
    folder_id: Optional[str] = None
    """The folder id to load from."""
    document_ids: Optional[List[str]] = None
    """The document ids to load from."""
    file_ids: Optional[List[str]] = None
    """The file ids to load from."""
    recursive: bool = False
    """Whether to load recursively. Only applies when folder_id is given."""
    file_types: Optional[Sequence[str]] = None
    """The file types to load. Only applies when folder_id is given."""
    load_trashed_files: bool = False
    """Whether to load trashed files. Only applies when folder_id is given."""
    # NOTE(MthwRobinson) - changing the file_loader_cls to type here currently
    # results in pydantic validation errors
    file_loader_cls: Any = None
    """The file loader class to use."""
    file_loader_kwargs: Dict[str, Any] = {}
    """The file loader kwargs to use."""
    load_auth: bool = False
    """Whether to load authorization identities."""
    load_extended_metadata: bool = False
    """Whether to load extended metadata."""
    scopes: List[str] = ["https://www.googleapis.com/auth/drive.file"]
    """The credential scopes to use for Google Drive API access.
    Default is drive.file scope."""

    def _build_drive_service(self) -> Tuple[Any, Any]:
        """Build a Drive v3 client for the metadata helpers.

        Returns:
            A ``(service, HttpError)`` pair: the Drive client and the client
            library's HTTP error class, so callers can catch it without
            repeating the guarded import.

        Raises:
            ImportError: If ``google-api-python-client`` is not installed.
        """
        try:
            from googleapiclient.discovery import build  # type: ignore[import]
            from googleapiclient.errors import HttpError  # type: ignore[import]
        except ImportError as exc:
            raise ImportError(
                "You must run "
                "`pip install --upgrade "
                "google-api-python-client` "
                "to load authorization identities."
            ) from exc

        creds = self._load_credentials()
        return build("drive", "v3", credentials=creds), HttpError

    def _collect_optional_metadata(self, id: str) -> Dict[str, Any]:
        """Gather the metadata enabled by ``load_auth`` / ``load_extended_metadata``.

        Keys are inserted in the order ``authorized_identities``, ``owner``,
        ``size``, ``full_path``; a key is present only when its flag is set.
        """
        extra: Dict[str, Any] = {}
        if self.load_auth:
            extra["authorized_identities"] = self._get_identity_metadata_from_id(id)
        if self.load_extended_metadata:
            extra["owner"] = self._get_owner_metadata_from_id(id)
            extra["size"] = self._get_file_size_from_id(id)
            extra["full_path"] = self._get_file_path_from_id(id)
        return extra

    def _get_file_size_from_id(self, id: str) -> str:
        """Fetch the size of the file.

        Returns the ``size`` field reported by Drive, or ``"unknown"`` when the
        lookup fails for any reason (the failure is printed, not raised).
        """
        service, http_error = self._build_drive_service()
        try:
            file = service.files().get(fileId=id, fields="size").execute()
            return file["size"]
        except http_error:
            print(  # noqa: T201
                "insufficientFilePermissions: The user does not have sufficient "
                f"permissions to retrieve size for the file with fileId: {id}"
            )
            return "unknown"
        except Exception as exc:
            print(  # noqa: T201
                f"Error occurred while fetching the size for the file with fileId: {id}"
            )
            print(f"Error: {exc}")  # noqa: T201
            return "unknown"

    def _get_owner_metadata_from_id(self, id: str) -> str:
        """Fetch the owner of the file.

        Returns the e-mail address of the first listed owner, or ``"unknown"``
        when the lookup fails (the failure is printed, not raised).
        """
        service, http_error = self._build_drive_service()
        try:
            file = service.files().get(fileId=id, fields="owners").execute()
            return file["owners"][0].get("emailAddress")
        except http_error:
            print(  # noqa: T201
                "insufficientFilePermissions: The user does not have sufficient "
                f"permissions to retrieve owner for the file with fileId: {id}"
            )
            return "unknown"
        except Exception as exc:
            print(  # noqa: T201
                "Error occurred while fetching the owner for the file with fileId: "
                f"{id} with error: {exc}"
            )
            return "unknown"

    def _get_file_path_from_id(self, id: str) -> str:
        """Fetch the full path of the file starting from the root.

        Walks up the first parent of each item until an item without parents
        is reached. On an HTTP error the walk stops and the partial path
        collected so far is returned.
        """
        service, http_error = self._build_drive_service()
        path: List[str] = []
        current_id = id
        while True:
            try:
                file = (
                    service.files()
                    .get(fileId=current_id, fields="name, parents")
                    .execute()
                )
                path.append(file["name"])
                if "parents" in file:
                    current_id = file["parents"][0]
                else:
                    break
            except http_error:
                print(  # noqa: T201
                    "insufficientFilePermissions: The user does not have sufficient "
                    f"permissions to retrieve path for the file with fileId: {id}"
                )
                break
        # Names were collected leaf-first; flip them to root-first.
        path.reverse()
        return "/".join(path)

    def _get_identity_metadata_from_id(self, id: str) -> List[str]:
        """Fetch the list of people having access to ID file.

        Returns the e-mail addresses attached to the file's permissions.
        Permissions without an e-mail address are skipped. If listing the
        permissions fails, an empty list is returned and the failure printed.
        """
        authorized_identities: list = []
        service, http_error = self._build_drive_service()
        try:
            permissions = service.permissions().list(fileId=id).execute()
        except http_error:
            print(  # noqa: T201
                "insufficientFilePermissions: The user does not have sufficient "
                f"permissions to retrieve permission for the file with fileId: {id}"
            )
            return authorized_identities
        except Exception as exc:
            print(  # noqa: T201
                "Error occurred while fetching the permissions for the file with "
                f"fileId: {id}"
            )
            print(f"Error: {exc}")  # noqa: T201
            return authorized_identities

        for perm in permissions.get("permissions", {}):
            email_id = (
                service.permissions()
                .get(fileId=id, permissionId=perm.get("id", ""), fields="emailAddress")
                .execute()
                .get("emailAddress")
            )
            if email_id:
                authorized_identities.append(email_id)

        return authorized_identities

    @model_validator(mode="before")
    @classmethod
    def validate_inputs(cls, values: Dict[str, Any]) -> Any:
        """Validate the source selection and normalise ``file_types``.

        Raises:
            ValueError: If ``folder_id`` is combined with ``document_ids`` or
                ``file_ids``, if none of the three is given, if ``file_types``
                is given without ``folder_id``, or if a file type is not
                supported.
        """
        if values.get("folder_id") and (
            values.get("document_ids") or values.get("file_ids")
        ):
            raise ValueError(
                "Cannot specify both folder_id and document_ids nor "
                "folder_id and file_ids"
            )
        if (
            not values.get("folder_id")
            and not values.get("document_ids")
            and not values.get("file_ids")
        ):
            raise ValueError("Must specify either folder_id, document_ids, or file_ids")

        file_types = values.get("file_types")
        if file_types:
            if values.get("document_ids") or values.get("file_ids"):
                raise ValueError(
                    "file_types can only be given when folder_id is given,"
                    " (not when document_ids or file_ids are given)."
                )
            type_mapping = {
                "document": "application/vnd.google-apps.document",
                "sheet": "application/vnd.google-apps.spreadsheet",
                "pdf": "application/pdf",
                "presentation": "application/vnd.google-apps.presentation",
            }
            allowed_types = list(type_mapping.keys()) + list(type_mapping.values())
            short_names = ", ".join([f"'{x}'" for x in type_mapping.keys()])
            full_names = ", ".join([f"'{x}'" for x in type_mapping.values()])
            for file_type in file_types:
                if file_type not in allowed_types:
                    raise ValueError(
                        f"Given file type {file_type} is not supported. "
                        f"Supported values are: {short_names}; and "
                        f"their full-form names: {full_names}"
                    )

            # replace short-form file types by full-form file types
            values["file_types"] = [
                type_mapping.get(file_type, file_type) for file_type in file_types
            ]
        return values

    @field_validator("credentials_path")
    @classmethod
    def validate_credentials_path(cls, v: Any, **kwargs: Any) -> Any:
        """Validate that credentials_path exists."""
        if not v.exists():
            raise ValueError(f"credentials_path {v} does not exist")
        return v

    @field_validator("scopes")
    @classmethod
    def validate_scopes(cls, v: List[str]) -> List[str]:
        """Validate that the provided scopes are not empty and
        are valid Google Drive API scopes."""
        if not v:
            raise ValueError("At least one scope must be provided")

        invalid_scopes = [scope for scope in v if scope not in cls.VALID_SCOPES]
        if invalid_scopes:
            raise ValueError(
                f"Invalid Google Drive API scope(s): {', '.join(invalid_scopes)}. "
                f"Valid scopes are: {', '.join(cls.VALID_SCOPES)}"
            )
        return v

    def _load_credentials(self) -> Any:
        """Load credentials.

        Sources are tried in this order:

        1. ``service_account_key`` file, if it exists.
        2. ``credentials`` passed in by the caller.
        3. Cached user token at ``token_path`` (refreshed when expired).
        4. ``google.auth.default()`` when ``GOOGLE_APPLICATION_CREDENTIALS``
           is not set in the environment.
        5. Interactive installed-app flow using ``credentials_path``.

        Credentials obtained through a refresh or the interactive flow are
        written back to ``token_path``.

        Raises:
            ImportError: If the Google auth libraries are not installed.
        """
        # Adapted from https://developers.google.com/drive/api/v3/quickstart/python
        try:
            from google.auth import default  # type: ignore[import]
            from google.auth.transport.requests import Request  # type: ignore[import]
            from google.oauth2 import service_account  # type: ignore[import]
            from google.oauth2.credentials import Credentials  # type: ignore[import]
            from google_auth_oauthlib.flow import (  # type: ignore[import]
                InstalledAppFlow,
            )
        except ImportError as exc:
            raise ImportError(
                "Could not execute GoogleDriveLoader. "
                "Please, install drive dependency group: "
                "`pip install langchain-google-community[drive]`"
            ) from exc

        creds = None
        if self.service_account_key.exists():
            return service_account.Credentials.from_service_account_file(
                str(self.service_account_key), scopes=self.scopes
            )

        if self.token_path.exists():
            creds = Credentials.from_authorized_user_file(
                str(self.token_path), self.scopes
            )

        if self.credentials:
            # use whatever was passed to us
            return self.credentials

        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            # NOTE(review): application-default credentials are used when the
            # env var is *absent*; confirm this polarity is intended.
            elif "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ:
                creds, _ = default()
                creds = creds.with_scopes(self.scopes)
                # no need to write to file
                if creds:
                    return creds
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    str(self.credentials_path), self.scopes
                )
                creds = flow.run_local_server(port=0)
            with open(self.token_path, "w") as token:
                token.write(creds.to_json())

        return creds

    def _load_sheet_from_id(self, id: str) -> List[Document]:
        """Load a sheet and all tabs from an ID.

        The first row of every tab is treated as the header. Each following
        row becomes one document whose content is ``"<header>: <cell>"`` lines.
        Tabs without any values are skipped.
        """
        from googleapiclient.discovery import build  # type: ignore[import]

        creds = self._load_credentials()
        sheets_service = build("sheets", "v4", credentials=creds)
        spreadsheet = sheets_service.spreadsheets().get(spreadsheetId=id).execute()
        sheets = spreadsheet.get("sheets", [])
        extra_metadata = self._collect_optional_metadata(id)

        documents = []
        for sheet in sheets:
            sheet_name = sheet["properties"]["title"]
            result = (
                sheets_service.spreadsheets()
                .values()
                .get(spreadsheetId=id, range=sheet_name)
                .execute()
            )
            values = result.get("values", [])
            if not values:
                continue  # empty sheet

            header = values[0]
            for i, row in enumerate(values[1:], start=1):
                metadata = {
                    "source": (
                        f"https://docs.google.com/spreadsheets/d/{id}/"
                        f"edit?gid={sheet['properties']['sheetId']}"
                    ),
                    "title": f"{spreadsheet['properties']['title']} - {sheet_name}",
                    "row": i,
                }
                metadata.update(extra_metadata)
                content = []
                for j, v in enumerate(row):
                    # A row may be longer than the header; such cells get an
                    # empty title.
                    title = header[j].strip() if len(header) > j else ""
                    content.append(f"{title}: {v.strip()}")

                page_content = "\n".join(content)
                documents.append(Document(page_content=page_content, metadata=metadata))

        return documents

    def _load_document_from_id(self, id: str) -> Document:
        """Load a document from an ID.

        The file is exported as ``text/plain``. If the export fails with an
        HTTP error the error is printed and the document is built from
        whatever bytes were downloaded before the failure.
        """
        from io import BytesIO

        from googleapiclient.discovery import build  # type: ignore[import]
        from googleapiclient.errors import HttpError  # type: ignore[import]
        from googleapiclient.http import MediaIoBaseDownload  # type: ignore[import]

        creds = self._load_credentials()
        service = build("drive", "v3", credentials=creds)
        extra_metadata = self._collect_optional_metadata(id)

        file = (
            service.files()
            .get(
                fileId=id,
                supportsAllDrives=True,
                fields="modifiedTime,name,webViewLink",
            )
            .execute()
        )
        request = service.files().export_media(fileId=id, mimeType="text/plain")
        fh = BytesIO()
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        try:
            while done is False:
                _, done = downloader.next_chunk()
        except HttpError as e:
            if e.resp.status == 404:
                print("File not found: {}".format(id))  # noqa: T201
            else:
                print("An error occurred: {}".format(e))  # noqa: T201

        text = fh.getvalue().decode("utf-8")
        metadata: Dict[str, Any] = {
            "source": f"{file.get('webViewLink')}",
            "title": f"{file.get('name')}",
            "when": f"{file.get('modifiedTime')}",
        }
        metadata.update(extra_metadata)
        return Document(page_content=text, metadata=metadata)

    def _load_documents_from_folder(
        self, folder_id: str, *, file_types: Optional[Sequence[str]] = None
    ) -> List[Document]:
        """Load documents from a folder.

        Args:
            folder_id: Drive folder to list.
            file_types: Optional MIME types to keep; when falsy, every listed
                file is considered.

        Returns:
            Documents for every supported file. Trashed files are skipped
            unless ``load_trashed_files`` is set; unsupported MIME types are
            ignored unless ``file_loader_cls`` is configured.
        """
        from googleapiclient.discovery import build  # type: ignore[import]

        creds = self._load_credentials()
        service = build("drive", "v3", credentials=creds)
        files = self._fetch_files_recursive(service, folder_id)
        # If file types filter is provided, we'll filter by the file type.
        if file_types:
            _files = [f for f in files if f["mimeType"] in file_types]  # type: ignore
        else:
            _files = files

        returns = []
        for file in _files:
            if file["trashed"] and not self.load_trashed_files:
                continue
            elif file["mimeType"] in [
                "application/vnd.google-apps.document",
                "application/vnd.google-apps.presentation",
            ]:
                returns.append(self._load_document_from_id(file["id"]))  # type: ignore
            elif file["mimeType"] == "application/vnd.google-apps.spreadsheet":
                returns.extend(self._load_sheet_from_id(file["id"]))  # type: ignore
            elif (
                file["mimeType"] == "application/pdf"
                or self.file_loader_cls is not None
            ):
                returns.extend(self._load_file_from_id(file["id"]))  # type: ignore
        return returns

    def _fetch_files_recursive(
        self, service: Any, folder_id: str
    ) -> List[Dict[str, Union[str, List[str]]]]:
        """Fetch all files and subfolders recursively.

        Every result page of the folder listing is followed via
        ``nextPageToken``, so folders holding more than one page of entries
        are returned in full. Sub-folders are descended into only when
        ``recursive`` is set; folder entries themselves are never returned.
        """
        returns: List[Dict[str, Union[str, List[str]]]] = []
        page_token: Optional[str] = None
        while True:
            list_kwargs: Dict[str, Any] = {
                "q": f"'{folder_id}' in parents",
                "pageSize": 1000,
                "includeItemsFromAllDrives": True,
                "supportsAllDrives": True,
                "fields": "nextPageToken, files(id, name, mimeType, parents, trashed)",
            }
            # Only send pageToken on follow-up requests so the first request
            # is unchanged.
            if page_token:
                list_kwargs["pageToken"] = page_token
            results = service.files().list(**list_kwargs).execute()

            for file in results.get("files", []):
                if file["mimeType"] == "application/vnd.google-apps.folder":
                    if self.recursive:
                        returns.extend(
                            self._fetch_files_recursive(service, file["id"])
                        )
                else:
                    returns.append(file)

            page_token = results.get("nextPageToken")
            if not page_token:
                break
        return returns

    def _load_documents_from_ids(self) -> List[Document]:
        """Load documents from a list of IDs.

        Raises:
            ValueError: If ``document_ids`` is empty or unset.
        """
        if not self.document_ids:
            raise ValueError("document_ids must be set")

        return [self._load_document_from_id(doc_id) for doc_id in self.document_ids]

    def _load_file_from_id(self, id: str) -> List[Document]:
        """Load a file from an ID.

        The raw file is downloaded into memory. When ``file_loader_cls`` is
        set it is instantiated with ``file=<buffer>`` plus
        ``file_loader_kwargs`` and its documents are returned with Drive
        metadata added. Otherwise the bytes are parsed as a PDF with PyPDF2,
        producing one document per page.
        """
        from io import BytesIO

        from googleapiclient.discovery import build  # type: ignore[import]
        from googleapiclient.http import MediaIoBaseDownload  # type: ignore[import]

        creds = self._load_credentials()
        service = build("drive", "v3", credentials=creds)
        extra_metadata = self._collect_optional_metadata(id)

        file = service.files().get(fileId=id, supportsAllDrives=True).execute()
        request = service.files().get_media(fileId=id)
        fh = BytesIO()
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while done is False:
            _, done = downloader.next_chunk()

        if self.file_loader_cls is not None:
            # Rewind so the custom loader reads from the start of the buffer.
            fh.seek(0)
            loader = self.file_loader_cls(file=fh, **self.file_loader_kwargs)
            docs = loader.load()
            for doc in docs:
                doc.metadata["source"] = f"https://drive.google.com/file/d/{id}/view"
                if "title" not in doc.metadata:
                    doc.metadata["title"] = f"{file.get('name')}"
                doc.metadata.update(extra_metadata)
            return docs

        from PyPDF2 import PdfReader  # type: ignore[import]

        content = fh.getvalue()
        pdf_reader = PdfReader(BytesIO(content))

        docs = []
        for i, page in enumerate(pdf_reader.pages):
            metadata: Dict[str, Any] = {
                "source": f"https://drive.google.com/file/d/{id}/view",
                "title": f"{file.get('name')}",
                "page": i,
            }
            metadata.update(extra_metadata)
            docs.append(
                Document(
                    page_content=page.extract_text(),
                    metadata=metadata,
                )
            )
        return docs

    def _load_file_from_ids(self) -> List[Document]:
        """Load files from a list of IDs.

        Raises:
            ValueError: If ``file_ids`` is empty or unset.
        """
        if not self.file_ids:
            raise ValueError("file_ids must be set")
        docs = []
        for file_id in self.file_ids:
            docs.extend(self._load_file_from_id(file_id))
        return docs