Source code for langchain_community.document_loaders.base_o365
"""Base class for all loaders that uses O365 Package"""

from __future__ import annotations

import logging
import os
import tempfile
from abc import abstractmethod
from enum import Enum
from pathlib import Path, PurePath
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Sequence, Union

from langchain_core.pydantic_v1 import (
    BaseModel,
    BaseSettings,
    Field,
    FilePath,
    SecretStr,
)

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.blob_loaders.file_system import (
    FileSystemBlobLoader,
)
from langchain_community.document_loaders.blob_loaders.schema import Blob

if TYPE_CHECKING:
    from O365 import Account
    from O365.drive import Drive, Folder

logger = logging.getLogger(__name__)

# 5 MiB; used as the default value of ``O365BaseLoader.chunk_size``.
CHUNK_SIZE = 1024 * 1024 * 5


class _O365Settings(BaseSettings):
    """Client credentials for the O365 API, read from the environment."""

    # Populated from the O365_CLIENT_ID environment variable.
    client_id: str = Field(..., env="O365_CLIENT_ID")
    # Populated from the O365_CLIENT_SECRET environment variable; kept as a
    # SecretStr so the raw value is only exposed via get_secret_value().
    client_secret: SecretStr = Field(..., env="O365_CLIENT_SECRET")

    class Config:
        # Was misspelled "case_sentive", which pydantic silently ignored.
        case_sensitive = False
        env_file = ".env"
        env_prefix = ""


class _O365TokenStorage(BaseSettings):
    """Location of the stored O365 token file."""

    # Defaults to ~/.credentials/o365_token.txt; typed as pydantic FilePath.
    token_path: FilePath = Path.home() / ".credentials" / "o365_token.txt"


class _FileType(str, Enum):
    """File extensions the O365 loaders know how to map to a MIME type."""

    DOC = "doc"
    DOCX = "docx"
    PDF = "pdf"
def fetch_mime_types(file_types: Sequence[_FileType]) -> Dict[str, str]:
    """Look up the MIME type for each of the given file types.

    Args:
        file_types: File types to resolve. Only the ``value`` attribute of
            each entry is read.

    Returns:
        A dict from file-type value to MIME type, in the order the values
        were first seen. Entries whose value is not ``"doc"``, ``"docx"`` or
        ``"pdf"`` are left out.
    """
    known_mime_types = {
        "doc": "application/msword",
        "docx": (
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"  # noqa: E501
        ),
        "pdf": "application/pdf",
    }
    return {
        entry.value: known_mime_types[entry.value]
        for entry in file_types
        if entry.value in known_mime_types
    }
class O365BaseLoader(BaseLoader, BaseModel):
    """Base class for all loaders that uses O365 Package"""

    settings: _O365Settings = Field(default_factory=_O365Settings)  # type: ignore[arg-type]
    """Settings for the Office365 API client."""
    auth_with_token: bool = False
    """Whether to authenticate with a token or not. Defaults to False."""
    chunk_size: Union[int, str] = CHUNK_SIZE
    """Number of bytes to retrieve from each api call to the server. int or 'auto'."""
    recursive: bool = False
    """Should the loader recursively load subfolders?"""

    @property
    @abstractmethod
    def _file_types(self) -> Sequence[_FileType]:
        """Return supported file types."""

    @property
    def _fetch_mime_types(self) -> Dict[str, str]:
        """Return a dict of supported file types to corresponding mime types."""
        return fetch_mime_types(self._file_types)

    @property
    @abstractmethod
    def _scopes(self) -> List[str]:
        """Return required scopes."""

    def _download_if_supported(
        self,
        file: Any,
        temp_dir: str,
        allowed_mime_types: Iterable[str],
        metadata_dict: Dict[str, Dict[str, Any]],
    ) -> None:
        """Download ``file`` into ``temp_dir`` if it is a supported file.

        Items that are not files, or whose MIME type is not in
        ``allowed_mime_types``, are skipped. For every downloaded file a
        metadata entry keyed by ``file.name`` is added to ``metadata_dict``.
        """
        if not file.is_file or file.mime_type not in allowed_mime_types:
            return
        file.download(to_path=temp_dir, chunk_size=self.chunk_size)
        metadata_dict[file.name] = {
            "source": file.web_url,
            "mime_type": file.mime_type,
            "created": file.created,
            "modified": file.modified,
            "created_by": str(file.created_by),
            "modified_by": str(file.modified_by),
            "description": file.description,
            "id": str(file.object_id),
        }

    def _yield_blobs_with_metadata(
        self, temp_dir: str, metadata_dict: Dict[str, Dict[str, Any]]
    ) -> Iterable[Blob]:
        """Yield a Blob for each file in ``temp_dir``, enriched with metadata.

        The metadata entry is looked up by the blob's file name; blobs with
        no matching entry are yielded with their metadata unchanged.

        Raises:
            NotImplementedError: If a blob's path is not a ``PurePath``.
        """
        loader = FileSystemBlobLoader(path=temp_dir)
        for blob in loader.yield_blobs():
            if not isinstance(blob.path, PurePath):
                raise NotImplementedError("Expected blob path to be a PurePath")
            blob.metadata.update(metadata_dict.get(str(blob.path.name), {}))
            yield blob

    def _load_from_folder(self, folder: Folder) -> Iterable[Blob]:
        """Lazily load all files of a supported MIME type from a folder.

        Args:
            folder: The Folder instance whose items are downloaded.

        Yields:
            A Blob for every downloaded file. When ``self.recursive`` is
            true, blobs from all child folders follow.
        """
        # Build the membership set once instead of once per item.
        allowed_mime_types = frozenset(self._fetch_mime_types.values())
        items = folder.get_items()
        metadata_dict: Dict[str, Dict[str, Any]] = {}
        with tempfile.TemporaryDirectory() as temp_dir:
            for file in items:
                self._download_if_supported(
                    file, temp_dir, allowed_mime_types, metadata_dict
                )
            # Blobs must be yielded while the temporary directory still exists.
            yield from self._yield_blobs_with_metadata(temp_dir, metadata_dict)
        # NOTE(review): original indentation was lost in this copy; the
        # recursive descent is placed after the temp dir is released - confirm.
        if self.recursive:
            for subfolder in folder.get_child_folders():
                yield from self._load_from_folder(subfolder)

    def _load_from_object_ids(
        self, drive: Drive, object_ids: List[str]
    ) -> Iterable[Blob]:
        """Lazily load files specified by their object_ids from a drive.

        Args:
            drive: The Drive instance the items are fetched from.
            object_ids: Identifiers passed one by one to ``drive.get_item``.

        Yields:
            A Blob for every downloaded file. Ids for which the drive returns
            nothing are logged with a warning and skipped.
        """
        # Build the membership set once instead of once per item.
        allowed_mime_types = frozenset(self._fetch_mime_types.values())
        metadata_dict: Dict[str, Dict[str, Any]] = {}
        with tempfile.TemporaryDirectory() as temp_dir:
            for object_id in object_ids:
                file = drive.get_item(object_id)
                if not file:
                    # The original message lacked the space before "object_id"
                    # and was sent to the root logger.
                    logger.warning(
                        "There isn't a file with object_id %s in drive %s.",
                        object_id,
                        drive,
                    )
                    continue
                self._download_if_supported(
                    file, temp_dir, allowed_mime_types, metadata_dict
                )
            # Blobs must be yielded while the temporary directory still exists.
            yield from self._yield_blobs_with_metadata(temp_dir, metadata_dict)

    def _auth(self) -> Account:
        """Authenticates the OneDrive API client

        Returns:
            The Account object built from the configured client credentials.

        Raises:
            ImportError: If the ``O365`` package is not installed.
        """
        try:
            from O365 import Account, FileSystemTokenBackend
        except ImportError as exc:
            raise ImportError(
                "O365 package not found, please install it with `pip install o365`"
            ) from exc

        if self.auth_with_token:
            token_path = _O365TokenStorage().token_path
            token_backend = FileSystemTokenBackend(
                token_path=token_path.parent, token_filename=token_path.name
            )
        else:
            token_backend = FileSystemTokenBackend(
                token_path=Path.home() / ".credentials"
            )

        account = Account(
            credentials=(
                self.settings.client_id,
                self.settings.client_secret.get_secret_value(),
            ),
            scopes=self._scopes,
            token_backend=token_backend,
            **{"raise_http_errors": False},
        )
        # NOTE(review): original indentation was lost in this copy; the
        # authenticate() call is kept in the non-token branch only - confirm.
        if not self.auth_with_token:
            # make the auth
            account.authenticate()
        return account