# Source code for langchain_community.document_loaders.base_o365
"""Base class for all loaders that uses O365 Package"""from__future__importannotationsimportloggingimportmimetypesimportosimportreimporttempfileimporturllibfromabcimportabstractmethodfromdatetimeimportdatetimefrompathlibimportPath,PurePathfromtypingimportTYPE_CHECKING,Any,Dict,Iterable,List,Optional,Sequence,Unionfrompydanticimport(BaseModel,Field,FilePath,PrivateAttr,SecretStr,)frompydantic_settingsimportBaseSettings,SettingsConfigDictfromlangchain_community.document_loaders.baseimportBaseBlobParser,BaseLoaderfromlangchain_community.document_loaders.blob_loaders.file_systemimport(FileSystemBlobLoader,)fromlangchain_community.document_loaders.blob_loaders.schemaimportBlobfromlangchain_community.document_loaders.parsers.genericimportMimeTypeBasedParserfromlangchain_community.document_loaders.parsers.registryimportget_parserifTYPE_CHECKING:fromO365importAccountfromO365.driveimportDrive,Folderlogger=logging.getLogger(__name__)CHUNK_SIZE=1024*1024*5class_O365Settings(BaseSettings):client_id:str=Field(...,alias="O365_CLIENT_ID")client_secret:SecretStr=Field(...,alias="O365_CLIENT_SECRET")model_config=SettingsConfigDict(case_sensitive=False,env_file=".env",env_prefix="",extra="ignore")class_O365TokenStorage(BaseSettings):token_path:FilePath=Path.home()/".credentials"/"o365_token.txt"
def fetch_mime_types(file_types: Sequence[str]) -> Dict[str, str]:
    """Fetch the mime types for the specified file types."""
    mapping: Dict[str, str] = {}
    for extension in file_types:
        guessed, _ = mimetypes.guess_type(f"file.{extension}")
        if not guessed:
            raise ValueError(f"Unknown mimetype of extension {extension}")
        mapping[extension] = guessed
    return mapping
def fetch_extensions(mime_types: Sequence[str]) -> Dict[str, str]:
    """Fetch the file extensions for the specified mime types.

    Returns a dict mapping extension (without the leading dot) to its mime
    type. Raises ValueError for a mime type with no known extension.
    (Docstring fixed: it was copy-pasted from ``fetch_mime_types`` and
    described the inverse mapping.)
    """
    mime_types_mapping: Dict[str, str] = {}
    for mime_type in mime_types:
        ext = mimetypes.guess_extension(mime_type)
        if ext:
            mime_types_mapping[ext[1:]] = mime_type  # ignore leading `.`
        else:
            raise ValueError(f"Unknown mimetype {mime_type}")
    return mime_types_mapping
class O365BaseLoader(BaseLoader, BaseModel):
    """Base class for all loaders that uses O365 Package"""

    settings: _O365Settings = Field(default_factory=_O365Settings)  # type: ignore[arg-type]
    """Settings for the Office365 API client."""
    auth_with_token: bool = False
    """Whether to authenticate with a token or not. Defaults to False."""
    chunk_size: Union[int, str] = CHUNK_SIZE
    """Number of bytes to retrieve from each api call to the server. int or 'auto'."""
    recursive: bool = False
    """Should the loader recursively load subfolders?"""
    modified_since: Optional[datetime] = None
    """Only fetch documents modified since given datetime.
    The datetime object must be timezone aware."""
    handlers: Optional[Dict[str, Any]] = {}
    """
    Provide custom handlers for MimeTypeBasedParser.

    Pass a dictionary mapping either file extensions (like "doc", "pdf", etc.)
    or MIME types (like "application/pdf", "text/plain", etc.) to parsers.
    Note that you must use either file extensions or MIME types exclusively and
    cannot mix them.

    Do not include the leading dot for file extensions.

    Example using file extensions:
    ```python
    handlers = {
        "doc": MsWordParser(),
        "pdf": PDFMinerParser(),
        "txt": TextParser()
    }
    ```

    Example using MIME types:
    ```python
    handlers = {
        "application/msword": MsWordParser(),
        "application/pdf": PDFMinerParser(),
        "text/plain": TextParser()
    }
    ```
    """

    _blob_parser: BaseBlobParser = PrivateAttr()
    _file_types: Sequence[str] = PrivateAttr()
    _mime_types: Dict[str, str] = PrivateAttr()

    def __init__(self, **kwargs: Any) -> None:
        """Initialize the loader and derive the blob parser from ``handlers``.

        Handler keys are first interpreted as file extensions; if that fails
        they are interpreted as MIME types. Mixing the two raises ValueError.
        With no handlers, the registry's default parser is used.
        """
        super().__init__(**kwargs)
        if self.handlers:
            handler_keys = list(self.handlers.keys())
            try:
                # Assume handlers.keys() are file extensions.
                self._mime_types = fetch_mime_types(handler_keys)
                self._file_types = list(set(handler_keys))
                mime_handlers = {
                    self._mime_types[extension]: handler
                    for extension, handler in self.handlers.items()
                }
            except ValueError:
                try:
                    # Assume handlers.keys() are mime types instead.
                    self._mime_types = fetch_extensions(handler_keys)
                    self._file_types = list(set(self._mime_types.keys()))
                    mime_handlers = self.handlers
                except ValueError:
                    raise ValueError(
                        "`handlers` keys must be either file extensions or mimetypes.\n"
                        f"{handler_keys} could not be interpreted as either.\n"
                        "File extensions and mimetypes cannot mix. "
                        "Use either one or the other"
                    )
            self._blob_parser = MimeTypeBasedParser(
                handlers=mime_handlers, fallback_parser=None
            )
        else:
            self._blob_parser = get_parser("default")
            if not isinstance(self._blob_parser, MimeTypeBasedParser):
                # BUGFIX: message previously read `get_parser("default)` (missing
                # closing quote) and lacked the space before "It returned".
                raise TypeError(
                    'get_parser("default") was supposed to return '
                    "MimeTypeBasedParser. "
                    f"It returned {type(self._blob_parser)}"
                )
            self._mime_types = fetch_extensions(
                list(self._blob_parser.handlers.keys())
            )

    @property
    def _fetch_mime_types(self) -> Dict[str, str]:
        """Return a dict of supported file types to corresponding mime types."""
        return self._mime_types

    @property
    @abstractmethod
    def _scopes(self) -> List[str]:
        """Return required scopes."""

    def _load_from_folder(self, folder: Folder) -> Iterable[Blob]:
        """Lazily load all files from a specified folder of the configured MIME type.

        Args:
            folder: The Folder instance from which the files are to be loaded.
                This Folder instance should represent a directory in a file
                system where the files are stored.

        Yields:
            An iterator that yields Blob instances, which are binary
            representations of the files loaded from the folder.
        """
        file_mime_types = self._fetch_mime_types
        items = folder.get_items()
        # Maps downloaded file name -> document metadata, joined back onto
        # blobs after the FileSystemBlobLoader pass below.
        metadata_dict: Dict[str, Dict[str, Any]] = {}
        with tempfile.TemporaryDirectory() as temp_dir:
            # NOTE: a previous no-op `os.makedirs(os.path.dirname(temp_dir),
            # exist_ok=True)` was removed; TemporaryDirectory already exists.
            for file in items:
                if file.is_file:
                    if file.mime_type in list(file_mime_types.values()):
                        if (not self.modified_since) or (
                            file.modified > self.modified_since
                        ):
                            # SharePoint "Doc.aspx" links are not stable; use
                            # the parent folder URL plus the quoted file name.
                            source = file.web_url
                            if re.search(
                                r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url
                            ):
                                source = (
                                    file._parent.web_url
                                    + "/"
                                    + urllib.parse.quote(file.name)
                                )
                            file.download(to_path=temp_dir, chunk_size=self.chunk_size)
                            metadata_dict[file.name] = {
                                "source": source,
                                "mime_type": file.mime_type,
                                "created": str(file.created),
                                "modified": str(file.modified),
                                "created_by": str(file.created_by),
                                "modified_by": str(file.modified_by),
                                "description": file.description,
                                "id": str(file.object_id),
                            }

            loader = FileSystemBlobLoader(path=temp_dir)
            for blob in loader.yield_blobs():
                if not isinstance(blob.path, PurePath):
                    raise NotImplementedError("Expected blob path to be a PurePath")
                if blob.path:
                    file_metadata_ = metadata_dict.get(str(blob.path.name), {})
                    blob.metadata.update(file_metadata_)
                yield blob
        if self.recursive:
            for subfolder in folder.get_child_folders():
                yield from self._load_from_folder(subfolder)

    def _load_from_object_ids(
        self, drive: Drive, object_ids: List[str]
    ) -> Iterable[Blob]:
        """Lazily load files specified by their object_ids from a drive.

        Load files into the system as binary large objects (Blobs) and return
        Iterable.

        Args:
            drive: The Drive instance from which the files are to be loaded.
                This Drive instance should represent a cloud storage service
                or similar storage system where the files are stored.
            object_ids: A list of object_id strings. Each object_id represents
                a unique identifier for a file in the drive.

        Yields:
            An iterator that yields Blob instances, which are binary
            representations of the files loaded from the drive using the
            specified object_ids.
        """
        file_mime_types = self._fetch_mime_types
        metadata_dict: Dict[str, Dict[str, Any]] = {}
        with tempfile.TemporaryDirectory() as temp_dir:
            for object_id in object_ids:
                file = drive.get_item(object_id)
                if not file:
                    # BUGFIX: was `logging.warning` (root logger) and the
                    # concatenation lacked a space ("...withobject_id...").
                    logger.warning(
                        "There isn't a file with "
                        f"object_id {object_id} in drive {drive}."
                    )
                    continue
                if file.is_file:
                    if file.mime_type in list(file_mime_types.values()):
                        source = file.web_url
                        if re.search(
                            r"Doc.aspx\?sourcedoc=.*file=([^&]+)", file.web_url
                        ):
                            source = (
                                file._parent.web_url
                                + "/"
                                + urllib.parse.quote(file.name)
                            )
                        file.download(to_path=temp_dir, chunk_size=self.chunk_size)
                        # str() on created/modified for consistency with
                        # _load_from_folder (previously raw datetime objects).
                        metadata_dict[file.name] = {
                            "source": source,
                            "mime_type": file.mime_type,
                            "created": str(file.created),
                            "modified": str(file.modified),
                            "created_by": str(file.created_by),
                            "modified_by": str(file.modified_by),
                            "description": file.description,
                            "id": str(file.object_id),
                        }

            loader = FileSystemBlobLoader(path=temp_dir)
            for blob in loader.yield_blobs():
                if not isinstance(blob.path, PurePath):
                    raise NotImplementedError("Expected blob path to be a PurePath")
                if blob.path:
                    file_metadata_ = metadata_dict.get(str(blob.path.name), {})
                    blob.metadata.update(file_metadata_)
                yield blob

    def _auth(self) -> Account:
        """Authenticates the OneDrive API client

        Returns:
            The authenticated Account object.
        """
        try:
            from O365 import Account, FileSystemTokenBackend
        except ImportError:
            raise ImportError(
                "O365 package not found, please install it with `pip install o365`"
            )
        # Choose the token backend; the Account construction itself was
        # previously duplicated across both branches.
        if self.auth_with_token:
            token_path = _O365TokenStorage().token_path
            token_backend = FileSystemTokenBackend(
                token_path=token_path.parent, token_filename=token_path.name
            )
        else:
            token_backend = FileSystemTokenBackend(
                token_path=Path.home() / ".credentials"
            )
        account = Account(
            credentials=(
                self.settings.client_id,
                self.settings.client_secret.get_secret_value(),
            ),
            scopes=self._scopes,
            token_backend=token_backend,
            raise_http_errors=False,
        )
        # make the auth
        account.authenticate()
        return account