Source code for langchain_community.document_loaders.dropbox
# Prerequisites:
# 1. Create a Dropbox app.
# 2. Give the app these scope permissions: `files.metadata.read`
#    and `files.content.read`.
# 3. Generate access token: https://www.dropbox.com/developers/apps/create.
# 4. `pip install dropbox` (requires `pip install unstructured[pdf]` for PDF filetype).

import os
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional

from langchain_core.documents import Document
from pydantic import BaseModel, model_validator

from langchain_community.document_loaders.base import BaseLoader
class DropboxLoader(BaseLoader, BaseModel):
    """Load files from `Dropbox`.

    In addition to common files such as text and PDF files, it also supports
    *Dropbox Paper* files.

    Exactly one of ``dropbox_folder_path`` and ``dropbox_file_paths`` must be
    provided; this is enforced by ``validate_inputs``.
    """

    dropbox_access_token: str
    """Dropbox access token."""
    dropbox_folder_path: Optional[str] = None
    """The folder path to load from."""
    dropbox_file_paths: Optional[List[str]] = None
    """The file paths to load from."""
    recursive: bool = False
    """Flag to indicate whether to load files recursively from subfolders."""

    @model_validator(mode="before")
    @classmethod
    def validate_inputs(cls, values: Dict[str, Any]) -> Any:
        """Validate that either folder_path or file_paths is set, but not both.

        Args:
            values: Raw field values passed to the model constructor.

        Returns:
            The unmodified ``values`` mapping.

        Raises:
            ValueError: If both or neither of ``dropbox_folder_path`` and
                ``dropbox_file_paths`` are supplied.
        """
        folder_path = values.get("dropbox_folder_path")
        file_paths = values.get("dropbox_file_paths")

        if folder_path is not None and file_paths is not None:
            raise ValueError("Cannot specify both folder_path and file_paths")
        # An empty list of file paths counts as "not specified".
        if folder_path is None and not file_paths:
            raise ValueError("Must specify either folder_path or file_paths")

        return values

    def _create_dropbox_client(self) -> Any:
        """Create a Dropbox client and verify the access token.

        Returns:
            A ``dropbox.Dropbox`` client built from ``dropbox_access_token``.

        Raises:
            ImportError: If the ``dropbox`` package is not installed.
            ValueError: If Dropbox rejects the access token.
        """
        try:
            from dropbox import Dropbox, exceptions
        except ImportError as ex:
            raise ImportError("You must run `pip install dropbox`") from ex

        try:
            dbx = Dropbox(self.dropbox_access_token)
            # Cheap authenticated call used only to validate the token early.
            dbx.users_get_current_account()
        except exceptions.AuthError as ex:
            raise ValueError(
                "Invalid Dropbox access token. Please verify your token and try again."
            ) from ex
        return dbx

    def _load_documents_from_folder(self, folder_path: str) -> List[Document]:
        """Load documents from a Dropbox folder.

        Args:
            folder_path: Dropbox path of the folder to list.

        Returns:
            One document per file that could be loaded; files that cannot be
            decoded or parsed are skipped.

        Raises:
            ImportError: If the ``dropbox`` package is not installed.
            ValueError: If the folder cannot be listed.
        """
        dbx = self._create_dropbox_client()

        try:
            from dropbox import exceptions
            from dropbox.files import FileMetadata
        except ImportError as ex:
            raise ImportError("You must run `pip install dropbox`") from ex

        try:
            results = dbx.files_list_folder(folder_path, recursive=self.recursive)
            entries = list(results.entries)
            # The listing is paginated: keep following the cursor until the
            # API reports that no further entries remain.
            while results.has_more:
                results = dbx.files_list_folder_continue(results.cursor)
                entries.extend(results.entries)
        except exceptions.ApiError as ex:
            raise ValueError(
                f"Could not list files in the folder: {folder_path}. "
                "Please verify the folder path and try again."
            ) from ex

        documents: List[Document] = []
        for entry in entries:
            # Folder and deleted entries carry no content to load.
            if not isinstance(entry, FileMetadata):
                continue
            # Reuse the already authenticated client for every file.
            doc = self._load_file_from_path(entry.path_display, dbx=dbx)
            if doc is not None:
                documents.append(doc)
        return documents

    def _load_file_from_path(
        self, file_path: str, dbx: Optional[Any] = None
    ) -> Optional[Document]:
        """Load a file from a Dropbox path.

        Args:
            file_path: Dropbox path of the file to load.
            dbx: Optional existing Dropbox client to reuse. When omitted, a
                new client is created (and the token re-validated).

        Returns:
            The loaded document, or ``None`` when the entry has no retrievable
            content, cannot be decoded as UTF-8 text, or fails PDF parsing.

        Raises:
            ImportError: If the ``dropbox`` package is not installed.
            ValueError: If Dropbox reports an API error for ``file_path``.
        """
        if dbx is None:
            dbx = self._create_dropbox_client()

        try:
            from dropbox import exceptions
        except ImportError as ex:
            raise ImportError("You must run `pip install dropbox`") from ex

        try:
            file_metadata = dbx.files_get_metadata(file_path)

            # getattr guards against metadata kinds (e.g. folders) that do not
            # expose these attributes at all.
            if getattr(file_metadata, "is_downloadable", False):
                _, response = dbx.files_download(file_path)
            # Some types such as Paper, need to be exported.
            elif getattr(file_metadata, "export_info", None):
                _, response = dbx.files_export(file_path, "markdown")
            else:
                print(  # noqa: T201
                    f"File {file_path} is neither downloadable nor exportable. "
                    "Skipping."
                )
                return None
        except exceptions.ApiError as ex:
            raise ValueError(
                f"Could not load file: {file_path}. Please verify the file path "
                "and try again."
            ) from ex

        try:
            text = response.content.decode("utf-8")
        except UnicodeDecodeError:
            file_extension = os.path.splitext(file_path)[1].lower()

            if file_extension != ".pdf":
                print(  # noqa: T201
                    f"File {file_path} could not be decoded as pdf or text. Skipping."
                )
                return None

            print(f"File {file_path} type detected as .pdf")  # noqa: T201
            from langchain_community.document_loaders import UnstructuredPDFLoader

            # Write the bytes to a temporary file for the PDF loader; the
            # context manager removes the directory once parsing is done.
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_pdf = Path(temp_dir) / "tmp.pdf"
                with open(temp_pdf, mode="wb") as f:
                    f.write(response.content)

                try:
                    docs = UnstructuredPDFLoader(str(temp_pdf)).load()
                except Exception as pdf_ex:
                    # Best effort: a PDF that fails to parse is skipped.
                    print(  # noqa: T201
                        f"Error while trying to parse PDF {file_path}: {pdf_ex}"
                    )
                    return None

            return docs[0] if docs else None

        metadata = {
            "source": f"dropbox://{file_path}",
            "title": os.path.basename(file_path),
        }
        return Document(page_content=text, metadata=metadata)

    def _load_documents_from_paths(self) -> List[Document]:
        """Load documents from a list of Dropbox file paths.

        Returns:
            One document per path in ``dropbox_file_paths`` that could be
            loaded; paths that yield no document are skipped.

        Raises:
            ValueError: If ``dropbox_file_paths`` is empty or unset.
        """
        if not self.dropbox_file_paths:
            raise ValueError("file_paths must be set")

        # Authenticate once and share the client across all files.
        dbx = self._create_dropbox_client()
        loaded = (
            self._load_file_from_path(file_path, dbx=dbx)
            for file_path in self.dropbox_file_paths
        )
        return [doc for doc in loaded if doc is not None]