def __init__(
    self,
    file_path: Optional[str | Path | list[str] | list[Path]] = None,
    *,
    file: Optional[IO[bytes] | list[IO[bytes]]] = None,
    partition_via_api: bool = False,
    post_processors: Optional[list[Callable[[str], str]]] = None,
    # SDK parameters
    api_key: Optional[str] = None,
    client: Optional[UnstructuredClient] = None,
    url: Optional[str] = None,
    web_url: Optional[str] = None,
    **kwargs: Any,
):
    """Validate the arguments and store the loader configuration.

    Args:
        file_path: A single path or a list of paths; stored as ``self.file_path``.
        file: A single binary file object or a list of them; stored as
            ``self.file``. Mutually exclusive with ``file_path``.
        partition_via_api: Stored as ``self.partition_via_api``.
        post_processors: ``str -> str`` callables; stored as
            ``self.post_processors``.
        api_key: Key for a newly built client. Falls back to the
            ``UNSTRUCTURED_API_KEY`` environment variable, then to ``""``.
        client: A ready-made SDK client. When given, ``api_key`` and ``url``
            must both be left as ``None``.
        url: Server URL for a newly built client. Falls back to the
            ``UNSTRUCTURED_URL`` environment variable, then to ``_DEFAULT_URL``.
        web_url: When truthy, stored under the ``"url"`` key of
            ``self.unstructured_kwargs``.
        **kwargs: Remaining keyword arguments; stored as
            ``self.unstructured_kwargs``.

    Raises:
        ValueError: If both ``file_path`` and ``file`` are given, or if a
            custom ``client`` is combined with ``api_key`` and/or ``url``.
    """
    if not (file_path is None or file is None):
        raise ValueError("file_path and file cannot be defined simultaneously.")

    if client is not None:
        # A caller-supplied client already carries its own credentials and
        # endpoint, so collect every conflicting argument that was also set.
        conflicting = []
        for name, supplied in (("api_key", api_key), ("url", url)):
            if supplied is not None:
                conflicting.append(name)
        if conflicting:
            names = ", ".join(conflicting)
            raise ValueError(
                "if you are passing a custom `client`, you cannot also pass these "
                f"params: {names}."
            )

    # Explicit argument first, then the environment, then the built-in default.
    resolved_key = api_key or os.getenv("UNSTRUCTURED_API_KEY") or ""
    resolved_url = url or os.getenv("UNSTRUCTURED_URL") or _DEFAULT_URL

    if client:
        self.client = client
    else:
        self.client = UnstructuredClient(
            api_key_auth=resolved_key,
            server_url=resolved_url,
        )

    self.unstructured_kwargs = kwargs
    if web_url:
        self.unstructured_kwargs["url"] = web_url

    self.post_processors = post_processors
    self.partition_via_api = partition_via_api
    self.file = file
    self.file_path = file_path
def lazy_load(self) -> Iterator[Document]:
    """Lazily yield documents for every configured file or path.

    When ``self.file`` is a list, each file object is loaded on its own with
    no path. Otherwise, when ``self.file_path`` is a list, each path is loaded
    on its own with no file object. In every other case ``self.file`` and
    ``self.file_path`` are handed over together as a single source.

    Yields:
        The documents produced by a ``_SingleDocumentLoader`` for each source,
        in source order.
    """
    # Normalise the three input shapes into one stream of (file, path) pairs.
    if isinstance(self.file, list):
        sources = ((handle, None) for handle in self.file)
    elif isinstance(self.file_path, list):
        sources = ((None, path) for path in self.file_path)
    else:
        sources = iter([(self.file, self.file_path)])

    for handle, path in sources:
        single_loader = _SingleDocumentLoader(
            file=handle,
            file_path=path,
            partition_via_api=self.partition_via_api,
            post_processors=self.post_processors,
            # SDK parameters
            client=self.client,
            **self.unstructured_kwargs,
        )
        yield from single_loader.lazy_load()
class _SingleDocumentLoader(BaseLoader):
    """Provides loader functionality for individual document/file objects.

    Encapsulates partitioning individual file objects (file or file_path) either
    locally or via the Unstructured API.
    """

    def __init__(
        self,
        file_path: Optional[str | Path] = None,
        *,
        client: UnstructuredClient,
        file: Optional[IO[bytes]] = None,
        partition_via_api: bool = False,
        post_processors: Optional[list[Callable[[str], str]]] = None,
        **kwargs: Any,
    ):
        """Initialize loader.

        Args:
            file_path: Path of the file to partition. A ``Path`` is converted
                to ``str``; any other value is stored unchanged.
            client: SDK client used when partitioning via the API.
            file: Binary file object to partition.
            partition_via_api: When true, elements are fetched through
                ``client``; otherwise the local ``unstructured`` package is used.
            post_processors: ``str -> str`` callables applied to each
                element's text.
            **kwargs: Extra keyword arguments forwarded to the local
                ``partition`` call or to the SDK partition parameters.
        """
        self.file_path = str(file_path) if isinstance(file_path, Path) else file_path
        self.file = file
        self.partition_via_api = partition_via_api
        self.post_processors = post_processors
        # SDK parameters
        self.client = client
        self.unstructured_kwargs = kwargs

    def lazy_load(self) -> Iterator[Document]:
        """Load file.

        Yields:
            One ``Document`` per element. Its metadata starts from
            ``_get_metadata()``, is extended with the element's own metadata,
            and gets ``"category"`` and ``"element_id"`` entries.
        """
        elements_json = (
            self._post_process_elements_json(self._elements_json)
            if self.post_processors
            else self._elements_json
        )
        for element in elements_json:
            metadata = self._get_metadata()
            # An element without a "metadata" entry would otherwise hand None
            # to dict.update(), which raises TypeError.
            metadata.update(element.get("metadata") or {})
            # Prefer "category" and fall back to "type" when it is missing/empty.
            metadata["category"] = element.get("category") or element.get("type")
            metadata["element_id"] = element.get("element_id")
            yield Document(
                page_content=cast(str, element.get("text")), metadata=metadata
            )

    @property
    def _elements_json(self) -> list[dict[str, Any]]:
        """Get elements as a list of dictionaries from local partition or via API."""
        if self.partition_via_api:
            return self._elements_via_api

        return self._convert_elements_to_dicts(self._elements_via_local)

    @property
    def _elements_via_local(self) -> list[Element]:
        """Partition the file with the locally installed ``unstructured`` package.

        Raises:
            ImportError: If ``unstructured`` cannot be imported.
            ValueError: If a file object is given without a
                ``metadata_filename`` keyword argument.
        """
        try:
            from unstructured.partition.auto import partition  # type: ignore
        except ImportError as err:
            # Chain explicitly so the original import failure stays visible.
            raise ImportError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from err

        # Identity check, consistent with `_file_content`, so the presence of a
        # file object never depends on its truthiness.
        if (
            self.file is not None
            and self.unstructured_kwargs.get("metadata_filename") is None
        ):
            raise ValueError(
                "If partitioning a fileIO object, metadata_filename must be specified"
                " as well.",
            )

        return partition(
            file=self.file, filename=self.file_path, **self.unstructured_kwargs
        )  # type: ignore

    @property
    def _elements_via_api(self) -> list[dict[str, Any]]:
        """Retrieve a list of element dicts from the API using the SDK client.

        Raises:
            ValueError: If the response status code is not 200.
        """
        client = self.client
        req = self._sdk_partition_request
        response = client.general.partition(request=req)

        if response.status_code == 200:
            return json.loads(response.raw_response.text)
        raise ValueError(
            f"Receive unexpected status code {response.status_code} from the API.",
        )

    @property
    def _file_content(self) -> bytes:
        """Get content from either file or file_path.

        The file object takes precedence over the path when both are set.

        Raises:
            ValueError: If neither a file object nor a non-empty path is set.
        """
        if self.file is not None:
            return self.file.read()
        elif self.file_path:
            with open(self.file_path, "rb") as f:
                return f.read()
        raise ValueError("file or file_path must be defined.")

    @property
    def _sdk_partition_request(self) -> operations.PartitionRequest:
        """Build the SDK partition request from the file content and kwargs."""
        # NOTE(review): when only a file object is given, `self.file_path` is
        # None and `file_name` becomes the literal string "None" - confirm this
        # is what the API should receive.
        return operations.PartitionRequest(
            partition_parameters=shared.PartitionParameters(
                files=shared.Files(
                    content=self._file_content, file_name=str(self.file_path)
                ),
                **self.unstructured_kwargs,
            ),
        )

    def _convert_elements_to_dicts(
        self, elements: list[Element]
    ) -> list[dict[str, Any]]:
        """Return the ``to_dict()`` form of each element, in order."""
        return [element.to_dict() for element in elements]

    def _get_metadata(self) -> dict[str, Any]:
        """Get file_path metadata if available."""
        return {"source": self.file_path} if self.file_path else {}

    def _post_process_elements_json(
        self, elements_json: list[dict[str, Any]]
    ) -> list[dict[str, Any]]:
        """Apply post processing functions to extracted unstructured elements.

        Post processing functions are str -> str callables passed in using the
        post_processors kwarg when the loader is instantiated. The element
        dicts are modified in place and the same list is returned.
        """
        if self.post_processors:
            for element in elements_json:
                for post_processor in self.post_processors:
                    # str() coerces the value, so a missing "text" entry reaches
                    # the processor as the string "None".
                    element["text"] = post_processor(str(element.get("text")))
        return elements_json