Source code for langchain_community.document_loaders.unstructured
"""Loader that uses unstructured to load files."""from__future__importannotationsimportloggingimportosfromabcimportABC,abstractmethodfrompathlibimportPathfromtypingimportIO,Any,Callable,Iterator,List,Optional,Sequence,Unionfromlangchain_core._api.deprecationimportdeprecatedfromlangchain_core.documentsimportDocumentfromtyping_extensionsimportTypeAliasfromlangchain_community.document_loaders.baseimportBaseLoaderElement:TypeAlias=Anylogger=logging.getLogger(__file__)
def satisfies_min_unstructured_version(min_version: str) -> bool:
    """Check if the installed `Unstructured` version is at least `min_version`.

    Only the leading numeric release components of the installed version are
    compared, so pre-release builds such as ``0.4.17-dev1``, ``0.10.0.dev3`` or
    ``0.5.4rc1`` count as their base release. Missing trailing components are
    treated as zero, i.e. ``0.5`` is considered equal to ``0.5.0``.

    Args:
        min_version: Dotted numeric version string, e.g. ``"0.5.4"``.

    Returns:
        True if the installed version is greater than or equal to
        `min_version`, False otherwise.

    Raises:
        ImportError: If the `unstructured` package cannot be imported.
        ValueError: If `min_version` contains a non-integer component, or the
            installed version does not start with a numeric component.
    """
    import re

    from unstructured.__version__ import __version__ as __unstructured_version__

    required = [int(x) for x in min_version.split(".")]

    # NOTE(MthwRobinson) - enables the loader to work when you're using pre-release
    # versions of unstructured like 0.4.17-dev1
    release = re.split(r"[-+]", __unstructured_version__, maxsplit=1)[0]

    installed = []
    for component in release.split("."):
        match = re.match(r"[0-9]+", component.strip())
        if match is None:
            # Non-numeric component such as "dev3": the release part is over.
            break
        installed.append(int(match.group()))
        if match.end() != len(component.strip()):
            # Suffix glued onto a number such as "4rc1": keep the 4, then stop.
            break

    if not installed:
        raise ValueError(
            f"Could not parse unstructured version {__unstructured_version__!r}"
        )

    # Pad to a common length so that e.g. 0.5 and 0.5.0 compare as equal.
    width = max(len(installed), len(required))
    installed += [0] * (width - len(installed))
    required += [0] * (width - len(required))
    return installed >= required
def validate_unstructured_version(min_unstructured_version: str) -> None:
    """Raise an error if the installed `Unstructured` version is too old.

    Args:
        min_unstructured_version: Minimum version the calling loader needs.

    Raises:
        ValueError: If `satisfies_min_unstructured_version` reports that the
            installed version does not meet `min_unstructured_version`.
    """
    if satisfies_min_unstructured_version(min_unstructured_version):
        return
    raise ValueError(
        f"unstructured>={min_unstructured_version} is required in this loader."
    )
class UnstructuredBaseLoader(BaseLoader, ABC):
    """Base Loader that uses `Unstructured`.

    Subclasses provide the partitioned elements via `_get_elements` and the
    base metadata via `_get_metadata`; `lazy_load` combines them into
    `Document` objects according to `mode`.
    """

    def __init__(
        self,
        mode: str = "single",  # deprecated
        post_processors: Optional[List[Callable[[str], str]]] = None,
        **unstructured_kwargs: Any,
    ):
        """Initialize the loader.

        Args:
            mode: How elements are combined into documents. One of
                `"single"` (default), `"elements"` or `"paged"`.
            post_processors: Optional `str -> str` callables applied to every
                extracted element before documents are built.
            **unstructured_kwargs: Stored as `self.unstructured_kwargs` for
                subclasses to forward to `unstructured`.

        Raises:
            ImportError: If the `unstructured` package is not installed.
            ValueError: If `mode` is not one of the valid modes, or if
                `mode="paged"` is combined with `chunking_strategy="by_page"`.
        """
        try:
            import unstructured  # noqa:F401
        except ImportError as e:
            raise ImportError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from e

        # `single` - elements are combined into one (default)
        # `elements` - maintain individual elements
        # `paged` - elements are combined by page
        _valid_modes = {"single", "elements", "paged"}
        if mode not in _valid_modes:
            raise ValueError(
                f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
            )

        if not satisfies_min_unstructured_version("0.5.4"):
            # `strategy` is dropped for unstructured<0.5.4 (presumably because
            # those versions do not accept it).
            unstructured_kwargs.pop("strategy", None)

        self._check_if_both_mode_and_chunking_strategy_are_by_page(
            mode, unstructured_kwargs
        )

        self.mode = mode
        self.unstructured_kwargs = unstructured_kwargs
        self.post_processors = post_processors or []

    @abstractmethod
    def _get_elements(self) -> List[Element]:
        """Get elements."""

    @abstractmethod
    def _get_metadata(self) -> dict[str, Any]:
        """Get file_path metadata if available."""

    def _post_process_elements(self, elements: List[Element]) -> List[Element]:
        """Apply post processing functions to extracted unstructured elements.

        Post processing functions are str -> str callables passed
        in using the post_processors kwarg when the loader is instantiated.
        """
        for element in elements:
            for post_processor in self.post_processors:
                element.apply(post_processor)
        return elements

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield `Document` objects built from the extracted elements.

        Depending on `self.mode`, yields one document per element
        (`"elements"`), one per page (`"paged"`), or a single document with
        all element texts joined by blank lines (`"single"`).

        Raises:
            ValueError: If `self.mode` is not a supported mode.
        """
        elements = self._get_elements()
        # Honor overrides that return a new list instead of mutating in place;
        # an override that returns None keeps the in-place behaviour.
        processed = self._post_process_elements(elements)
        if processed is not None:
            elements = processed

        if self.mode == "elements":
            for element in elements:
                metadata = self._get_metadata()
                # NOTE(MthwRobinson) - the attribute check is for backward
                # compatibility with unstructured<0.4.9. The metadata attributed
                # was added in 0.4.9.
                if hasattr(element, "metadata"):
                    metadata.update(element.metadata.to_dict())
                if hasattr(element, "category"):
                    metadata["category"] = element.category
                element_id = element.to_dict().get("element_id")
                if element_id:
                    metadata["element_id"] = element_id
                yield Document(page_content=str(element), metadata=metadata)
        elif self.mode == "paged":
            logger.warning(
                "`mode='paged'` is deprecated in favor of the 'by_page' chunking"
                " strategy. Learn more about chunking here:"
                " https://docs.unstructured.io/open-source/core-functionality/chunking"
            )
            page_texts: dict[int, list[str]] = {}
            page_metadata: dict[int, dict[str, Any]] = {}

            for element in elements:
                metadata = self._get_metadata()
                if hasattr(element, "metadata"):
                    metadata.update(element.metadata.to_dict())
                # Elements without a page number are grouped under page 1.
                page_number = metadata.get("page_number", 1)

                if page_number not in page_texts:
                    # First element seen for this page.
                    page_texts[page_number] = [str(element)]
                    page_metadata[page_number] = metadata
                else:
                    # Later elements extend the text and update the metadata.
                    page_texts[page_number].append(str(element))
                    page_metadata[page_number].update(metadata)

            # One document per page, in first-seen order; every element text is
            # followed by a blank line.
            for page_number, texts in page_texts.items():
                yield Document(
                    page_content="".join(text + "\n\n" for text in texts),
                    metadata=page_metadata[page_number],
                )
        elif self.mode == "single":
            metadata = self._get_metadata()
            text = "\n\n".join(str(el) for el in elements)
            yield Document(page_content=text, metadata=metadata)
        else:
            raise ValueError(f"mode of {self.mode} not supported.")

    def _check_if_both_mode_and_chunking_strategy_are_by_page(
        self, mode: str, unstructured_kwargs: dict[str, Any]
    ) -> None:
        """Reject `mode="paged"` combined with `chunking_strategy="by_page"`.

        Raises:
            ValueError: If both page-based options are requested together.
        """
        if (
            mode == "paged"
            and unstructured_kwargs.get("chunking_strategy") == "by_page"
        ):
            raise ValueError(
                "Only one of `chunking_strategy='by_page'` or `mode='paged'` may be"
                " set. `chunking_strategy` is preferred."
            )
@deprecated(
    since="0.2.8",
    removal="1.0",
    alternative_import="langchain_unstructured.UnstructuredLoader",
)
class UnstructuredFileLoader(UnstructuredBaseLoader):
    """Load files using `Unstructured`.

    The file loader uses the unstructured partition function and will automatically
    detect the file type. You can run the loader in different modes: "single",
    "elements", and "paged". The default "single" mode will return a single langchain
    Document object. If you use "elements" mode, the unstructured library will split
    the document into elements such as Title and NarrativeText and return those as
    individual langchain Document objects. In addition to these post-processing modes
    (which are specific to the LangChain Loaders), Unstructured has its own "chunking"
    parameters for post-processing elements into more useful chunks for use cases such
    as Retrieval Augmented Generation (RAG). You can pass in additional unstructured
    kwargs to configure different unstructured settings.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredFileLoader

    loader = UnstructuredFileLoader(
        "example.pdf", mode="elements", strategy="fast",
    )
    docs = loader.load()

    References
    ----------
    https://docs.unstructured.io/open-source/core-functionality/partitioning
    https://docs.unstructured.io/open-source/core-functionality/chunking
    """

    def __init__(
        self,
        file_path: Union[str, List[str], Path, List[Path]],
        *,
        mode: str = "single",
        **unstructured_kwargs: Any,
    ):
        """Initialize with file path.

        Args:
            file_path: A single path or a list of paths to load.
            mode: Passed through to `UnstructuredBaseLoader`.
            **unstructured_kwargs: Passed through to `UnstructuredBaseLoader`.
        """
        self.file_path = file_path
        super().__init__(mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List[Element]:
        """Partition every configured path and return the combined elements."""
        from unstructured.partition.auto import partition

        if isinstance(self.file_path, list):
            elements: List[Element] = []
            for path in self.file_path:
                elements.extend(
                    partition(filename=str(path), **self.unstructured_kwargs)
                )
            return elements
        return partition(filename=str(self.file_path), **self.unstructured_kwargs)

    def _get_metadata(self) -> dict[str, Any]:
        """Return the configured path(s), as strings, under the `source` key."""
        if isinstance(self.file_path, list):
            return {"source": [str(path) for path in self.file_path]}
        return {"source": str(self.file_path)}
def get_elements_from_api(
    file_path: Union[str, List[str], Path, List[Path], None] = None,
    file: Union[IO[bytes], Sequence[IO[bytes]], None] = None,
    api_url: str = "https://api.unstructuredapp.io/general/v0/general",
    api_key: str = "",
    **unstructured_kwargs: Any,
) -> List[Element]:
    """Retrieve a list of elements from the `Unstructured API`.

    When `file_path` is a list or `file` is a sequence, the documents are sent
    through `partition_multiple_via_api` and the per-document results are
    flattened into one list. Otherwise a single `partition_via_api` call is made.

    Args:
        file_path: Path or list of paths; list entries are converted to `str`.
        file: File-like object or sequence of file-like objects.
        api_url: Endpoint of the Unstructured API.
        api_key: API key forwarded to the API call.
        **unstructured_kwargs: Extra keyword arguments forwarded unchanged.

    Returns:
        The elements returned by the API.
    """
    paths_given_as_list = isinstance(file_path, list)
    if paths_given_as_list:
        file_path = [str(path) for path in file_path]

    # Single document: nothing to flatten, hand the call straight through.
    if not (isinstance(file, Sequence) or paths_given_as_list):
        from unstructured.partition.api import partition_via_api

        single_name = None if file_path is None else str(file_path)
        return partition_via_api(
            filename=single_name,
            file=file,
            api_key=api_key,
            api_url=api_url,
            **unstructured_kwargs,
        )

    from unstructured.partition.api import partition_multiple_via_api

    per_document = partition_multiple_via_api(
        filenames=file_path,  # type: ignore
        files=file,  # type: ignore
        api_key=api_key,
        api_url=api_url,
        **unstructured_kwargs,
    )
    # The multi-document call yields one element list per document.
    return [element for document in per_document for element in document]
@deprecated(
    since="0.2.8",
    removal="1.0",
    alternative_import="langchain_unstructured.UnstructuredLoader",
)
class UnstructuredAPIFileLoader(UnstructuredBaseLoader):
    """Load files using `Unstructured` API.

    By default, the loader makes a call to the hosted Unstructured API. If you are
    running the unstructured API locally, you can change the API URL by passing in the
    url parameter when you initialize the loader. The hosted Unstructured API requires
    an API key. See the links below to learn more about our API offerings and get an
    API key.

    You can run the loader in different modes: "single", "elements", and "paged". The
    default "single" mode will return a single langchain Document object. If you use
    "elements" mode, the unstructured library will split the document into elements
    such as Title and NarrativeText and return those as individual langchain Document
    objects. In addition to these post-processing modes (which are specific to the
    LangChain Loaders), Unstructured has its own "chunking" parameters for
    post-processing elements into more useful chunks for use cases such as Retrieval
    Augmented Generation (RAG). You can pass in additional unstructured kwargs to
    configure different unstructured settings.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredAPIFileLoader

    loader = UnstructuredAPIFileLoader(
        "example.pdf", mode="elements", strategy="fast", api_key="MY_API_KEY",
    )
    docs = loader.load()

    References
    ----------
    https://docs.unstructured.io/api-reference/api-services/sdk
    https://docs.unstructured.io/api-reference/api-services/overview
    https://docs.unstructured.io/open-source/core-functionality/partitioning
    https://docs.unstructured.io/open-source/core-functionality/chunking
    """

    def __init__(
        self,
        file_path: Union[str, List[str]],
        *,
        mode: str = "single",
        url: str = "https://api.unstructuredapp.io/general/v0/general",
        api_key: str = "",
        **unstructured_kwargs: Any,
    ):
        """Initialize with file path.

        Args:
            file_path: A single path or a list of paths to send to the API.
            mode: Passed through to `UnstructuredBaseLoader`.
            url: Endpoint of the Unstructured API.
            api_key: API key. When empty, the `UNSTRUCTURED_API_KEY`
                environment variable is used instead.
            **unstructured_kwargs: Passed through to `UnstructuredBaseLoader`.

        Raises:
            ValueError: If the installed `unstructured` is older than 0.10.15.
        """
        validate_unstructured_version(min_unstructured_version="0.10.15")

        self.file_path = file_path
        self.url = url
        # An explicitly passed key wins; the environment variable is only the
        # fallback, so callers can override a globally configured key.
        self.api_key = api_key or os.getenv("UNSTRUCTURED_API_KEY") or ""

        super().__init__(mode=mode, **unstructured_kwargs)

    def _get_metadata(self) -> dict[str, Any]:
        """Return the configured path(s) under the `source` key."""
        return {"source": self.file_path}

    def _get_elements(self) -> List[Element]:
        """Partition the configured path(s) through the Unstructured API."""
        return get_elements_from_api(
            file_path=self.file_path,
            api_key=self.api_key,
            api_url=self.url,
            **self.unstructured_kwargs,
        )

    # `_post_process_elements` is inherited from `UnstructuredBaseLoader`; the
    # copy that used to live here was identical to the inherited one.
@deprecated(
    since="0.2.8",
    removal="1.0",
    alternative_import="langchain_unstructured.UnstructuredLoader",
)
class UnstructuredFileIOLoader(UnstructuredBaseLoader):
    """Load file-like objects opened in read mode using `Unstructured`.

    The file loader uses the unstructured partition function and will automatically
    detect the file type. You can run the loader in different modes: "single",
    "elements", and "paged". The default "single" mode will return a single langchain
    Document object. If you use "elements" mode, the unstructured library will split
    the document into elements such as Title and NarrativeText and return those as
    individual langchain Document objects. In addition to these post-processing modes
    (which are specific to the LangChain Loaders), Unstructured has its own "chunking"
    parameters for post-processing elements into more useful chunks for use cases
    such as Retrieval Augmented Generation (RAG). You can pass in additional
    unstructured kwargs to configure different unstructured settings.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredFileIOLoader

    with open("example.pdf", "rb") as f:
        loader = UnstructuredFileIOLoader(
            f, mode="elements", strategy="fast",
        )
        docs = loader.load()

    References
    ----------
    https://docs.unstructured.io/open-source/core-functionality/partitioning
    https://docs.unstructured.io/open-source/core-functionality/chunking
    """

    def __init__(
        self,
        file: IO[bytes],
        *,
        mode: str = "single",
        **unstructured_kwargs: Any,
    ):
        """Initialize with a file-like object.

        Args:
            file: Binary file-like object to partition.
            mode: Passed through to `UnstructuredBaseLoader`.
            **unstructured_kwargs: Passed through to `UnstructuredBaseLoader`.
        """
        self.file = file
        super().__init__(mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List[Element]:
        """Partition the stored file object with `unstructured`."""
        from unstructured.partition.auto import partition

        partitioned = partition(file=self.file, **self.unstructured_kwargs)
        return partitioned

    def _get_metadata(self) -> dict[str, Any]:
        """Return empty base metadata."""
        return {}

    def _post_process_elements(self, elements: List[Element]) -> List[Element]:
        """Run every configured post processor over every element, in place.

        Post processors are the str -> str callables supplied through the
        post_processors kwarg when the loader is instantiated.
        """
        for item in elements:
            for processor in self.post_processors:
                item.apply(processor)
        return elements
@deprecated(
    since="0.2.8",
    removal="1.0",
    alternative_import="langchain_unstructured.UnstructuredLoader",
)
class UnstructuredAPIFileIOLoader(UnstructuredBaseLoader):
    """Send file-like objects with `unstructured-client` sdk to the Unstructured API.

    By default, the loader makes a call to the hosted Unstructured API. If you are
    running the unstructured API locally, you can change the API URL by passing in the
    url parameter when you initialize the loader. The hosted Unstructured API requires
    an API key. See the links below to learn more about our API offerings and get an
    API key.

    You can run the loader in different modes: "single", "elements", and "paged". The
    default "single" mode will return a single langchain Document object. If you use
    "elements" mode, the unstructured library will split the document into elements
    such as Title and NarrativeText and return those as individual langchain Document
    objects. In addition to these post-processing modes (which are specific to the
    LangChain Loaders), Unstructured has its own "chunking" parameters for
    post-processing elements into more useful chunks for use cases such as Retrieval
    Augmented Generation (RAG). You can pass in additional unstructured kwargs to
    configure different unstructured settings.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredAPIFileIOLoader

    with open("example.pdf", "rb") as f:
        loader = UnstructuredAPIFileIOLoader(
            f,
            mode="elements",
            strategy="fast",
            api_key="MY_API_KEY",
            metadata_filename="example.pdf",
        )
        docs = loader.load()

    References
    ----------
    https://docs.unstructured.io/api-reference/api-services/sdk
    https://docs.unstructured.io/api-reference/api-services/overview
    https://docs.unstructured.io/open-source/core-functionality/partitioning
    https://docs.unstructured.io/open-source/core-functionality/chunking
    """

    def __init__(
        self,
        file: Union[IO[bytes], Sequence[IO[bytes]]],
        *,
        mode: str = "single",
        url: str = "https://api.unstructuredapp.io/general/v0/general",
        api_key: str = "",
        **unstructured_kwargs: Any,
    ):
        """Initialize with a file-like object or a sequence of them.

        Args:
            file: Binary file-like object, or a sequence of such objects.
            mode: Passed through to `UnstructuredBaseLoader`.
            url: Endpoint of the Unstructured API.
            api_key: API key. When empty, the `UNSTRUCTURED_API_KEY`
                environment variable is used instead.
            **unstructured_kwargs: Passed through to `UnstructuredBaseLoader`.
                Must include `metadata_filename` for `load()` to succeed.

        Raises:
            ValueError: If the installed `unstructured` is older than 0.6.3
                (sequence of files) or 0.6.2 (single file).
        """
        # A sequence of files needs 0.6.3, which also covers the 0.6.2 minimum
        # required for a single file.
        min_version = "0.6.3" if isinstance(file, Sequence) else "0.6.2"
        validate_unstructured_version(min_unstructured_version=min_version)

        self.file = file
        self.url = url
        # An explicitly passed key wins; the environment variable is only the
        # fallback, so callers can override a globally configured key.
        self.api_key = api_key or os.getenv("UNSTRUCTURED_API_KEY") or ""

        super().__init__(mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List[Element]:
        """Partition the stored file object(s) through the Unstructured API.

        Raises:
            ValueError: If `metadata_filename` was not supplied to the loader.
        """
        # Work on a copy: popping from the stored kwargs would make every load
        # after the first one fail with the "must be specified" error below.
        api_kwargs = dict(self.unstructured_kwargs)
        metadata_filename = api_kwargs.pop("metadata_filename", None)
        if not metadata_filename:
            raise ValueError(
                "If partitioning a file via api,"
                " metadata_filename must be specified as well.",
            )
        return get_elements_from_api(
            file=self.file,
            file_path=metadata_filename,
            api_key=self.api_key,
            api_url=self.url,
            **api_kwargs,
        )

    def _get_metadata(self) -> dict[str, Any]:
        """Return empty base metadata."""
        return {}

    # `_post_process_elements` is inherited from `UnstructuredBaseLoader`; the
    # copy that used to live here was identical to the inherited one.