Source code for langchain_community.document_loaders.url
"""Loader that uses unstructured to load HTML files."""importloggingfromtypingimportAny,Listfromlangchain_core.documentsimportDocumentfromlangchain_community.document_loaders.baseimportBaseLoaderlogger=logging.getLogger(__name__)
class UnstructuredURLLoader(BaseLoader):
    """Load files from remote URLs using `Unstructured`.

    Use the unstructured partition function to detect the MIME type
    and route the file to the appropriate partitioner.

    You can run the loader in one of two modes: "single" and "elements".
    If you use "single" mode, the document will be returned as a single
    langchain Document object. If you use "elements" mode, the unstructured
    library will split the document into elements such as Title and
    NarrativeText. You can pass in additional unstructured kwargs after mode
    to apply different unstructured settings.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredURLLoader

    loader = UnstructuredURLLoader(
        urls=["<url-1>", "<url-2>"], mode="elements", strategy="fast",
    )
    docs = loader.load()

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition
    """

    def __init__(
        self,
        urls: List[str],
        continue_on_failure: bool = True,
        mode: str = "single",
        show_progress_bar: bool = False,
        **unstructured_kwargs: Any,
    ):
        """Initialize with the URLs to load.

        Args:
            urls: URLs to fetch and partition.
            continue_on_failure: If True, an error while fetching or
                processing a URL is logged and that URL is skipped; if False,
                the exception propagates out of ``load``.
            mode: Either "single" (one Document per URL) or "elements"
                (one Document per partitioned element).
            show_progress_bar: If True, wrap the URL iteration in ``tqdm``.
            **unstructured_kwargs: Extra keyword arguments forwarded to the
                unstructured partition function. A ``headers`` entry is
                removed from these and forwarded separately, only when the
                installed unstructured version accepts it.

        Raises:
            ImportError: If the ``unstructured`` package is not installed.
            ValueError: If ``mode`` is not one of the supported modes.
        """
        try:
            import unstructured  # noqa:F401
            from unstructured.__version__ import (
                __version__ as __unstructured_version__,
            )
        except ImportError as e:
            raise ImportError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from e
        self.__version = __unstructured_version__

        self._validate_mode(mode)
        self.mode = mode

        # Treat an explicit ``headers=None`` like "no headers" rather than
        # failing on ``None.keys()``.
        headers = unstructured_kwargs.pop("headers", None) or {}
        if headers:
            # Headers are only honoured from specific unstructured releases
            # onwards; tell the caller when theirs will be dropped.
            if self.__is_non_html_available():
                headers_supported = self.__is_headers_available_for_non_html()
            else:
                headers_supported = self.__is_headers_available_for_html()

            if not headers_supported:
                logger.warning(
                    "You are using an old version of unstructured. "
                    "The headers parameter is ignored"
                )

        self.urls = urls
        self.continue_on_failure = continue_on_failure
        self.headers = headers
        self.unstructured_kwargs = unstructured_kwargs
        self.show_progress_bar = show_progress_bar

    def _validate_mode(self, mode: str) -> None:
        """Raise ``ValueError`` unless ``mode`` is "single" or "elements"."""
        _valid_modes = {"single", "elements"}
        if mode not in _valid_modes:
            raise ValueError(
                f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
            )

    def __parsed_version(self) -> tuple:
        """Return the installed unstructured version as a tuple of ints.

        Anything after the first "-" in the version string is discarded
        before the dotted components are converted to integers.
        """
        release = self.__version.split("-")[0]
        return tuple(int(part) for part in release.split("."))

    def __is_headers_available_for_html(self) -> bool:
        """Return True if the installed version is at least 0.5.7."""
        return self.__parsed_version() >= (0, 5, 7)

    def __is_headers_available_for_non_html(self) -> bool:
        """Return True if the installed version is at least 0.5.13."""
        return self.__parsed_version() >= (0, 5, 13)

    def __is_non_html_available(self) -> bool:
        """Return True if the installed version is at least 0.5.12."""
        return self.__parsed_version() >= (0, 5, 12)

    def load(self) -> List[Document]:
        """Fetch and partition every URL, returning the resulting Documents.

        In "single" mode each URL yields one Document whose content is the
        elements joined by blank lines and whose metadata is
        ``{"source": url}``. In "elements" mode each element yields its own
        Document carrying the element's metadata plus a ``"category"`` key.

        Raises:
            ImportError: If ``show_progress_bar`` is True and ``tqdm`` is not
                installed.
            Exception: Any error raised while fetching or partitioning a URL,
                when ``continue_on_failure`` is False.
        """
        from unstructured.partition.auto import partition
        from unstructured.partition.html import partition_html

        docs: List[Document] = []

        if self.show_progress_bar:
            try:
                from tqdm import tqdm
            except ImportError as e:
                raise ImportError(
                    "Package tqdm must be installed if show_progress_bar=True. "
                    "Please install with 'pip install tqdm' or set "
                    "show_progress_bar=False."
                ) from e

            urls = tqdm(self.urls)
        else:
            urls = self.urls

        for url in urls:
            try:
                # Use the generic partitioner when the installed version is
                # new enough; otherwise fall back to the HTML-only one.
                if self.__is_non_html_available():
                    partition_fn = partition
                    pass_headers = self.__is_headers_available_for_non_html()
                else:
                    partition_fn = partition_html
                    pass_headers = self.__is_headers_available_for_html()

                if pass_headers:
                    elements = partition_fn(
                        url=url, headers=self.headers, **self.unstructured_kwargs
                    )
                else:
                    elements = partition_fn(url=url, **self.unstructured_kwargs)
            except Exception as e:
                if self.continue_on_failure:
                    logger.error(f"Error fetching or processing {url}, exception: {e}")
                    continue
                # Bare raise keeps the original traceback intact.
                raise

            if self.mode == "single":
                text = "\n\n".join([str(el) for el in elements])
                metadata = {"source": url}
                docs.append(Document(page_content=text, metadata=metadata))
            elif self.mode == "elements":
                for element in elements:
                    metadata = element.metadata.to_dict()
                    metadata["category"] = element.category
                    docs.append(Document(page_content=str(element), metadata=metadata))

        return docs