Source code for langchain_community.document_loaders.parsers.html.bs4
"""Loader that uses bs4 to load HTML files, enriching metadata with page title."""importloggingfromtypingimportAny,Dict,Iterator,Unionfromlangchain_core.documentsimportDocumentfromlangchain_community.document_loaders.baseimportBaseBlobParserfromlangchain_community.document_loaders.blob_loadersimportBloblogger=logging.getLogger(__name__)
class BS4HTMLParser(BaseBlobParser):
    """Parse HTML files using `Beautiful Soup`.

    Each blob is parsed into a single ``Document`` whose ``page_content`` is
    the text extracted from the HTML and whose metadata holds the blob's
    ``source`` and the page ``title``.
    """

    def __init__(
        self,
        *,
        features: str = "lxml",
        get_text_separator: str = "",
        **kwargs: Any,
    ) -> None:
        """Initialize a bs4 based HTML parser.

        Args:
            features: Forwarded to ``BeautifulSoup`` as its ``features``
                argument.
            get_text_separator: Separator handed to ``soup.get_text`` when
                extracting the page text.
            **kwargs: Additional keyword arguments forwarded unchanged to
                ``BeautifulSoup``.

        Raises:
            ImportError: If the ``bs4`` package cannot be imported.
        """
        # Fail early, at construction time, rather than on the first parse.
        try:
            import bs4  # noqa:F401
        except ImportError as err:
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            ) from err

        self.bs_kwargs = {"features": features, **kwargs}
        self.get_text_separator = get_text_separator

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Load HTML document into document objects.

        Args:
            blob: The blob to parse; it is read through ``blob.as_bytes_io()``.

        Yields:
            Exactly one ``Document`` containing the extracted text, with
            ``source`` (taken from ``blob.source``) and ``title`` metadata.
            ``title`` is an empty string when the page has no ``<title>``
            element or the element is empty.
        """
        from bs4 import BeautifulSoup

        with blob.as_bytes_io() as f:
            soup = BeautifulSoup(f, **self.bs_kwargs)

        text = soup.get_text(self.get_text_separator)

        title_tag = soup.title
        if title_tag is None:
            title = ""
        elif title_tag.string is not None:
            title = str(title_tag.string)
        else:
            # ``.string`` is None when <title> is empty or has several
            # children; str(None) would store the literal text "None".
            # get_text() returns "" for an empty tag and the joined text
            # otherwise.
            title = title_tag.get_text()

        metadata: Dict[str, Union[str, None]] = {
            "source": blob.source,
            "title": title,
        }
        yield Document(page_content=text, metadata=metadata)