class MHTMLLoader(BaseLoader):
    """Parse `MHTML` files with `BeautifulSoup`.

    An MHTML file is a MIME message; every ``text/html`` part found in it is
    converted into one ``Document`` whose content is the text extracted from
    that part's markup.
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        open_encoding: Union[str, None] = None,
        bs_kwargs: Union[dict, None] = None,
        get_text_separator: str = "",
    ) -> None:
        """Initialize with a path and, optionally, a file encoding and parser kwargs.

        Args:
            file_path: Path to file to load.
            open_encoding: The encoding to use when opening the file. ``None``
                is passed through to ``open`` unchanged.
            bs_kwargs: Any kwargs to pass to the BeautifulSoup object. When
                ``None``, ``{"features": "lxml"}`` is used.
            get_text_separator: The separator to use when getting the text
                from the soup.

        Raises:
            ImportError: If the ``beautifulsoup4`` package is not installed.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as e:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            ) from e

        self.file_path = file_path
        self.open_encoding = open_encoding
        if bs_kwargs is None:
            bs_kwargs = {"features": "lxml"}
        self.bs_kwargs = bs_kwargs
        self.get_text_separator = get_text_separator

    def lazy_load(self) -> Iterator[Document]:
        """Load MHTML document into document objects.

        Yields:
            One ``Document`` per ``text/html`` MIME part. ``page_content`` is
            the text extracted from the part and ``metadata`` carries
            ``source`` (the file path as a string) and ``title`` (the text of
            the HTML ``<title>`` element, or ``""`` when there is none).
        """
        from bs4 import BeautifulSoup

        with open(self.file_path, "r", encoding=self.open_encoding) as f:
            message = email.message_from_string(f.read())

        # walk() visits the message itself and every sub-part depth-first, so
        # single-part messages, flat multipart archives and nested multipart
        # containers are all handled by the same loop.
        for part in message.walk():
            if part.get_content_type() != "text/html":
                continue

            payload = part.get_payload(decode=True)
            if not isinstance(payload, bytes):
                # Nothing decodable in this part; skip it.
                continue

            # Honour the charset declared on the part instead of assuming
            # UTF-8; fall back to a lossy UTF-8 decode if the declared charset
            # is unknown or does not match the bytes.
            charset = part.get_content_charset() or "utf-8"
            try:
                html = payload.decode(charset)
            except (LookupError, UnicodeDecodeError):
                html = payload.decode("utf-8", errors="replace")

            soup = BeautifulSoup(html, **self.bs_kwargs)
            text = soup.get_text(self.get_text_separator)

            # `.string` is None for an empty <title> or one with several child
            # nodes; str(None) would leak the literal "None" into the metadata.
            title_tag = soup.title
            if title_tag is None:
                title = ""
            elif title_tag.string is not None:
                title = str(title_tag.string)
            else:
                title = title_tag.get_text()

            metadata: Dict[str, Union[str, None]] = {
                "source": str(self.file_path),
                "title": title,
            }
            yield Document(page_content=text, metadata=metadata)