class BSHTMLLoader(BaseLoader):
    """BSHTMLLoader document loader integration

    Setup:
        Install ``langchain-community`` and ``bs4``.

        .. code-block:: bash

            pip install -U langchain-community bs4

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import BSHTMLLoader

            loader = BSHTMLLoader(
                file_path="./example_data/fake-content.html",
            )

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            # async variant:
            # docs_lazy = await loader.alazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Test Title

            My First Heading
            My first paragraph.
            {'source': './example_data/fake-content.html', 'title': 'Test Title'}

    Async load:
        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Test Title

            My First Heading
            My first paragraph.
            {'source': './example_data/fake-content.html', 'title': 'Test Title'}

    """  # noqa: E501
    def __init__(
        self,
        file_path: Union[str, Path],
        open_encoding: Union[str, None] = None,
        bs_kwargs: Union[dict, None] = None,
        get_text_separator: str = "",
    ) -> None:
        """Initialize with a file path and, optionally, the file encoding to use
        and any kwargs to pass to the BeautifulSoup object.

        Args:
            file_path: The path to the file to load.
            open_encoding: The encoding to use when opening the file.
            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
            get_text_separator: The separator to use when calling get_text
                on the soup.
        """
        try:
            import bs4  # noqa: F401
        except ImportError:
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            )

        self.file_path = file_path
        self.open_encoding = open_encoding
        # Default to the lxml parser when no BeautifulSoup kwargs are given.
        if bs_kwargs is None:
            bs_kwargs = {"features": "lxml"}
        self.bs_kwargs = bs_kwargs
        self.get_text_separator = get_text_separator
    def lazy_load(self) -> Iterator[Document]:
        """Load HTML document into document objects."""
        from bs4 import BeautifulSoup

        with open(self.file_path, "r", encoding=self.open_encoding) as f:
            soup = BeautifulSoup(f, **self.bs_kwargs)

        text = soup.get_text(self.get_text_separator)

        if soup.title:
            title = str(soup.title.string)
        else:
            title = ""

        metadata: Dict[str, Union[str, None]] = {
            "source": str(self.file_path),
            "title": title,
        }
        yield Document(page_content=text, metadata=metadata)
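
# A minimal usage sketch, assuming ``beautifulsoup4`` is installed and that the
# example HTML file from the class docstring exists on disk. It shows how
# ``bs_kwargs`` can select a different parser and how ``get_text_separator``
# joins the extracted text nodes; adjust both to your document.
if __name__ == "__main__":
    loader = BSHTMLLoader(
        file_path="./example_data/fake-content.html",
        bs_kwargs={"features": "html.parser"},  # stdlib parser instead of lxml
        get_text_separator="\n",  # separate text nodes with newlines
    )
    for doc in loader.lazy_load():
        print(doc.metadata["title"])
        print(doc.page_content[:100])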