Source code for langchain_community.document_loaders.docusaurus
"""Load Documents from Docusaurus Documentation"""fromtypingimportAny,List,Optionalfromlangchain_community.document_loaders.sitemapimportSitemapLoader
class DocusaurusLoader(SitemapLoader):
    """Load from Docusaurus Documentation.

    It leverages the SitemapLoader to loop through the generated pages of a
    Docusaurus Documentation website and extracts the content by looking for
    specific HTML tags. By default, the parser searches for the main content
    of the Docusaurus page, which is normally the <article>. You can also
    define your own custom HTML tags by providing them as a list,
    for example: ["div", ".main", "a"].
    """

    def __init__(
        self,
        url: str,
        custom_html_tags: Optional[List[str]] = None,
        **kwargs: Any,
    ):
        """Initialize DocusaurusLoader

        Args:
            url: The base URL of the Docusaurus website. Unless ``is_local``
                is truthy, ``/sitemap.xml`` is appended to it (a trailing
                slash on the base URL is tolerated).
            custom_html_tags: Optional custom html tags (CSS selectors) to
                extract content from pages. Defaults to ``["main article"]``.
            kwargs: Additional args to extend the underlying SitemapLoader,
                for example: filter_urls, blocksize, meta_function, is_local,
                continue_on_failure. A ``parsing_function`` given here
                overrides the built-in tag-based parser.
        """
        if not kwargs.get("is_local"):
            # Strip a trailing slash so "https://site/" does not become
            # "https://site//sitemap.xml".
            url = f"{url.rstrip('/')}/sitemap.xml"

        self.custom_html_tags = custom_html_tags or ["main article"]

        # Pop (rather than get) the override: leaving it in ``kwargs`` while
        # also passing ``parsing_function=`` explicitly would raise
        # "TypeError: got multiple values for keyword argument".
        parsing_function = (
            kwargs.pop("parsing_function", None) or self._parsing_function
        )

        super().__init__(
            url,
            parsing_function=parsing_function,
            **kwargs,
        )

    def _parsing_function(self, content: Any) -> str:
        """Parses specific elements from a Docusaurus page.

        Args:
            content: The parsed page; it must provide ``select`` and
                ``get_text`` (presumably a BeautifulSoup document — the
                object is supplied by the parent loader).

        Returns:
            The concatenated text of every element matched by
            ``self.custom_html_tags``. If no element matches, the text of
            the whole page is returned so the page is not silently emptied.
        """
        relevant_elements = content.select(",".join(self.custom_html_tags))
        if not relevant_elements:
            return str(content.get_text())

        # Compare by identity: two distinct tags with identical markup must
        # not be mistaken for one another.
        selected_ids = {id(element) for element in relevant_elements}

        # An element nested inside another selected element is already part
        # of its ancestor's text; emitting it again would duplicate content.
        outermost = [
            element
            for element in relevant_elements
            if not any(id(parent) in selected_ids for parent in element.parents)
        ]

        return "".join(str(element.get_text()) for element in outermost)