from typing import Any, Iterator, List, Optional
from urllib.parse import urljoin, urlparse

from langchain_core.documents import Document

from langchain_community.document_loaders.web_base import WebBaseLoader


class GitbookLoader(WebBaseLoader):
    """Load `GitBook` data.

    1. load from either a single page, or
    2. load all (relative) paths in the navbar.
    """
    def __init__(
        self,
        web_page: str,
        load_all_paths: bool = False,
        base_url: Optional[str] = None,
        content_selector: str = "main",
        continue_on_failure: bool = False,
        show_progress: bool = True,
    ):
        """Initialize with web page and whether to load all paths.

        Args:
            web_page: The web page to load or the starting point from where
                relative paths are discovered.
            load_all_paths: If set to True, all relative paths in the navbar
                are loaded instead of only `web_page`.
            base_url: If `load_all_paths` is True, the relative paths are
                appended to this base url. Defaults to `web_page`.
            content_selector: The CSS selector for the content to load.
                Defaults to "main".
            continue_on_failure: Whether to continue loading the sitemap if an
                error occurs loading a url, emitting a warning instead of
                raising an exception. Setting this to True makes the loader
                more robust, but also may result in missing data.
                Default: False.
            show_progress: Whether to show a progress bar while loading.
                Default: True.
        """
        self.base_url = base_url or web_page
        if self.base_url.endswith("/"):
            self.base_url = self.base_url[:-1]
        if load_all_paths:
            # set web_path to the sitemap if we want to crawl all paths
            web_page = f"{self.base_url}/sitemap.xml"
        super().__init__(
            web_paths=(web_page,),
            continue_on_failure=continue_on_failure,
            show_progress=show_progress,
        )
        self.load_all_paths = load_all_paths
        self.content_selector = content_selector
    def lazy_load(self) -> Iterator[Document]:
        """Fetch text from a single GitBook page, or from every page found
        in the sitemap when `load_all_paths` is True."""
        if self.load_all_paths:
            soup_info = self.scrape()
            relative_paths = self._get_paths(soup_info)
            urls = [urljoin(self.base_url, path) for path in relative_paths]
            soup_infos = self.scrape_all(urls)
            for soup_info, url in zip(soup_infos, urls):
                doc = self._get_document(soup_info, url)
                if doc:
                    yield doc
        else:
            soup_info = self.scrape()
            doc = self._get_document(soup_info, self.web_path)
            if doc:
                yield doc
    def _get_document(
        self, soup: Any, custom_url: Optional[str] = None
    ) -> Optional[Document]:
        """Fetch content from page and return Document."""
        page_content_raw = soup.find(self.content_selector)
        if not page_content_raw:
            return None
        content = page_content_raw.get_text(separator="\n").strip()
        title_if_exists = page_content_raw.find("h1")
        title = title_if_exists.text if title_if_exists else ""
        metadata = {"source": custom_url or self.web_path, "title": title}
        return Document(page_content=content, metadata=metadata)

    def _get_paths(self, soup: Any) -> List[str]:
        """Fetch all relative paths in the navbar."""
        return [urlparse(loc.text).path for loc in soup.find_all("loc")]
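
# Usage sketch (not part of the class above): a minimal, hedged example of
# driving GitbookLoader in both modes. The URLs are placeholders, and the
# example assumes `langchain_community` and its `bs4`/`requests` dependencies
# are installed.
if __name__ == "__main__":
    # 1. Load a single GitBook page: only the element matched by
    #    `content_selector` ("main" by default) becomes the Document text.
    single_page_loader = GitbookLoader("https://docs.gitbook.com")
    for doc in single_page_loader.lazy_load():
        print(doc.metadata["title"], len(doc.page_content))

    # 2. Crawl every path listed in the site's sitemap.xml, continuing past
    #    individual page failures instead of raising.
    all_pages_loader = GitbookLoader(
        "https://docs.gitbook.com",
        load_all_paths=True,
        continue_on_failure=True,
    )
    docs = list(all_pages_loader.lazy_load())
    print(f"Loaded {len(docs)} pages")

    # 3. What `_get_paths` extracts: the path component of each <loc> entry in
    #    the sitemap. A hypothetical sitemap snippet, parsed with BeautifulSoup
    #    to mirror the soup that scrape() hands to the loader.
    from bs4 import BeautifulSoup

    sitemap_xml = """
    <urlset>
      <url><loc>https://docs.gitbook.com/getting-started</loc></url>
      <url><loc>https://docs.gitbook.com/api/reference</loc></url>
    </urlset>
    """
    soup = BeautifulSoup(sitemap_xml, "html.parser")
    print([urlparse(loc.text).path for loc in soup.find_all("loc")])
    # -> ['/getting-started', '/api/reference']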