Source code for langchain_community.document_loaders.web_base
"""Web base loader class."""importasyncioimportloggingimportwarningsfromtypingimportAny,Dict,Iterator,List,Optional,Sequence,Unionimportaiohttpimportrequestsfromlangchain_core.documentsimportDocumentfromlangchain_community.document_loaders.baseimportBaseLoaderfromlangchain_community.utils.user_agentimportget_user_agentlogger=logging.getLogger(__name__)default_header_template={"User-Agent":get_user_agent(),"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*"";q=0.8","Accept-Language":"en-US,en;q=0.5","Referer":"https://www.google.com/","DNT":"1","Connection":"keep-alive","Upgrade-Insecure-Requests":"1",}def_build_metadata(soup:Any,url:str)->dict:"""Build metadata from BeautifulSoup output."""metadata={"source":url}iftitle:=soup.find("title"):metadata["title"]=title.get_text()ifdescription:=soup.find("meta",attrs={"name":"description"}):metadata["description"]=description.get("content","No description found.")ifhtml:=soup.find("html"):metadata["language"]=html.get("lang","No language found.")returnmetadata
class WebBaseLoader(BaseLoader):

    def __init__(
        self,
        web_path: Union[str, Sequence[str]] = "",
        header_template: Optional[dict] = None,
        verify_ssl: bool = True,
        proxies: Optional[dict] = None,
        continue_on_failure: bool = False,
        autoset_encoding: bool = True,
        encoding: Optional[str] = None,
        web_paths: Sequence[str] = (),
        requests_per_second: int = 2,
        default_parser: str = "html.parser",
        requests_kwargs: Optional[Dict[str, Any]] = None,
        raise_for_status: bool = False,
        bs_get_text_kwargs: Optional[Dict[str, Any]] = None,
        bs_kwargs: Optional[Dict[str, Any]] = None,
        session: Any = None,
        *,
        show_progress: bool = True,
    ) -> None:
        """Initialize loader.

        Args:
            web_paths: Web paths to load from.
            requests_per_second: Max number of concurrent requests to make.
            default_parser: Default parser to use for BeautifulSoup.
            requests_kwargs: kwargs for requests.
            raise_for_status: Raise an exception if http status code denotes an error.
            bs_get_text_kwargs: kwargs for beautifulsoup4 get_text.
            bs_kwargs: kwargs for beautifulsoup4 web page parsing.
            show_progress: Show progress bar when loading pages.
        """
        # web_path kept for backwards-compatibility.
        if web_path and web_paths:
            raise ValueError(
                "Received web_path and web_paths. Only one can be specified. "
                "web_path is deprecated, web_paths should be used."
            )
        if web_paths:
            self.web_paths = list(web_paths)
        elif isinstance(web_path, str):
            self.web_paths = [web_path]
        elif isinstance(web_path, Sequence):
            self.web_paths = list(web_path)
        else:
            raise TypeError(
                f"web_path must be str or Sequence[str] got ({type(web_path)}) or"
                f" web_paths must be Sequence[str] got ({type(web_paths)})"
            )
        self.requests_per_second = requests_per_second
        self.default_parser = default_parser
        self.requests_kwargs = requests_kwargs or {}
        self.raise_for_status = raise_for_status
        self.show_progress = show_progress
        self.bs_get_text_kwargs = bs_get_text_kwargs or {}
        self.bs_kwargs = bs_kwargs or {}
        if session:
            self.session = session
        else:
            session = requests.Session()
            header_template = header_template or default_header_template.copy()
            if not header_template.get("User-Agent"):
                try:
                    from fake_useragent import UserAgent

                    header_template["User-Agent"] = UserAgent().random
                except ImportError:
                    logger.info(
                        "fake_useragent not found, using default user agent. "
                        "To get a realistic header for requests, "
                        "`pip install fake_useragent`."
                    )
            session.headers = dict(header_template)
            session.verify = verify_ssl
            if proxies:
                session.proxies.update(proxies)
            self.session = session
        self.continue_on_failure = continue_on_failure
        self.autoset_encoding = autoset_encoding
        self.encoding = encoding
    @property
    def web_path(self) -> str:
        if len(self.web_paths) > 1:
            raise ValueError("Multiple webpaths found.")
        return self.web_paths[0]

    async def _fetch(
        self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
    ) -> str:
        async with aiohttp.ClientSession() as session:
            for i in range(retries):
                try:
                    kwargs: Dict = dict(
                        headers=self.session.headers,
                        cookies=self.session.cookies.get_dict(),
                    )
                    if not self.session.verify:
                        kwargs["ssl"] = False
                    async with session.get(url, **kwargs) as response:
                        if self.raise_for_status:
                            response.raise_for_status()
                        return await response.text()
                except aiohttp.ClientConnectionError as e:
                    if i == retries - 1:
                        raise
                    else:
                        logger.warning(
                            f"Error fetching {url} with attempt "
                            f"{i + 1}/{retries}: {e}. Retrying..."
                        )
                        await asyncio.sleep(cooldown * backoff**i)
        raise ValueError("retry count exceeded")

    async def _fetch_with_rate_limit(
        self, url: str, semaphore: asyncio.Semaphore
    ) -> str:
        async with semaphore:
            try:
                return await self._fetch(url)
            except Exception as e:
                if self.continue_on_failure:
                    logger.warning(
                        f"Error fetching {url}, skipping due to"
                        f" continue_on_failure=True"
                    )
                    return ""
                logger.exception(
                    f"Error fetching {url} and aborting, use continue_on_failure=True "
                    "to continue loading urls after encountering an error."
                )
                raise e
    async def fetch_all(self, urls: List[str]) -> Any:
        """Fetch all urls concurrently with rate limiting."""
        semaphore = asyncio.Semaphore(self.requests_per_second)
        tasks = []
        for url in urls:
            task = asyncio.ensure_future(self._fetch_with_rate_limit(url, semaphore))
            tasks.append(task)
        try:
            if self.show_progress:
                from tqdm.asyncio import tqdm_asyncio

                return await tqdm_asyncio.gather(
                    *tasks, desc="Fetching pages", ascii=True, mininterval=1
                )
            else:
                return await asyncio.gather(*tasks)
        except ImportError:
            warnings.warn("For better logging of progress, `pip install tqdm`")
            return await asyncio.gather(*tasks)
    @staticmethod
    def _check_parser(parser: str) -> None:
        """Check that parser is valid for bs4."""
        valid_parsers = ["html.parser", "lxml", "xml", "lxml-xml", "html5lib"]
        if parser not in valid_parsers:
            raise ValueError(
                "`parser` must be one of " + ", ".join(valid_parsers) + "."
            )
    def scrape_all(
        self, urls: List[str], parser: Union[str, None] = None
    ) -> List[Any]:
        """Fetch all urls, then return soups for all results."""
        from bs4 import BeautifulSoup

        results = asyncio.run(self.fetch_all(urls))
        final_results = []
        for i, result in enumerate(results):
            url = urls[i]
            if parser is None:
                if url.endswith(".xml"):
                    parser = "xml"
                else:
                    parser = self.default_parser
                self._check_parser(parser)
            final_results.append(BeautifulSoup(result, parser, **self.bs_kwargs))

        return final_results
    def scrape(self, parser: Union[str, None] = None) -> Any:
        """Scrape data from webpage and return it in BeautifulSoup format."""
        if parser is None:
            parser = self.default_parser

        return self._scrape(self.web_path, parser=parser, bs_kwargs=self.bs_kwargs)
    def lazy_load(self) -> Iterator[Document]:
        """Lazy load text from the url(s) in web_path."""
        for path in self.web_paths:
            soup = self._scrape(path, bs_kwargs=self.bs_kwargs)
            text = soup.get_text(**self.bs_get_text_kwargs)
            metadata = _build_metadata(soup, path)
            yield Document(page_content=text, metadata=metadata)
    def aload(self) -> List[Document]:  # type: ignore
        """Load text from the urls in web_path async into Documents."""
        results = self.scrape_all(self.web_paths)
        docs = []
        for path, soup in zip(self.web_paths, results):
            text = soup.get_text(**self.bs_get_text_kwargs)
            metadata = _build_metadata(soup, path)
            docs.append(Document(page_content=text, metadata=metadata))

        return docs
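Taken together, the loader fetches each entry in web_paths (synchronously through requests in lazy_load, or concurrently through aiohttp in aload), parses the responses with BeautifulSoup, and wraps the extracted text and page metadata in Document objects. A minimal usage sketch, assuming beautifulsoup4 is installed, the class is exported as langchain_community.document_loaders.WebBaseLoader, and https://example.com stands in for a real URL:

    from langchain_community.document_loaders import WebBaseLoader

    loader = WebBaseLoader(
        web_paths=["https://example.com"],
        requests_per_second=2,  # caps concurrent async fetches via the semaphore
        bs_get_text_kwargs={"separator": " ", "strip": True},
    )

    # Synchronous path: one requests.Session GET per URL.
    docs = list(loader.lazy_load())

    # Concurrent path: aiohttp via fetch_all/scrape_all. In the listing above,
    # aload() calls asyncio.run() internally, so it should not be invoked from
    # inside an already-running event loop.
    docs = loader.aload()

    print(docs[0].metadata)  # e.g. {'source': 'https://example.com', 'title': ..., ...}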