class BrowserlessLoader(BaseLoader):
    """Load webpages through the hosted `Browserless` HTTP API.

    Depending on ``text_content`` the loader calls either the ``/scrape``
    endpoint (text of the ``body`` element) or the ``/content`` endpoint
    (raw response body) of ``https://chrome.browserless.io``.
    """

    def __init__(
        self,
        api_token: str,
        urls: Union[str, List[str]],
        text_content: bool = True,
    ):
        """Initialize with API token and the URLs to scrape.

        Args:
            api_token: Browserless API token, sent as the ``token`` query
                parameter on every request.
            urls: A single URL or a list of URLs to scrape.
            text_content: If True (default), use ``/scrape`` and keep only
                the text of the ``body`` element. If False, use ``/content``
                and keep the raw response body.
        """
        # Browserless API token.
        self.api_token = api_token
        # List of URLs to scrape. A bare string is wrapped in a list so that
        # ``lazy_load`` iterates over URLs instead of over the characters of
        # a single URL.
        self.urls = [urls] if isinstance(urls, str) else urls
        # Selects the endpoint used by ``lazy_load`` (see above).
        self.text_content = text_content

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load Documents from URLs.

        One HTTP POST is issued per URL, in order, as the iterator is
        consumed. The HTTP status code of the response is not inspected.

        Yields:
            One ``Document`` per URL, with ``metadata={"source": url}``.
        """
        # The query parameters are identical for every request.
        params = {"token": self.api_token}

        for url in self.urls:
            if self.text_content:
                response = requests.post(
                    "https://chrome.browserless.io/scrape",
                    params=params,
                    json={
                        "url": url,
                        "elements": [{"selector": "body"}],
                    },
                )
                # Only one selector ("body") is requested, so take the first
                # data entry and the first match within it.
                page_content = response.json()["data"][0]["results"][0]["text"]
            else:
                response = requests.post(
                    "https://chrome.browserless.io/content",
                    params=params,
                    json={"url": url},
                )
                page_content = response.text

            yield Document(
                page_content=page_content,
                metadata={"source": url},
            )