Source code for langchain_community.document_loaders.scrapfly
"""Scrapfly Web Reader."""importloggingfromtypingimportIterator,List,Literal,Optionalfromlangchain_core.document_loadersimportBaseLoaderfromlangchain_core.documentsimportDocumentfromlangchain_core.utilsimportget_from_envlogger=logging.getLogger(__file__)
class ScrapflyLoader(BaseLoader):
    """Turn a URL into LLM-accessible markdown with `Scrapfly.io`.

    For further details, visit: https://scrapfly.io/docs/sdk/python
    """
    def __init__(
        self,
        urls: List[str],
        *,
        api_key: Optional[str] = None,
        scrape_format: Literal["markdown", "text"] = "markdown",
        scrape_config: Optional[dict] = None,
        continue_on_failure: bool = True,
    ) -> None:
        """Initialize client.

        Args:
            urls: List of URLs to scrape.
            api_key: The Scrapfly API key. If not specified, the env var
                SCRAPFLY_API_KEY must be set.
            scrape_format: Scrape result format, one of "markdown" or "text".
            scrape_config: Dictionary of ScrapFly scrape config options.
            continue_on_failure: Whether to continue if scraping a URL fails.
        """
        try:
            from scrapfly import ScrapflyClient
        except ImportError:
            raise ImportError(
                "`scrapfly` package not found, please run `pip install scrapfly-sdk`"
            )
        if not urls:
            raise ValueError("URLs must be provided.")
        api_key = api_key or get_from_env("api_key", "SCRAPFLY_API_KEY")
        self.scrapfly = ScrapflyClient(key=api_key)
        self.urls = urls
        self.scrape_format = scrape_format
        self.scrape_config = scrape_config
        self.continue_on_failure = continue_on_failure
    def lazy_load(self) -> Iterator[Document]:
        """Lazily scrape each URL and yield one Document per page."""
        from scrapfly import ScrapeConfig

        scrape_config = self.scrape_config if self.scrape_config is not None else {}
        for url in self.urls:
            try:
                response = self.scrapfly.scrape(
                    ScrapeConfig(url, format=self.scrape_format, **scrape_config)
                )
                yield Document(
                    page_content=response.scrape_result["content"],
                    metadata={"url": url},
                )
            except Exception as e:
                if self.continue_on_failure:
                    logger.error(f"Error fetching data from {url}, exception: {e}")
                else:
                    raise e
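A minimal usage sketch. The API key, target URL, and the `scrape_config` entries below are illustrative placeholders (`asp` and `render_js` are ScrapFly scrape options; see ScrapFly's docs for the full set):

from langchain_community.document_loaders import ScrapflyLoader

loader = ScrapflyLoader(
    urls=["https://web-scraping.dev/products"],
    api_key="your-scrapfly-api-key",  # or set the SCRAPFLY_API_KEY env var
    scrape_format="markdown",
    scrape_config={"asp": True, "render_js": True},  # illustrative ScrapFly options
    continue_on_failure=True,
)

# lazy_load() yields Documents one at a time; load() collects them into a list.
for doc in loader.lazy_load():
    print(doc.metadata["url"], len(doc.page_content))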