# Source code for langchain_community.document_loaders.scrapingant
"""ScrapingAnt Web Extractor."""importloggingfromtypingimportIterator,List,Optionalfromlangchain_core.document_loadersimportBaseLoaderfromlangchain_core.documentsimportDocumentfromlangchain_core.utilsimportget_from_envlogger=logging.getLogger(__file__)
class ScrapingAntLoader(BaseLoader):
    """Turn an url to LLM accessible markdown with `ScrapingAnt`.

    For further details, visit: https://docs.scrapingant.com/python-client
    """

    def __init__(
        self,
        urls: List[str],
        *,
        api_key: Optional[str] = None,
        scrape_config: Optional[dict] = None,
        continue_on_failure: bool = True,
    ) -> None:
        """Initialize client.

        Args:
            urls: List of urls to scrape.
            api_key: The ScrapingAnt API key. If not specified must have
                env var SCRAPINGANT_API_KEY set.
            scrape_config: The scraping config from
                ScrapingAntClient.markdown_request
            continue_on_failure: Whether to continue if scraping an url fails.

        Raises:
            ImportError: If the `scrapingant-client` package is not installed.
            ValueError: If no urls are provided.
        """
        try:
            from scrapingant_client import ScrapingAntClient
        except ImportError:
            raise ImportError(
                "`scrapingant-client` package not found,"
                " run `pip install scrapingant-client`"
            )
        if not urls:
            raise ValueError("URLs must be provided.")
        api_key = api_key or get_from_env("api_key", "SCRAPINGANT_API_KEY")
        self.client = ScrapingAntClient(token=api_key)
        self.urls = urls
        self.scrape_config = scrape_config
        self.continue_on_failure = continue_on_failure

    def lazy_load(self) -> Iterator[Document]:
        """Fetch data from ScrapingAnt.

        Yields:
            One Document per successfully scraped url, with the page markdown
            as content and the resolved url in metadata.

        Raises:
            Exception: Re-raises the scraper's error for a failing url when
                ``continue_on_failure`` is False.
        """
        # None means "no config"; markdown_request gets no extra kwargs then.
        scrape_config = self.scrape_config if self.scrape_config is not None else {}
        for url in self.urls:
            try:
                result = self.client.markdown_request(url=url, **scrape_config)
                yield Document(
                    page_content=result.markdown,
                    metadata={"url": result.url},
                )
            except Exception as e:
                if self.continue_on_failure:
                    # Best-effort mode: log and move on to the next url.
                    # Lazy %-formatting defers string building to the handler.
                    logger.error("Error fetching data from %s, exception: %s", url, e)
                else:
                    # Bare raise preserves the original traceback (unlike
                    # the original `raise e`, which re-raises from this frame).
                    raise