class SpiderLoader(BaseLoader):
    """Load web pages as Documents using Spider AI.

    Must have the Python package `spider-client` installed and a Spider API key.
    See https://spider.cloud for more.
    """

    def __init__(
        self,
        url: str,
        *,
        api_key: Optional[str] = None,
        mode: Literal["scrape", "crawl"] = "scrape",
        params: Optional[dict] = None,
    ):
        """Initialize with API key and URL.

        Args:
            url: The URL to be processed.
            api_key: The Spider API key. If not specified, will be read from env
                var `SPIDER_API_KEY`.
            mode: The mode to run the loader in. Default is "scrape".
                Options include "scrape" (single page) and "crawl" (with deeper
                crawling following subpages).
            params: Additional parameters for the Spider API. When omitted,
                markdown output with metadata is requested.

        Raises:
            ImportError: If the `spider` module cannot be imported.
            ValueError: If `mode` is neither "scrape" nor "crawl".
        """
        if params is None:
            # Using the metadata param slightly slows down the output
            params = {
                "return_format": "markdown",
                "metadata": True,
            }

        # Imported lazily so that the dependency is only required when this
        # loader is actually instantiated.
        try:
            from spider import Spider
        except ImportError as e:
            raise ImportError(
                "`spider` package not found, please run `pip install spider-client`"
            ) from e

        if mode not in ("scrape", "crawl"):
            raise ValueError(
                f"Unrecognized mode '{mode}'. Expected one of 'scrape', 'crawl'."
            )

        # Use the environment variable if the API key isn't provided
        api_key = api_key or get_from_env("api_key", "SPIDER_API_KEY")
        self.spider = Spider(api_key=api_key)
        self.url = url
        self.mode = mode
        self.params = params

    def lazy_load(self) -> Iterator[Document]:
        """Load documents based on the specified mode.

        In "scrape" mode only the first entry of the scrape response is used;
        in "crawl" mode every entry of the crawl response is used. An empty or
        falsy response yields nothing.

        Yields:
            A Document for each page whose "content" is not None. Missing
            content defaults to an empty string, and missing or None metadata
            becomes an empty dict.
        """
        if self.mode == "scrape":
            # Scrape a single page
            response = self.spider.scrape_url(self.url, params=self.params)
            pages = [response[0]] if response else []
        elif self.mode == "crawl":
            # Crawl multiple pages
            response = self.spider.crawl_url(self.url, params=self.params)
            pages = list(response) if response else []
        else:
            # `mode` is validated in __init__, but the attribute may have been
            # reassigned afterwards; an unknown mode produces no documents.
            pages = []

        for page in pages:
            page_content = page.get("content", "")
            metadata = page.get("metadata", {})

            # Pages whose content is explicitly None are skipped.
            if page_content is None:
                continue

            # The `.get` default only covers a missing key; a key that is
            # present with a None value must be normalised separately so that
            # Document always receives a dict.
            if metadata is None:
                metadata = {}

            yield Document(page_content=page_content, metadata=metadata)