def __init__(
    self,
    api_token: str,
    urls: List[str],
    continue_on_failure: bool = True,
):
    """Store the Diffbot credentials and the URLs to be loaded.

    Args:
        api_token: Diffbot API token sent with every request.
        urls: List of URLs whose content should be loaded.
        continue_on_failure: When True (the default), a URL that fails is
            logged and skipped by ``load``; when False, the exception is
            re-raised to the caller.
    """
    # Plain attribute storage only: no validation or copying is performed,
    # so ``urls`` remains the caller's list object.
    self.continue_on_failure = continue_on_failure
    self.urls = urls
    self.api_token = api_token
def _diffbot_api_url(self, diffbot_api: str) -> str:
    """Return the Diffbot v3 endpoint URL for the API named ``diffbot_api``."""
    endpoint = f"https://api.diffbot.com/v3/{diffbot_api}"
    return endpoint


def _get_diffbot_data(self, url: str) -> Any:
    """Request Diffbot's ``article`` endpoint for ``url`` and return the JSON.

    Args:
        url: The page URL passed to Diffbot as the ``url`` query parameter.

    Returns:
        The decoded JSON body when the response is OK, otherwise an empty
        dict.
    """
    # TODO: Add support for other Diffbot APIs
    response = requests.get(
        self._diffbot_api_url("article"),
        params={
            "token": self.api_token,
            "url": url,
        },
        timeout=10,
    )

    # TODO: handle non-ok errors
    # A non-OK status is currently collapsed into an empty payload rather
    # than raised, so callers see "no data" instead of an exception.
    if not response.ok:
        return {}
    return response.json()
def load(self) -> List[Document]:
    """Fetch every configured URL through Diffbot and build Documents.

    For each URL the text of the first entry under ``"objects"`` in the
    Diffbot payload becomes the page content; a payload without an
    ``"objects"`` key yields an empty string. Each Document carries the
    originating URL in its ``"source"`` metadata.

    Returns:
        The Documents that were built, in the order of ``self.urls``.

    Raises:
        Exception: Whatever was raised while fetching or processing a URL,
            but only when ``continue_on_failure`` is False; otherwise the
            error is logged and that URL is skipped.
    """
    collected: List[Document] = []
    for target in self.urls:
        try:
            payload = self._get_diffbot_data(target)
            if "objects" in payload:
                # Only the first extracted object is used.
                body = payload["objects"][0]["text"]
            else:
                body = ""
            info = {"source": target}
            collected.append(Document(page_content=body, metadata=info))
        except Exception as err:
            if not self.continue_on_failure:
                raise err
            logger.error(f"Error fetching or processing {target}, exception: {err}")
    return collected