[docs]classQuipLoader(BaseLoader):"""Load `Quip` pages. Port of https://github.com/quip/quip-api/tree/master/samples/baqup """
def __init__(
    self,
    api_url: str,
    access_token: str,
    request_timeout: Optional[int] = 60,
    *,
    allow_dangerous_xml_parsing: bool = False,
):
    """Create a loader backed by a ``QuipClient``.

    Args:
        api_url: Base URL of the Quip API, e.g. https://platform.quip.com
        access_token: Access token for the Quip API. Please refer:
            https://quip.com/dev/automation/documentation/current#section/Authentication/Get-Access-to-Quip's-APIs
        request_timeout: Timeout passed to the client, default 60.
        allow_dangerous_xml_parsing: Explicit opt-in to XML parsing with the
            built-in parser; defaults to False, in which case construction
            is refused.

    Raises:
        ValueError: If ``allow_dangerous_xml_parsing`` is not set to True.
        ImportError: If the ``quip_api`` package is not installed.
    """
    # Check the opt-in first so that a loader which is going to be refused
    # never imports the client library or builds a client.
    if not allow_dangerous_xml_parsing:
        raise ValueError(
            "The quip client uses the built-in XML parser which may cause "
            "security issues when parsing XML data in some cases. "
            "Please see "
            "https://docs.python.org/3/library/xml.html#xml-vulnerabilities "
            "For more information, set `allow_dangerous_xml_parsing` as True "
            "if you are sure that your distribution of the standard library "
            "is not vulnerable to XML vulnerabilities."
        )

    try:
        from quip_api.quip import QuipClient
    except ImportError as exc:
        raise ImportError(
            "`quip_api` package not found, please run `pip install quip_api`"
        ) from exc

    self.quip_client = QuipClient(
        access_token=access_token,
        base_url=api_url,
        request_timeout=request_timeout,
    )
def load(
    self,
    folder_ids: Optional[List[str]] = None,
    thread_ids: Optional[List[str]] = None,
    max_docs: Optional[int] = 1000,
    include_all_folders: bool = False,
    include_comments: bool = False,
    include_images: bool = False,
) -> List[Document]:
    """Collect thread ids from the requested sources and load them.

    Args:
        folder_ids: List of specific folder IDs to load, defaults to None.
        thread_ids: List of specific thread IDs to load, defaults to None.
            The list passed in is not modified.
        max_docs: Maximum number of docs to retrieve in total, defaults to
            1000. ``None`` disables the limit.
        include_all_folders: Include all folders that your access_token can
            access, but doesn't include your private folder.
        include_comments: Include comments, defaults to False.
        include_images: Include images, defaults to False.

    Returns:
        The documents produced by ``process_threads`` for the selected
        thread ids.

    Raises:
        ValueError: If none of ``folder_ids``, ``thread_ids`` or
            ``include_all_folders`` is provided.
    """
    if not folder_ids and not thread_ids and not include_all_folders:
        raise ValueError(
            "Must specify at least one among `folder_ids`, `thread_ids` "
            "or set `include_all_folders` as True"
        )

    # Work on a copy: the folder walk appends to this list in place and the
    # caller's argument must not grow as a side effect.
    collected: List[str] = list(thread_ids or [])

    for folder_id in folder_ids or []:
        self.get_thread_ids_by_folder_id(folder_id, 0, collected)

    if include_all_folders:
        user = self.quip_client.get_authenticated_user()
        # Each of these keys holds a list of folder ids, while the folder
        # walk takes a single id, so visit them one by one.
        for key in ("group_folder_ids", "shared_folder_ids"):
            for folder_id in user.get(key) or []:
                self.get_thread_ids_by_folder_id(folder_id, 0, collected)

    # De-duplicate before truncating (so duplicates do not eat into the
    # max_docs budget) and keep first-seen order for a stable result.
    unique_ids = list(dict.fromkeys(collected))[:max_docs]
    return self.process_threads(unique_ids, include_images, include_comments)
def get_thread_ids_by_folder_id(
    self, folder_id: str, depth: int, thread_ids: List[str]
) -> None:
    """Recursively collect the thread ids found under a folder.

    Fetches the folder through the Quip client, descends into child folders
    and appends every child thread id to ``thread_ids`` in place. A folder
    that cannot be fetched is logged and skipped rather than raising.

    Args:
        folder_id: Id of the folder to read.
        depth: Recursion depth of this call, used only in log messages.
        thread_ids: Output list; discovered thread ids are appended to it.
    """
    from quip_api.quip import HTTPError, QuipError

    try:
        folder = self.quip_client.get_folder(folder_id)
    except QuipError as e:
        if e.code == 403:
            logger.warning(
                "depth %s, Skipped over restricted folder %s, %s",
                depth,
                folder_id,
                e,
            )
        else:
            logger.warning(
                "depth %s, Skipped over folder %s due to unknown error %s",
                depth,
                folder_id,
                e.code,
            )
        return
    except HTTPError as e:
        logger.warning(
            "depth %s, Skipped over folder %s due to HTTP error %s",
            depth,
            folder_id,
            e.code,
        )
        return

    # Fall back to a synthetic title when the folder record has none.
    title = folder["folder"].get("title", f"Folder {folder_id}")
    logger.info("depth %s, Processing folder %s", depth, title)

    for child in folder["children"]:
        if "folder_id" in child:
            self.get_thread_ids_by_folder_id(
                child["folder_id"], depth + 1, thread_ids
            )
        elif "thread_id" in child:
            thread_ids.append(child["thread_id"])
def process_threads(
    self, thread_ids: Sequence[str], include_images: bool, include_messages: bool
) -> List[Document]:
    """Turn each thread id into a document, dropping threads that yield none.

    Threads are processed one at a time in the order given; any thread for
    which ``process_thread`` returns ``None`` is left out of the result.
    """
    # Lazy generator: threads are still processed sequentially, in order.
    converted = (
        self.process_thread(tid, include_images, include_messages)
        for tid in thread_ids
    )
    return [document for document in converted if document is not None]
def process_thread(
    self, thread_id: str, include_images: bool, include_messages: bool
) -> Optional[Document]:
    """Load one thread and convert it into a ``Document``.

    Args:
        thread_id: Id of the thread to fetch.
        include_images: Append the text returned by
            ``process_thread_images`` to the page content.
        include_messages: Append the text returned by
            ``process_thread_messages`` to the page content.

    Returns:
        A ``Document`` whose content is the thread HTML followed by any
        requested extra text, or ``None`` when the thread has no ``html``
        entry or its HTML cannot be parsed.
    """
    # Imported locally so the block does not depend on how the file imports
    # the xml package. `xml.etree.cElementTree` no longer exists on
    # Python 3.9+, so the exception class must come from ElementTree.
    from xml.etree.ElementTree import ParseError

    thread = self.quip_client.get_thread(thread_id)
    info = thread["thread"]
    thread_id = info["id"]
    title = info["title"]
    link = info["link"]
    update_ts = info["updated_usec"]
    sanitized_title = QuipLoader._sanitize_title(title)

    logger.info(
        f"processing thread {thread_id} title {sanitized_title} "
        f"link {link} update_ts {update_ts}"
    )

    if "html" not in thread:
        return None

    # Parse the document; an unparsable thread is skipped, not fatal.
    try:
        tree = self.quip_client.parse_document_html(thread["html"])
    except ParseError as e:
        logger.error(f"Error parsing thread {title} {thread_id}, skipping, {e}")
        return None

    metadata = {
        "title": sanitized_title,
        "update_ts": update_ts,
        "id": thread_id,
        "source": link,
    }

    extra_text = ""
    if include_images:
        extra_text = self.process_thread_images(tree)
    if include_messages:
        # Separate the messages from what precedes them with a real newline.
        extra_text = extra_text + "\n" + self.process_thread_messages(thread_id)

    return Document(
        page_content=thread["html"] + extra_text,
        metadata=metadata,
    )
def process_thread_images(self, tree: ElementTree) -> str:
    """Run OCR over the blob images referenced in a parsed thread.

    Every ``img`` element whose ``src`` starts with ``/blob`` is downloaded
    through the Quip client and passed to pytesseract.

    Args:
        tree: Parsed thread document to scan for ``img`` elements.

    Returns:
        The recognised text of all images, each piece preceded by a newline;
        an empty string when no image qualifies.

    Raises:
        ImportError: If Pillow or pytesseract is not installed.
        OSError: If an image cannot be opened or converted; the error is
            logged and re-raised.
    """
    try:
        from PIL import Image
        from pytesseract import pytesseract
    except ImportError as exc:
        raise ImportError(
            "`Pillow or pytesseract` package not found, "
            "please run "
            "`pip install Pillow` or `pip install pytesseract`"
        ) from exc

    pieces = []
    for img in tree.iter("img"):
        src = img.get("src")
        if not src or not src.startswith("/blob"):
            continue
        # src is expected to look like "/blob/<thread_id>/<blob_id>".
        _, _, thread_id, blob_id = src.split("/")
        blob_response = self.quip_client.get_blob(thread_id, blob_id)
        try:
            image = Image.open(BytesIO(blob_response.read()))
            pieces.append(pytesseract.image_to_string(image))
        except OSError as e:
            logger.error(f"failed to convert image to text, {e}")
            raise
    # Same layout as appending "\n" + text per image, built in one pass.
    return "".join(f"\n{piece}" for piece in pieces)