class RSpaceLoader(BaseLoader):
    """Load content from RSpace notebooks, folders, documents or PDF Gallery files.

    Each RSpace document maps 1-1 onto a Langchain ``Document``.
    PDFs are imported using PyPDF.

    Requirements are rspace_client (`pip install rspace_client`) and PyPDF if
    importing PDF docs (`pip install pypdf`).
    """

    def __init__(
        self,
        global_id: str,
        api_key: Optional[str] = None,
        url: Optional[str] = None,
    ):
        """Store the validated connection settings and the resource to load.

        Args:
            global_id: The global ID of the resource to load, e.g. 'SD12344'
                (a single document); 'GL12345' (a PDF file in the gallery);
                'NB4567' (a notebook); 'FL12244' (a folder).
            api_key: RSpace API key - can also be supplied as environment
                variable 'RSPACE_API_KEY'.
            url: The URL of your RSpace instance - can also be supplied as
                environment variable 'RSPACE_URL'.

        Raises:
            ValueError: If ``global_id`` is None.
        """
        args: Dict[str, Optional[str]] = {
            "api_key": api_key,
            "url": url,
            "global_id": global_id,
        }
        verified_args: Dict[str, str] = RSpaceLoader.validate_environment(args)
        self.api_key = verified_args["api_key"]
        self.url = verified_args["url"]
        self.global_id: str = verified_args["global_id"]

    @classmethod
    def validate_environment(cls, values: Dict) -> Dict:
        """Resolve the API key and URL, and check that a global ID was given.

        ``api_key`` and ``url`` are looked up through ``get_from_dict_or_env``
        using ``values`` first and the 'RSPACE_API_KEY' / 'RSPACE_URL'
        environment variables as fallback.

        Args:
            values: Mapping with the keys "api_key", "url" and "global_id".
                It is updated in place with the resolved settings.

        Returns:
            The same ``values`` mapping, after resolution.

        Raises:
            ValueError: If "global_id" is missing or None.
        """
        values["api_key"] = get_from_dict_or_env(values, "api_key", "RSPACE_API_KEY")
        values["url"] = get_from_dict_or_env(values, "url", "RSPACE_URL")
        if values.get("global_id") is None:
            raise ValueError(
                "No value supplied for global_id. Please supply an RSpace global ID"
            )
        return values

    def _create_rspace_client(self) -> Any:
        """Create a RSpace client.

        Returns:
            A ``(client, FieldContent)`` pair: the connected ``ELNClient`` and
            the ``FieldContent`` class used to extract text from a field.

        Raises:
            ImportError: If ``rspace_client`` is not installed.
            Exception: If the client cannot be created or its status call fails.
        """
        # Imported lazily so rspace_client stays an optional dependency.
        try:
            from rspace_client.eln import eln, field_content
        except ImportError as e:
            raise ImportError("You must run `pip install rspace_client`") from e

        try:
            # Distinct name: do not rebind the imported `eln` module.
            client = eln.ELNClient(self.url, self.api_key)
            # Status call doubles as a check of the URL / API key.
            client.get_status()
        except Exception as e:
            raise Exception(
                f"Unable to initialize client - is url {self.url} or api key correct?"
            ) from e

        return client, field_content.FieldContent

    def _get_doc(self, cli: Any, field_content: Any, d_id: Union[str, int]) -> Document:
        """Fetch one RSpace document and flatten it into a ``Document``.

        The page content is the document name as an ``<h2>`` heading followed
        by, for each field, the field name and the field's extracted text.

        Args:
            cli: RSpace client as returned by ``_create_rspace_client``.
            field_content: Callable wrapping a field's raw content; the result
                must provide ``get_text()``.
            d_id: Identifier passed through to ``cli.get_document``.

        Returns:
            A ``Document`` whose "source" metadata combines the RSpace
            document's name and global ID.
        """
        doc = cli.get_document(d_id)
        parts: List[str] = [f"<h2>{doc['name']}</h2>"]
        for field in doc["fields"]:
            parts.append(f"{field['name']}\n")
            parts.append(field_content(field["content"]).get_text())
            parts.append("\n")
        return Document(
            metadata={"source": f"rspace: {doc['name']}-{doc['globalId']}"},
            page_content="".join(parts),
        )

    def _load_structured_doc(self) -> Iterator[Document]:
        """Yield the single structured document identified by ``global_id``."""
        cli, field_content = self._create_rspace_client()
        yield self._get_doc(cli, field_content, self.global_id)

    def _load_folder_tree(self) -> Iterator[Document]:
        """Yield every document listed under the folder or notebook.

        The two-letter type prefix is stripped from ``global_id`` to obtain
        the folder id passed to ``list_folder_tree``.
        """
        cli, field_content = self._create_rspace_client()
        if self.global_id:
            docs_in_folder = cli.list_folder_tree(
                folder_id=self.global_id[2:], typesToInclude=["document"]
            )
            doc_ids: List[int] = [d["id"] for d in docs_in_folder["records"]]
            for doc_id in doc_ids:
                yield self._get_doc(cli, field_content, doc_id)

    def _load_pdf(self) -> Iterator[Document]:
        """Download a Gallery PDF and yield the documents PyPDFLoader produces.

        Nothing is yielded when the file name does not end in ".pdf"
        (case-insensitive). Each yielded document gets an "rspace_src"
        metadata entry holding the global ID.

        NOTE(review): the download target is the relative path
        ``<global_id>.pdf`` and it is not removed afterwards - confirm whether
        callers rely on the file remaining in place.
        """
        # The field-content helper is not needed for PDFs.
        cli, _ = self._create_rspace_client()
        file_info = cli.get_file_info(self.global_id)
        _, ext = os.path.splitext(file_info["name"])
        if ext.lower() == ".pdf":
            outfile = f"{self.global_id}.pdf"
            cli.download_file(self.global_id, outfile)
            pdf_loader = PyPDFLoader(outfile)
            for pdf in pdf_loader.lazy_load():
                pdf.metadata["rspace_src"] = self.global_id
                yield pdf

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load documents according to the global ID's type prefix.

        'GL' loads a Gallery PDF, 'SD' a single structured document, and
        'FL' / 'NB' every document in a folder or notebook.

        Raises:
            ValueError: If the global ID has none of the known prefixes. As
                this is a generator, the error surfaces on first iteration.
        """
        # Match on the prefix only, so an ID that merely contains "GL" or
        # "SD" further along is not misrouted.
        gid = self.global_id or ""
        if gid.startswith("GL"):
            yield from self._load_pdf()
        elif gid.startswith("SD"):
            yield from self._load_structured_doc()
        elif gid.startswith(("FL", "NB")):
            yield from self._load_folder_tree()
        else:
            raise ValueError("Unknown global ID type")