Source code for langchain_community.document_loaders.sharepoint
"""Loader that loads data from Sharepoint Document Library"""from__future__importannotationsimportjsonfrompathlibimportPathfromtypingimportAny,Dict,Iterator,List,Optionalimportrequests# type: ignorefromlangchain_core.document_loadersimportBaseLoaderfromlangchain_core.documentsimportDocumentfrompydanticimportFieldfromlangchain_community.document_loaders.base_o365import(O365BaseLoader,)
class SharePointLoader(O365BaseLoader, BaseLoader):
    """Load from `SharePoint`."""

    document_library_id: str = Field(...)
    """ The ID of the SharePoint document library to load data from."""
    folder_path: Optional[str] = None
    """ The path to the folder to load data from."""
    object_ids: Optional[List[str]] = None
    """ The IDs of the objects to load data from."""
    folder_id: Optional[str] = None
    """ The ID of the folder to load data from."""
    load_auth: Optional[bool] = False
    """ Whether to load authorization identities."""
    token_path: Path = Path.home() / ".credentials" / "o365_token.txt"
    """ The path to the token to make api calls"""
    load_extended_metadata: Optional[bool] = False
    """ Whether to load extended metadata. Size, Owner and full_path."""

    @property
    def _scopes(self) -> List[str]:
        """Return required scopes.

        Returns:
            List[str]: A list of required scopes.
        """
        return ["sharepoint", "basic"]

    def lazy_load(self) -> Iterator[Document]:
        """Load documents lazily. Use this when working at a large scale.

        Every configured source is processed, in this order: ``folder_path``,
        ``folder_id``, ``object_ids``. When none of them is set, the root
        folder of the document library is loaded instead.

        Yields:
            Document: A document object representing the parsed blob.

        Raises:
            ImportError: If the ``O365`` package is not installed.
            ValueError: If the drive, a requested folder, or the root folder
                cannot be resolved to the expected O365 type.
        """
        try:
            from O365.drive import Drive, Folder
        except ImportError as err:
            raise ImportError(
                "O365 package not found, please install it with `pip install o365`"
            ) from err

        drive = self._auth().storage().get_drive(self.document_library_id)
        if not isinstance(drive, Drive):
            raise ValueError(f"There isn't a Drive with id {self.document_library_id}.")

        if self.folder_path:
            target_folder = drive.get_item_by_path(self.folder_path)
            if not isinstance(target_folder, Folder):
                raise ValueError(f"There isn't a folder with path {self.folder_path}.")
            yield from self._parse_blobs(
                self._load_from_folder(target_folder),
                source_folder=target_folder,
            )

        if self.folder_id:
            target_folder = drive.get_item(self.folder_id)
            if not isinstance(target_folder, Folder):
                # Report the id that was actually looked up; folder_path is
                # normally unset in this branch.
                raise ValueError(f"There isn't a folder with id {self.folder_id}.")
            yield from self._parse_blobs(
                self._load_from_folder(target_folder),
                source_folder=target_folder,
            )

        if self.object_ids:
            # Objects fetched by id have no containing folder here, so no
            # "source_full_url" is attached.
            yield from self._parse_blobs(
                self._load_from_object_ids(drive, self.object_ids)
            )

        if not (self.folder_path or self.folder_id or self.object_ids):
            target_folder = drive.get_root_folder()
            if not isinstance(target_folder, Folder):
                raise ValueError("Unable to fetch root folder")
            # Only the root-folder path copies the blob's own metadata onto
            # each parsed document; kept as-is for backward compatibility.
            yield from self._parse_blobs(
                self._load_from_folder(target_folder),
                source_folder=target_folder,
                merge_blob_metadata=True,
            )

    def _parse_blobs(
        self,
        blobs: Any,
        source_folder: Any = None,
        merge_blob_metadata: bool = False,
    ) -> Iterator[Document]:
        """Parse blobs into documents, attaching the optional metadata.

        Args:
            blobs: Iterable of blobs, as produced by ``_load_from_folder`` or
                ``_load_from_object_ids``.
            source_folder: Folder the blobs were listed from. When given and
                ``load_extended_metadata`` is enabled, its ``web_url`` is
                stored under ``"source_full_url"``.
            merge_blob_metadata: When True, the blob's own metadata is copied
                onto each parsed document before the other metadata is added.

        Yields:
            Document: Each document produced by the blob parser.
        """
        for blob in blobs:
            file_id = str(blob.metadata.get("id"))

            # The per-file Graph lookups are done once per blob, not once per
            # parsed document.
            auth_identities: Optional[List] = None
            if self.load_auth is True:
                auth_identities = self.authorized_identities(file_id)

            extended_metadata: Optional[Dict] = None
            if self.load_extended_metadata is True:
                extended_metadata = self.get_extended_metadata(file_id)
                if source_folder is not None:
                    extended_metadata["source_full_url"] = source_folder.web_url

            for document in self._blob_parser.lazy_parse(blob):
                if merge_blob_metadata:
                    document.metadata.update(blob.metadata)
                if auth_identities is not None:
                    document.metadata["authorized_identities"] = auth_identities
                if extended_metadata is not None:
                    document.metadata.update(extended_metadata)
                yield document

    def authorized_identities(self, file_id: str) -> List:
        """Retrieve the access identities (user/group emails) for a given file.

        Args:
            file_id (str): The ID of the file.

        Returns:
            List: A list of group names (email addresses) that have access to
                the file. Empty when the response carries no ``"value"`` list.
        """
        url = (
            "https://graph.microsoft.com/v1.0/drives"
            f"/{self.document_library_id}/items/{file_id}/permissions"
        )
        access_list = self._graph_get(url)

        group_names = []
        # "value" is absent when the response is not a permission listing;
        # fall back to an empty list instead of iterating over None.
        for access_data in access_list.get("value") or []:
            granted_to = access_data.get("grantedToV2")
            if not granted_to:
                continue
            site_data = (
                granted_to.get("siteUser")
                or granted_to.get("user")
                or granted_to.get("group")
            )
            if not site_data:
                continue
            email = site_data.get("email")
            if email:
                group_names.append(email)
        return group_names

    def _fetch_access_token(self) -> Any:
        """Fetch the access token from the token file.

        Returns:
            The parsed JSON content of ``token_path``.
        """
        with open(self.token_path, encoding="utf-8") as f:
            return json.load(f)

    def _graph_get(self, url: str) -> Any:
        """Send an authenticated GET request and return the decoded JSON body.

        Args:
            url (str): The full URL to request.

        Returns:
            The JSON-decoded response body.
        """
        access_token = self._fetch_access_token().get("access_token")
        headers = {"Authorization": f"Bearer {access_token}"}
        # Without a timeout requests waits indefinitely on a stalled
        # connection, which would block the lazy_load generator forever.
        response = requests.request("GET", url, headers=headers, timeout=30)
        return response.json()

    def get_extended_metadata(self, file_id: str) -> Dict:
        """Retrieve extended metadata for a file in SharePoint.

        As of today, following fields are supported in the extended metadata:
        - size: size of the source file.
        - owner: display name of the owner of the source file.
        - full_path: pretty human readable path of the source file.

        Fields missing from the response fall back to ``0`` for the size and
        to empty strings for the owner and path components.

        Args:
            file_id (str): The ID of the file.

        Returns:
            dict: A dictionary containing the extended metadata of the file,
                including size, owner, and full path.
        """
        url = (
            "https://graph.microsoft.com/v1.0/drives/"
            f"{self.document_library_id}/items/{file_id}"
            "?$select=size,createdBy,parentReference,name"
        )
        metadata = self._graph_get(url)

        # Keep only the part of the parent path after the last ":".
        parent_path = metadata.get("parentReference", {}).get("path", "")
        owner = metadata.get("createdBy", {}).get("user", {}).get("displayName", "")
        staged_metadata = {
            "size": metadata.get("size", 0),
            "owner": owner,
            "full_path": parent_path.split(":")[-1] + "/" + metadata.get("name", ""),
        }
        return staged_metadata