Source code for langchain_community.document_loaders.onenote
"""Loads data from OneNote Notebooks"""frompathlibimportPathfromtypingimportDict,Iterator,List,Optionalimportrequestsfromlangchain_core.documentsimportDocumentfromlangchain_core.pydantic_v1import(BaseModel,BaseSettings,Field,FilePath,SecretStr,)fromlangchain_community.document_loaders.baseimportBaseLoaderclass_OneNoteGraphSettings(BaseSettings):client_id:str=Field(...,env="MS_GRAPH_CLIENT_ID")client_secret:SecretStr=Field(...,env="MS_GRAPH_CLIENT_SECRET")classConfig:case_sentive=Falseenv_file=".env"env_prefix=""
class OneNoteLoader(BaseLoader, BaseModel):
    """Load pages from OneNote notebooks."""

    settings: _OneNoteGraphSettings = Field(default_factory=_OneNoteGraphSettings)  # type: ignore[arg-type]
    """Settings for the Microsoft Graph API client."""
    auth_with_token: bool = False
    """Whether to authenticate with a token or not. Defaults to False."""
    access_token: str = ""
    """Personal access token"""
    onenote_api_base_url: str = "https://graph.microsoft.com/v1.0/me/onenote"
    """URL of Microsoft Graph API for OneNote"""
    authority_url: str = "https://login.microsoftonline.com/consumers/"
    """A URL that identifies a token authority"""
    token_path: FilePath = Path.home() / ".credentials" / "onenote_graph_token.txt"
    """Path to the file where the access token is stored"""
    notebook_name: Optional[str] = None
    """Filter on notebook name"""
    section_name: Optional[str] = None
    """Filter on section name"""
    page_title: Optional[str] = None
    """Filter on page title"""
    object_ids: Optional[List[str]] = None
    """ The IDs of the objects to load data from."""

    def lazy_load(self) -> Iterator[Document]:
        """Get pages from OneNote notebooks.

        Authenticates, then fetches every selected page and parses its HTML.
        Pages are selected by ``object_ids`` when it is set, otherwise by the
        listing query built from the notebook/section/title filters.

        Yields:
            One Document per page, with attributes:
                - page_content: the page text, one line per text node
                - metadata
                    - title: text of the page's ``<title>`` tag, or ``""``
                      when the page has none

        Raises:
            ImportError: if ``beautifulsoup4`` is not installed.
        """
        # Check the optional dependency first so a missing package is reported
        # before the (possibly interactive) authentication step runs.
        try:
            from bs4 import BeautifulSoup
        except ImportError as e:
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install bs4`"
            ) from e

        self._auth()

        # Both page sources share one parsing path, so the title and the
        # content are always extracted for every page.
        for page_id in self._iter_page_ids():
            page_content_html = self._get_page_content(page_id)
            soup = BeautifulSoup(page_content_html, "html.parser")
            title_tag = soup.title
            page_title = title_tag.get_text(strip=True) if title_tag else ""
            page_content = soup.get_text(separator="\n", strip=True)
            yield Document(page_content=page_content, metadata={"title": page_title})

    def _iter_page_ids(self) -> Iterator[str]:
        """Yield the IDs of the pages to load.

        When ``object_ids`` is set, exactly those IDs are yielded. Otherwise
        the listing at ``self._url`` is requested and ``@odata.nextLink`` is
        followed until a response carries no further link.
        """
        if self.object_ids is not None:
            yield from self.object_ids
            return

        request_url = self._url
        while request_url:
            response = requests.get(request_url, headers=self._headers, timeout=10)
            response.raise_for_status()
            listing = response.json()
            for page in listing["value"]:
                yield page["id"]
            # An absent next link ends the pagination loop.
            request_url = listing.get("@odata.nextLink", "")

    def _get_page_content(self, page_id: str) -> str:
        """Get page content from OneNote API"""
        request_url = self.onenote_api_base_url + f"/pages/{page_id}/content"
        response = requests.get(request_url, headers=self._headers, timeout=10)
        response.raise_for_status()
        return response.text

    @property
    def _headers(self) -> Dict[str, str]:
        """Return headers for requests to OneNote API"""
        return {
            "Authorization": f"Bearer {self.access_token}",
        }

    @property
    def _scopes(self) -> List[str]:
        """Return required scopes."""
        return ["Notes.Read"]

    def _auth(self) -> None:
        """Authenticate with Microsoft Graph API.

        Does nothing when ``access_token`` is already set. With
        ``auth_with_token`` the token is read from ``token_path``. Otherwise
        an interactive authorization-code flow is run through MSAL and the
        resulting token is written to ``token_path``.

        Raises:
            ImportError: if ``msal`` is needed but not installed.
            Exception: if the folder for ``token_path`` cannot be created.
        """
        if self.access_token != "":
            return

        if self.auth_with_token:
            with self.token_path.open("r") as token_file:
                # Strip surrounding whitespace: a trailing newline in the
                # file would otherwise end up inside the Authorization header.
                self.access_token = token_file.read().strip()
            return

        try:
            from msal import ConfidentialClientApplication
        except ImportError as e:
            raise ImportError(
                "MSAL package not found, please install it with `pip install msal`"
            ) from e

        client_instance = ConfidentialClientApplication(
            client_id=self.settings.client_id,
            client_credential=self.settings.client_secret.get_secret_value(),
            authority=self.authority_url,
        )

        authorization_request_url = client_instance.get_authorization_request_url(
            self._scopes
        )
        print("Visit the following url to give consent:")  # noqa: T201
        print(authorization_request_url)  # noqa: T201
        authorization_url = input("Paste the authenticated url here:\n")

        # The authorization code is the value of the "code" query parameter
        # of the URL pasted by the user.
        authorization_code = authorization_url.split("code=")[1].split("&")[0]
        access_token_json = client_instance.acquire_token_by_authorization_code(
            code=authorization_code, scopes=self._scopes
        )
        self.access_token = access_token_json["access_token"]

        try:
            # exist_ok avoids the race between checking for the folder and
            # creating it.
            self.token_path.parent.mkdir(parents=True, exist_ok=True)
        except OSError as e:
            raise Exception(
                f"Could not create the folder {self.token_path.parent} "
                + "to store the access token."
            ) from e

        with self.token_path.open("w") as token_file:
            token_file.write(self.access_token)

    @staticmethod
    def _odata_string_literal(value: str) -> str:
        """Return ``value`` as a URL-encoded, single-quoted OData string.

        Embedded single quotes are doubled (the OData escape) and the result
        is percent-encoded, so characters such as ``&``, ``#`` or ``%`` cannot
        break the query string. Spaces become ``%20``.
        """
        from urllib.parse import quote

        escaped = value.replace("'", "''")
        return f"'{quote(escaped, safe='')}'"

    @property
    def _url(self) -> str:
        """Create URL for getting page ids from the OneNote API.

        Only the page ``id`` is selected. Each of ``notebook_name``,
        ``section_name`` and ``page_title`` that is set adds an equality
        filter; the filters are combined with ``and``.
        """
        filter_list = []
        expand_list = []

        if self.notebook_name is not None:
            filter_list.append(
                "parentNotebook/displayName%20eq%20"
                + self._odata_string_literal(self.notebook_name)
            )
            expand_list.append("parentNotebook")
        if self.section_name is not None:
            filter_list.append(
                "parentSection/displayName%20eq%20"
                + self._odata_string_literal(self.section_name)
            )
            expand_list.append("parentSection")
        if self.page_title is not None:
            filter_list.append(
                "title%20eq%20" + self._odata_string_literal(self.page_title)
            )

        # "$select=id" is always present, so the query string is never empty.
        query_params_list = ["$select=id"]
        if expand_list:
            query_params_list.append("$expand=" + ",".join(expand_list))
        if filter_list:
            query_params_list.append("$filter=" + "%20and%20".join(filter_list))

        query_params = "&".join(query_params_list)
        return f"{self.onenote_api_base_url}/pages?{query_params}"