# Source code for langchain_community.document_loaders.evernote
"""Load documents from Evernote.https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c"""importhashlibimportloggingfrombase64importb64decodefrompathlibimportPathfromtimeimportstrptimefromtypingimportAny,Dict,Iterator,List,Optional,Unionfromlangchain_core.documentsimportDocumentfromlangchain_community.document_loaders.baseimportBaseLoaderlogger=logging.getLogger(__name__)
class EverNoteLoader(BaseLoader):
    """Load from `EverNote`.

    Loads an EverNote notebook export file e.g. my_notebook.enex into Documents.
    Instructions on producing this file can be found at
    https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML

    Currently only the plain text in the note is extracted and stored as the
    contents of the Document, any non content metadata (e.g. 'author', 'created',
    'updated' etc. but not 'content-raw' or 'resource') tags on the note will be
    extracted and stored as metadata on the Document.

    Args:
        file_path (str): The path to the notebook export with a .enex extension
        load_single_document (bool): Whether or not to concatenate the content of
            all notes into a single long Document. If this is set to True (default)
            then the only metadata on the document will be the 'source' which
            contains the file name of the export.
    """

    def __init__(self, file_path: Union[str, Path], load_single_document: bool = True):
        """Initialize with file path.

        Args:
            file_path: Path to the ``.enex`` export; stored as a string.
            load_single_document: When True, ``lazy_load`` yields a single
                Document holding the concatenated text of every note.
        """
        self.file_path = str(file_path)
        self.load_single_document = load_single_document

    def _lazy_load(self) -> Iterator[Document]:
        """Yield one Document per note that has text content.

        The parsed plain text becomes ``page_content``. Every other parsed
        field except 'content', 'content-raw' and 'resource' is copied into
        the metadata, together with a 'source' entry holding the export path.
        Notes without content are skipped.
        """
        excluded_keys = {"content", "content-raw", "resource"}
        for note in self._parse_note_xml(self.file_path):
            text = note.get("content")
            if text is None:
                continue
            metadata = {
                key: value for key, value in note.items() if key not in excluded_keys
            }
            metadata["source"] = self.file_path
            yield Document(page_content=text, metadata=metadata)

    def lazy_load(self) -> Iterator[Document]:
        """Load documents from EverNote export file.

        Yields:
            One Document per note when ``load_single_document`` is False;
            otherwise a single Document whose content is the concatenation of
            all note texts and whose only metadata is 'source'.
        """
        if not self.load_single_document:
            yield from self._lazy_load()
        else:
            yield Document(
                page_content="".join(
                    document.page_content for document in self._lazy_load()
                ),
                metadata={"source": self.file_path},
            )

    @staticmethod
    def _parse_content(content: str) -> str:
        """Convert a note's markup to stripped plain text via ``html2text``.

        Raises:
            ImportError: If the optional ``html2text`` package is missing.
        """
        # Keep the try body down to the import it is guarding.
        try:
            import html2text
        except ImportError as e:
            raise ImportError(
                "Could not import `html2text`. Although it is not a required package "
                "to use Langchain, using the EverNote loader requires `html2text`. "
                "Please install `html2text` via `pip install html2text` and try again."
            ) from e
        return html2text.html2text(content).strip()

    @staticmethod
    def _parse_resource(resource: list) -> dict:
        """Turn the children of a ``resource`` element into a dict.

        The 'data' child is base64-decoded to bytes and an extra 'hash' entry
        holds the MD5 hex digest of those bytes. Every other child is stored
        as ``tag -> text``.
        """
        rsc_dict: Dict[str, Any] = {}
        for elem in resource:
            if elem.tag == "data":
                # Sometimes elem.text is None
                rsc_dict[elem.tag] = b64decode(elem.text) if elem.text else b""
                rsc_dict["hash"] = hashlib.md5(rsc_dict[elem.tag]).hexdigest()
            else:
                rsc_dict[elem.tag] = elem.text
        return rsc_dict

    @staticmethod
    def _parse_note(note: List, prefix: Optional[str] = None) -> dict:
        """Turn the children of a note element into a flat dict.

        Args:
            note: Iterable of child elements exposing ``tag`` and ``text``.
            prefix: When given, every returned key becomes ``"{prefix}.{tag}"``.
                Used for the recursive call on ``note-attributes``.

        Returns:
            Mapping of tag name to value. 'content' holds the plain text (with
            the untouched markup under 'content-raw'), 'created'/'updated' hold
            ``time.struct_time`` values, and 'resource' holds a list of parsed
            resources. Empty content or timestamp elements are stored as None.
        """
        note_dict: Dict[str, Any] = {}
        resources = []

        def add_prefix(element_tag: str) -> str:
            if prefix is None:
                return element_tag
            return f"{prefix}.{element_tag}"

        for elem in note:
            if elem.tag == "content":
                # elem.text can be None (empty element); html2text cannot take None.
                note_dict[elem.tag] = (
                    EverNoteLoader._parse_content(elem.text)
                    if elem.text is not None
                    else None
                )
                # A copy of original content
                note_dict["content-raw"] = elem.text
            elif elem.tag == "resource":
                resources.append(EverNoteLoader._parse_resource(elem))
            elif elem.tag == "created" or elem.tag == "updated":
                # Guard against an empty timestamp element instead of crashing.
                note_dict[elem.tag] = (
                    strptime(elem.text, "%Y%m%dT%H%M%SZ") if elem.text else None
                )
            elif elem.tag == "note-attributes":
                # Recursively enter the note-attributes tag
                additional_attributes = EverNoteLoader._parse_note(elem, elem.tag)
                note_dict.update(additional_attributes)
            else:
                note_dict[elem.tag] = elem.text

        if len(resources) > 0:
            note_dict["resource"] = resources

        return {add_prefix(key): value for key, value in note_dict.items()}

    @staticmethod
    def _parse_note_xml(xml_file: str) -> Iterator[Dict[str, Any]]:
        """Parse Evernote xml.

        Streams ``xml_file`` with ``lxml`` and yields one parsed dict per
        ``note`` element.

        Raises:
            ImportError: If the optional ``lxml`` package is missing (the
                error is logged, then re-raised).
        """
        # Without huge_tree set to True, parser may complain about huge text node
        # Try to recover, because there may be "&nbsp;", which will cause
        # "XMLSyntaxError: Entity 'nbsp' not defined"
        try:
            from lxml import etree
        except ImportError:
            logger.error(
                "Could not import `lxml`. Although it is not a required package to use "
                "Langchain, using the EverNote loader requires `lxml`. Please install "
                "`lxml` via `pip install lxml` and try again."
            )
            raise

        context = etree.iterparse(
            xml_file, encoding="utf-8", strip_cdata=False, huge_tree=True, recover=True
        )

        for _action, elem in context:
            if elem.tag == "note":
                yield EverNoteLoader._parse_note(elem)
                # The yielded dict holds plain values only, so the element can
                # be released to keep memory bounded on large exports.
                elem.clear()