class ObsidianLoader(BaseLoader):
    """Load `Obsidian` files from directory.

    The helpers defined here extract metadata from a note's text: YAML front
    matter, ``#tags`` and dataview-style ``key:: value`` fields. All of them
    become no-ops when ``collect_metadata`` is False.
    """

    # A front matter block: "---" on the very first line, the metadata body,
    # then a closing "---" line. There is no MULTILINE flag, so "^" anchors at
    # the start of the content only.
    FRONT_MATTER_REGEX: Pattern = re.compile(r"^---\n(.*?)\n---\n", re.DOTALL)
    # A "{{ ... }}" template variable; the capture group is the inner text.
    TEMPLATE_VARIABLE_REGEX: Pattern = re.compile(r"{{(.*?)}}", re.DOTALL)
    # A "#tag" preceded by a whitespace character. The tag must start with a
    # letter or underscore and may continue with word chars, "-", "_" or "/".
    TAG_REGEX: Pattern = re.compile(r"[^\S\/]#([a-zA-Z_]+[-_/\w]*)")
    # A whole-line dataview field: "key:: value".
    DATAVIEW_LINE_REGEX: Pattern = re.compile(r"^\s*(\w+)::\s*(.*)$", re.MULTILINE)
    # Inline dataview fields: "[key:: value]" and "(key:: value)".
    # NOTE(review): the value group ".*" is greedy, so two inline fields on the
    # same line are captured as one match ending at the last closing bracket
    # on that line - confirm this is intended.
    DATAVIEW_INLINE_BRACKET_REGEX: Pattern = re.compile(
        r"\[(\w+)::\s*(.*)\]", re.MULTILINE
    )
    DATAVIEW_INLINE_PAREN_REGEX: Pattern = re.compile(
        r"\((\w+)::\s*(.*)\)", re.MULTILINE
    )

    def __init__(
        self,
        path: Union[str, Path],
        encoding: str = "UTF-8",
        collect_metadata: bool = True,
    ) -> None:
        """Initialize with a path.

        Args:
            path: Path to the directory containing the Obsidian files.
            encoding: Charset encoding, defaults to "UTF-8"
            collect_metadata: Whether to collect metadata from the front matter.
                Defaults to True.
        """
        self.file_path = path
        self.encoding = encoding
        self.collect_metadata = collect_metadata

    def _replace_template_var(
        self, placeholders: Dict[str, str], match: re.Match
    ) -> str:
        """Replace a template variable with a placeholder.

        Records the variable's inner text in ``placeholders`` (mutated in
        place) under a freshly generated key and returns that key, which makes
        this usable as a ``re.sub`` replacement callback.
        """
        # Numbering by the current dict size gives every match a unique key.
        placeholder = f"__TEMPLATE_VAR_{len(placeholders)}__"
        placeholders[placeholder] = match.group(1)
        return placeholder

    def _restore_template_vars(self, obj: Any, placeholders: Dict[str, str]) -> Any:
        """Restore template variables replaced with placeholders to original values.

        Strings get every placeholder swapped back to ``{{value}}``. Dicts and
        lists are walked recursively and updated in place. Any other object is
        returned untouched.
        """
        if isinstance(obj, str):
            for placeholder, value in placeholders.items():
                # The quintuple braces render as a literal "{{" + value + "}}".
                obj = obj.replace(placeholder, f"{{{{{value}}}}}")
        elif isinstance(obj, dict):
            # Only values are restored; keys are left as parsed.
            for key, value in obj.items():
                obj[key] = self._restore_template_vars(value, placeholders)
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                obj[i] = self._restore_template_vars(item, placeholders)
        return obj

    def _parse_front_matter(self, content: str) -> dict:
        """Parse front matter metadata from the content and return it as a dict.

        Returns an empty dict when metadata collection is disabled, when the
        content has no front matter block, when the block is not valid YAML,
        or when it does not parse to a mapping (for example, an empty block).
        A string-valued ``tags`` entry is split on ``", "`` into a list.
        """
        if not self.collect_metadata:
            return {}

        match = self.FRONT_MATTER_REGEX.search(content)
        if not match:
            return {}

        # Swap "{{ ... }}" template variables for plain placeholder tokens
        # before parsing, so the YAML parser never sees the double braces;
        # they are restored in the parsed result below.
        placeholders: Dict[str, str] = {}
        replace_template_var = functools.partial(
            self._replace_template_var, placeholders
        )
        front_matter_text = self.TEMPLATE_VARIABLE_REGEX.sub(
            replace_template_var, match.group(1)
        )

        try:
            front_matter = yaml.safe_load(front_matter_text)
        except yaml.YAMLError:
            # YAMLError is the base of ParserError, ScannerError, etc., so any
            # malformed front matter is tolerated rather than only parser
            # errors.
            logger.warning("Encountered non-yaml frontmatter")
            return {}

        # safe_load yields None for an empty block and a scalar/list for
        # non-mapping YAML; neither is usable as metadata.
        if not isinstance(front_matter, dict):
            return {}

        front_matter = self._restore_template_vars(front_matter, placeholders)

        # If tags are a string, split them into a list
        tags = front_matter.get("tags")
        if isinstance(tags, str):
            front_matter["tags"] = tags.split(", ")

        return front_matter

    def _to_langchain_compatible_metadata(self, metadata: dict) -> dict:
        """Convert a metadata dictionary into a form compatible with langchain.

        Values whose type is exactly ``str``, ``int`` or ``float`` are kept;
        every other value is replaced by its ``str()`` representation.
        """
        # Exact type comparison (not isinstance) is deliberate here: bool is a
        # subclass of int, and this check stringifies it like any other type.
        return {
            key: value if type(value) in {str, int, float} else str(value)
            for key, value in metadata.items()
        }

    def _parse_document_tags(self, content: str) -> set:
        """Return a set of all tags within the document.

        Returns an empty set when metadata collection is disabled or no tag
        is found.
        """
        if not self.collect_metadata:
            return set()
        # findall returns the single capture group, i.e. the tag without "#".
        return set(self.TAG_REGEX.findall(content))

    def _parse_dataview_fields(self, content: str) -> dict:
        """Parse obsidian dataview plugin fields from the content and return it
        as a dict.

        Three syntaxes are collected: whole-line ``key:: value``, inline
        ``(key:: value)`` and inline ``[key:: value]``. When the same key
        appears more than once, later patterns in that order (and later
        matches within a pattern) win. Returns an empty dict when metadata
        collection is disabled.
        """
        if not self.collect_metadata:
            return {}

        fields: dict = {}
        for pattern in (
            self.DATAVIEW_LINE_REGEX,
            self.DATAVIEW_INLINE_PAREN_REGEX,
            self.DATAVIEW_INLINE_BRACKET_REGEX,
        ):
            # Each match is a (key, value) tuple, which dict.update accepts.
            fields.update(pattern.findall(content))
        return fields

    def _remove_front_matter(self, content: str) -> str:
        """Remove front matter metadata from the given content.

        The content is returned unchanged when metadata collection is
        disabled.
        """
        if not self.collect_metadata:
            return content
        return self.FRONT_MATTER_REGEX.sub("", content)