class BibtexLoader(BaseLoader):
    """Load a `bibtex` file.

    Each document represents one entry from the bibtex file.

    Only entries whose `file` field contains at least one name matching
    ``file_pattern`` produce a document; other entries are skipped. For a
    matching entry, the text of the referenced PDF file(s) is used as the
    document text. If no text could be read from them (for example because
    the files are missing on disk), the entry's `abstract` field is used
    instead.
    """

    def __init__(
        self,
        file_path: str,
        *,
        parser: Optional[BibtexparserWrapper] = None,
        max_docs: Optional[int] = None,
        max_content_chars: Optional[int] = 4_000,
        load_extra_metadata: bool = False,
        file_pattern: str = r"[^:]+\.pdf",
    ) -> None:
        """Initialize the BibtexLoader.

        Args:
            file_path: Path to the bibtex file.
            parser: The parser to use. If None, a default
                ``BibtexparserWrapper`` is created.
            max_docs: Maximum number of bibtex entries to process. ``None``,
                ``0`` or a negative value (e.g. ``-1``) means no limit. The
                limit is applied to the entries before entries without a
                matching file are skipped, so fewer documents may be yielded.
            max_content_chars: Maximum number of characters kept in each
                document's text. ``None`` or ``0`` disables truncation.
            load_extra_metadata: Forwarded to the parser's ``get_metadata``
                as ``load_extra``.
            file_pattern: Regex pattern to match the file name(s) in the
                bibtex `file` field.
        """
        self.file_path = file_path
        self.parser = parser or BibtexparserWrapper()
        self.max_docs = max_docs
        self.max_content_chars = max_content_chars
        self.load_extra_metadata = load_extra_metadata
        self.file_regex = re.compile(file_pattern)

    def _load_entry(self, entry: Mapping[str, Any]) -> Optional[Document]:
        """Build a document from a single bibtex entry.

        Args:
            entry: One parsed bibtex entry, read as a mapping of field names
                to values (the `file` and `abstract` fields are used here).

        Returns:
            A ``Document`` holding the extracted text and the parser's
            metadata, or ``None`` when the entry's `file` field contains
            nothing matching the configured file pattern.
        """
        import fitz

        # File names in the bibtex are resolved relative to the bibtex file.
        parent_dir = Path(self.file_path).parent
        # regex is useful for Zotero flavor bibtex files
        file_names = self.file_regex.findall(entry.get("file", ""))
        if not file_names:
            return None

        # NOTE(review): PyMuPDF appears to signal a missing file with its own
        # `fitz.FileNotFoundError` (a RuntimeError subclass) rather than the
        # builtin one -- confirm against the installed version. Catch both so
        # that a missing PDF falls back to the abstract instead of aborting.
        missing_file_errors = (
            FileNotFoundError,
            getattr(fitz, "FileNotFoundError", FileNotFoundError),
        )

        texts: List[str] = []
        for file_name in file_names:
            try:
                with fitz.open(parent_dir / file_name) as pdf:
                    texts.extend(page.get_text() for page in pdf)
            except missing_file_errors as err:
                logger.debug(err)

        # An empty join (no readable text) falls back to the abstract.
        content = "\n".join(texts) or entry.get("abstract", "")
        if self.max_content_chars:
            content = content[: self.max_content_chars]

        metadata = self.parser.get_metadata(
            entry, load_extra=self.load_extra_metadata
        )
        return Document(
            page_content=content,
            metadata=metadata,
        )

    def lazy_load(self) -> Iterator[Document]:
        """Load bibtex file using bibtexparser and get the article texts plus
        the article metadata.

        See https://bibtexparser.readthedocs.io/en/master/

        Yields:
            One document per processed bibtex entry, with
            ``document.page_content`` in text format. Entries for which no
            document could be built are skipped.

        Raises:
            ImportError: If the PyMuPDF (``fitz``) package is not installed.
        """
        try:
            import fitz  # noqa: F401
        except ImportError as err:
            raise ImportError(
                "PyMuPDF package not found, please install it with "
                "`pip install pymupdf`"
            ) from err

        entries = self.parser.load_bibtex_entries(self.file_path)
        # Only a positive limit truncates: a negative value such as -1 is
        # documented as "no limit" and must not be used as a slice bound
        # (`entries[:-1]` would silently drop the last entry).
        if self.max_docs is not None and self.max_docs > 0:
            entries = entries[: self.max_docs]

        for entry in entries:
            doc = self._load_entry(entry)
            if doc:
                yield doc