Source code for langchain_community.document_loaders.parsers.pdf
"""Module contains common parsers for PDFs."""from__future__importannotationsimporthtmlimportioimportloggingimportthreadingimportwarningsfromdatetimeimportdatetimefrompathlibimportPathfromtempfileimportTemporaryDirectoryfromtypingimport(TYPE_CHECKING,Any,BinaryIO,Iterable,Iterator,Literal,Mapping,Optional,Sequence,Union,cast,)fromurllib.parseimporturlparseimportnumpyimportnumpyasnpfromlangchain_core.documentsimportDocumentfromlangchain_community.document_loaders.baseimportBaseBlobParserfromlangchain_community.document_loaders.blob_loadersimportBlobfromlangchain_community.document_loaders.parsers.imagesimport(BaseImageBlobParser,RapidOCRBlobParser,)ifTYPE_CHECKING:importpdfplumberimportpymupdfimportpypdfimportpypdfium2fromtextractor.data.text_linearization_configimportTextLinearizationConfig_PDF_FILTER_WITH_LOSS=["DCTDecode","DCT","JPXDecode"]_PDF_FILTER_WITHOUT_LOSS=["LZWDecode","LZW","FlateDecode","Fl","ASCII85Decode","A85","ASCIIHexDecode","AHx","RunLengthDecode","RL","CCITTFaxDecode","CCF","JBIG2Decode",]
[docs]
def extract_from_images_with_rapidocr(
    images: Sequence[Union[Iterable[np.ndarray], bytes]],
) -> str:
    """Extract text from images with RapidOCR.

    Args:
        images: Images to extract text from.

    Returns:
        Text extracted from images.

    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    """
    try:
        from rapidocr_onnxruntime import RapidOCR
    except ImportError:
        raise ImportError(
            "`rapidocr-onnxruntime` package not found, please install it with "
            "`pip install rapidocr-onnxruntime`"
        )
    ocr = RapidOCR()
    text = ""
    for img in images:
        result, _ = ocr(img)
        if result:
            result = [text[1] for text in result]
            text += "\n".join(result)
    return text
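# Illustrative usage sketch (not part of the upstream module): calling the
# helper above directly on raw image bytes. The function name
# `_example_rapidocr_usage` and the file path are hypothetical, and
# `rapidocr-onnxruntime` must be installed for the call to succeed.
def _example_rapidocr_usage(path_to_image: str = "scan.png") -> str:
    from pathlib import Path

    image_bytes = Path(path_to_image).read_bytes()
    # A sequence of raw bytes is accepted; ndarray frames work as well.
    return extract_from_images_with_rapidocr([image_bytes])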
logger = logging.getLogger(__name__)

_FORMAT_IMAGE_STR = "\n\n{image_text}\n\n"
_JOIN_IMAGES = "\n"
_JOIN_TABLES = "\n"
_DEFAULT_PAGES_DELIMITER = "\n\f"

_STD_METADATA_KEYS = {"source", "total_pages", "creationdate", "creator", "producer"}


def _format_inner_image(blob: Blob, content: str, format: str) -> str:
    """Format the content of the image with the source of the blob.

    blob: The blob containing the image.
    format: The format for the parsed output.
        - "text" = return the content as is
        - "markdown-img" = wrap the content into an image markdown link
          (`![body](#)`)
        - "html-img" = wrap the content as the `alt` text of an image tag
          (`<img alt="{body}" src="#"/>`)
    """
    if content:
        source = blob.source or "#"
        if format == "markdown-img":
            content = content.replace("]", r"\\]")
            content = f"![{content}]({source})"
        elif format == "html-img":
            content = (
                f'<img alt="{html.escape(content, quote=True)}" src="{source}" />'
            )
    return content


def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
    """Validate that the metadata has all the standard keys and that the page is
    an integer.

    The standard keys are:
    - source
    - total_pages
    - creationdate
    - creator
    - producer

    Validate that page is an integer if it is present.
    """
    if not _STD_METADATA_KEYS.issubset(metadata.keys()):
        raise ValueError("The PDF parser must populate the standard metadata.")
    if not isinstance(metadata.get("page", 0), int):
        raise ValueError("The PDF metadata page must be an integer.")
    return metadata


def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
    """Purge metadata of unwanted keys and normalize key names.

    Args:
        metadata: The original metadata dictionary.

    Returns:
        The cleaned metadata dictionary with normalized key names.
    """
    new_metadata: dict[str, Any] = {}
    map_key = {
        "page_count": "total_pages",
        "file_path": "source",
    }
    for k, v in metadata.items():
        if type(v) not in [str, int]:
            v = str(v)
        if k.startswith("/"):
            k = k[1:]
        k = k.lower()
        if k in ["creationdate", "moddate"]:
            try:
                new_metadata[k] = datetime.strptime(
                    v.replace("'", ""), "D:%Y%m%d%H%M%S%z"
                ).isoformat("T")
            except ValueError:
                new_metadata[k] = v
        elif k in map_key:
            # Normalize the key to match the other PDF parsers.
            new_metadata[map_key[k]] = v
            new_metadata[k] = v
        elif isinstance(v, str):
            new_metadata[k] = v.strip()
        elif isinstance(v, int):
            new_metadata[k] = v
    return new_metadata


_PARAGRAPH_DELIMITER = [
    "\n\n\n",
    "\n\n",
]  # To insert images or tables in the middle of the page.


def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
    """Insert extras such as images or tables between two paragraphs if possible,
    otherwise at the end of the text.

    Args:
        extras: List of extra content (images/tables) to insert.
        text_from_page: The text content from the page.

    Returns:
        The merged text with extras inserted.
    """

    def _recurs_merge_text_and_extras(
        extras: list[str], text_from_page: str, recurs: bool
    ) -> Optional[str]:
        if extras:
            for delim in _PARAGRAPH_DELIMITER:
                pos = text_from_page.rfind(delim)
                if pos != -1:
                    # Search the penultimate paragraph break, to bypass a footer.
                    previous_text = None
                    if recurs:
                        previous_text = _recurs_merge_text_and_extras(
                            extras, text_from_page[:pos], False
                        )
                    if previous_text:
                        all_text = previous_text + text_from_page[pos:]
                    else:
                        all_extras = ""
                        str_extras = "\n\n".join(filter(lambda x: x, extras))
                        if str_extras:
                            all_extras = delim + str_extras
                        all_text = (
                            text_from_page[:pos] + all_extras + text_from_page[pos:]
                        )
                    break
            else:
                all_text = None
        else:
            all_text = text_from_page
        return all_text

    all_text = _recurs_merge_text_and_extras(extras, text_from_page, True)
    if not all_text:
        all_extras = ""
        str_extras = "\n\n".join(filter(lambda x: x, extras))
        if str_extras:
            all_extras = _PARAGRAPH_DELIMITER[-1] + str_extras
        all_text = text_from_page + all_extras

    return all_text
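# Illustrative sketch (not part of the upstream module): the merge helper above
# inserts extras at the penultimate paragraph break it finds, so an image or
# table does not land after a trailing footer. `_example_merge_extras` is a
# hypothetical demonstration of that behavior.
def _example_merge_extras() -> None:
    page_text = "First paragraph.\n\nSecond paragraph.\n\nFooter"
    merged = _merge_text_and_extras(["[image: diagram of the pipeline]"], page_text)
    # The extra is inserted between the two paragraphs, not after "Footer".
    print(merged)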
[docs]
class PyPDFParser(BaseBlobParser):
    """Parse a blob from a PDF using the `pypdf` library.

    This class provides methods to parse a blob from a PDF document, supporting
    various configurations such as handling password-protected PDFs and extracting
    images. It integrates the `pypdf` library for PDF processing and offers
    synchronous blob parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pypdf

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PyPDFParser

            parser = PyPDFParser(
                # password = None,
                mode = "single",
                pages_delimiter = "\n\f",
                # images_parser = TesseractBlobParser(),
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    """
[docs]
    def __init__(
        self,
        password: Optional[Union[str, bytes]] = None,
        extract_images: bool = False,
        *,
        mode: Literal["single", "page"] = "page",
        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
        images_parser: Optional[BaseImageBlobParser] = None,
        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
        extraction_mode: Literal["plain", "layout"] = "plain",
        extraction_kwargs: Optional[dict[str, Any]] = None,
    ):
        """Initialize a parser based on PyPDF.

        Args:
            password: Optional password for opening encrypted PDFs.
            extract_images: Whether to extract images from the PDF.
            mode: The extraction mode, either "single" for the entire document
                or "page" for page-wise extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            images_parser: Optional image blob parser.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link
                  (`![body](#)`)
                - "html-img" = wrap the content as the `alt` text of an image tag
                  (`<img alt="{body}" src="#"/>`)
            extraction_mode: "plain" for legacy functionality, "layout" to extract
                text in a fixed-width format that closely adheres to the rendered
                layout of the source PDF.
            extraction_kwargs: Optional additional parameters for the extraction
                process.

        Raises:
            ValueError: If the `mode` is not "single" or "page".
        """
        super().__init__()
        if mode not in ["single", "page"]:
            raise ValueError("mode must be single or page")
        self.extract_images = extract_images
        if extract_images and not images_parser:
            images_parser = RapidOCRBlobParser()
        self.images_parser = images_parser
        self.images_inner_format = images_inner_format
        self.password = password
        self.mode = mode
        self.pages_delimiter = pages_delimiter
        self.extraction_mode = extraction_mode
        self.extraction_kwargs = extraction_kwargs or {}
[docs]
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob.

        Insert images, if possible, between two paragraphs. In this way, a
        paragraph can be continued on the next page.

        Args:
            blob: The blob to parse.

        Raises:
            ImportError: If the `pypdf` package is not found.

        Yield:
            An iterator over the parsed documents.
        """
        try:
            import pypdf
        except ImportError:
            raise ImportError(
                "`pypdf` package not found, please install it with `pip install pypdf`"
            )

        def _extract_text_from_page(page: pypdf.PageObject) -> str:
            """Extract text from a page, depending on the installed pypdf version.

            Args:
                page: The page object to extract text from.

            Returns:
                str: The extracted text.
            """
            if pypdf.__version__.startswith("3"):
                return page.extract_text()
            else:
                return page.extract_text(
                    extraction_mode=self.extraction_mode,
                    **self.extraction_kwargs,
                )

        with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
            pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)

            doc_metadata = _purge_metadata(
                {"producer": "PyPDF", "creator": "PyPDF", "creationdate": ""}
                | cast(dict, pdf_reader.metadata or {})
                | {
                    "source": blob.source,
                    "total_pages": len(pdf_reader.pages),
                }
            )
            single_texts = []
            for page_number, page in enumerate(pdf_reader.pages):
                text_from_page = _extract_text_from_page(page=page)
                images_from_page = self.extract_images_from_page(page)
                all_text = _merge_text_and_extras(
                    [images_from_page], text_from_page
                ).strip()
                if self.mode == "page":
                    yield Document(
                        page_content=all_text,
                        metadata=_validate_metadata(
                            doc_metadata
                            | {
                                "page": page_number,
                                "page_label": pdf_reader.page_labels[page_number],
                            }
                        ),
                    )
                else:
                    single_texts.append(all_text)
            if self.mode == "single":
                yield Document(
                    page_content=self.pages_delimiter.join(single_texts),
                    metadata=_validate_metadata(doc_metadata),
                )
[docs]
    def extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
        """Extract images from a PDF page and get the text using the images parser.

        Args:
            page: The page object from which to extract images.

        Returns:
            str: The extracted text from the images on the page.
        """
        if not self.images_parser:
            return ""
        import pypdf
        from PIL import Image

        if "/XObject" not in cast(dict, page["/Resources"]).keys():
            return ""

        xObject = page["/Resources"]["/XObject"].get_object()  # type: ignore[index]
        images = []
        for obj in xObject:
            np_image: Any = None
            if xObject[obj]["/Subtype"] == "/Image":
                img_filter = (
                    xObject[obj]["/Filter"][1:]
                    if type(xObject[obj]["/Filter"]) is pypdf.generic._base.NameObject
                    else xObject[obj]["/Filter"][0][1:]
                )
                if img_filter in _PDF_FILTER_WITHOUT_LOSS:
                    height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]
                    np_image = np.frombuffer(
                        xObject[obj].get_data(), dtype=np.uint8
                    ).reshape(height, width, -1)
                elif img_filter in _PDF_FILTER_WITH_LOSS:
                    np_image = np.array(
                        Image.open(io.BytesIO(xObject[obj].get_data()))
                    )
                else:
                    logger.warning("Unknown PDF Filter!")

                if np_image is not None:
                    image_bytes = io.BytesIO()
                    Image.fromarray(np_image).save(image_bytes, format="PNG")
                    blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
                    image_text = next(self.images_parser.lazy_parse(blob)).page_content
                    images.append(
                        _format_inner_image(blob, image_text, self.images_inner_format)
                    )
        return _FORMAT_IMAGE_STR.format(
            image_text=_JOIN_IMAGES.join(filter(None, images))
        )
[docs]
class PDFMinerParser(BaseBlobParser):
    """Parse a blob from a PDF using the `pdfminer.six` library.

    This class provides methods to parse a blob from a PDF document, supporting
    various configurations such as handling password-protected PDFs, extracting
    images, and defining extraction mode. It integrates the `pdfminer.six` library
    for PDF processing and offers synchronous blob parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pdfminer.six pillow

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PDFMinerParser

            parser = PDFMinerParser(
                # password = None,
                mode = "single",
                pages_delimiter = "\n\f",
                # extract_images = True,
                # images_parser = TesseractBlobParser(),
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    """

    _warn_concatenate_pages = False
[docs]
    def __init__(
        self,
        extract_images: bool = False,
        *,
        password: Optional[str] = None,
        mode: Literal["single", "page"] = "single",
        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
        images_parser: Optional[BaseImageBlobParser] = None,
        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
        concatenate_pages: Optional[bool] = None,
    ):
        """Initialize a parser based on PDFMiner.

        Args:
            password: Optional password for opening encrypted PDFs.
            mode: The extraction mode, either "single" for the entire document
                or "page" for page-wise extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            extract_images: Whether to extract images from the PDF.
            images_parser: Optional image blob parser.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link
                  (`![body](#)`)
                - "html-img" = wrap the content as the `alt` text of an image tag
                  (`<img alt="{body}" src="#"/>`)
            concatenate_pages: Deprecated. If True, concatenate all PDF pages
                into a single document. Otherwise, return one document per page.

        Returns:
            This method does not directly return data. Use the `parse` or
            `lazy_parse` methods to retrieve parsed documents with content and
            metadata.

        Raises:
            ValueError: If the `mode` is not "single" or "page".

        Warnings:
            The `concatenate_pages` parameter is deprecated. Use `mode='single'`
            or `mode='page'` instead.
        """
        super().__init__()
        if mode not in ["single", "page"]:
            raise ValueError("mode must be single or page")
        if extract_images and not images_parser:
            images_parser = RapidOCRBlobParser()
        self.extract_images = extract_images
        self.images_parser = images_parser
        self.images_inner_format = images_inner_format
        self.password = password
        self.mode = mode
        self.pages_delimiter = pages_delimiter
        if concatenate_pages is not None:
            if not PDFMinerParser._warn_concatenate_pages:
                PDFMinerParser._warn_concatenate_pages = True
                logger.warning(
                    "`concatenate_pages` parameter is deprecated. "
                    "Use `mode='single' or 'page'` instead."
                )
            self.mode = "single" if concatenate_pages else "page"
[docs]
    # A short usage sketch for `decode_text` appears after this class definition.
    @staticmethod
    def decode_text(s: Union[bytes, str]) -> str:
        """Decode a PDFDocEncoding string to Unicode.

        Adds py3 compatibility to pdfminer's version.

        Args:
            s: The string to decode.

        Returns:
            str: The decoded Unicode string.
        """
        from pdfminer.utils import PDFDocEncoding

        if isinstance(s, bytes) and s.startswith(b"\xfe\xff"):
            return str(s[2:], "utf-16be", "ignore")
        try:
            ords = (ord(c) if isinstance(c, str) else c for c in s)
            return "".join(PDFDocEncoding[o] for o in ords)
        except IndexError:
            return str(s)
[docs]
    @staticmethod
    def resolve_and_decode(obj: Any) -> Any:
        """Recursively resolve and decode metadata values.

        Args:
            obj: The object to resolve and decode. It can be of any type.

        Returns:
            The resolved and decoded object.
        """
        from pdfminer.psparser import PSLiteral

        if hasattr(obj, "resolve"):
            obj = obj.resolve()
        if isinstance(obj, list):
            return list(map(PDFMinerParser.resolve_and_decode, obj))
        elif isinstance(obj, PSLiteral):
            return PDFMinerParser.decode_text(obj.name)
        elif isinstance(obj, (str, bytes)):
            return PDFMinerParser.decode_text(obj)
        elif isinstance(obj, dict):
            for k, v in obj.items():
                obj[k] = PDFMinerParser.resolve_and_decode(v)
            return obj

        return obj
    def _get_metadata(
        self,
        fp: BinaryIO,
        password: str = "",
        caching: bool = True,
    ) -> dict[str, Any]:
        """Extract metadata from a PDF file.

        Args:
            fp: The file pointer to the PDF file.
            password: The password for the PDF file, if encrypted. Defaults to an
                empty string.
            caching: Whether to cache the PDF structure. Defaults to True.

        Returns:
            Metadata of the PDF file.
        """
        from pdfminer.pdfpage import PDFDocument, PDFPage, PDFParser

        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        metadata = {}

        for info in doc.info:
            metadata.update(info)
        for k, v in metadata.items():
            try:
                metadata[k] = PDFMinerParser.resolve_and_decode(v)
            except Exception as e:  # pragma: nocover
                # This metadata value could not be parsed. Instead of failing the
                # PDF read, treat it as a warning only.
                logger.warning(
                    '[WARNING] Metadata key "%s" could not be parsed due to '
                    "exception: %s",
                    k,
                    str(e),
                )

        # Count number of pages.
        metadata["total_pages"] = len(list(PDFPage.create_pages(doc)))

        return metadata
[docs]
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob.

        Insert images, if possible, between two paragraphs. In this way, a
        paragraph can be continued on the next page.

        Args:
            blob: The blob to parse.

        Raises:
            ImportError: If the `pdfminer.six` or `pillow` package is not found.

        Yield:
            An iterator over the parsed documents.
        """
        try:
            import pdfminer
            from pdfminer.converter import PDFLayoutAnalyzer
            from pdfminer.layout import (
                LAParams,
                LTContainer,
                LTImage,
                LTItem,
                LTPage,
                LTText,
                LTTextBox,
            )
            from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
            from pdfminer.pdfpage import PDFPage

            if int(pdfminer.__version__) < 20201018:
                raise ImportError(
                    "This parser is tested with pdfminer.six version 20201018 or "
                    "later. Remove pdfminer, and install pdfminer.six with "
                    "`pip uninstall pdfminer && pip install pdfminer.six`."
                )
        except ImportError:
            raise ImportError(
                "pdfminer package not found, please install it "
                "with `pip install pdfminer.six`"
            )

        with blob.as_bytes_io() as pdf_file_obj, TemporaryDirectory() as tempdir:
            pages = PDFPage.get_pages(pdf_file_obj, password=self.password or "")
            rsrcmgr = PDFResourceManager()
            doc_metadata = _purge_metadata(
                self._get_metadata(pdf_file_obj, password=self.password or "")
            )
            doc_metadata["source"] = blob.source

            class Visitor(PDFLayoutAnalyzer):
                def __init__(
                    self,
                    rsrcmgr: PDFResourceManager,
                    pageno: int = 1,
                    laparams: Optional[LAParams] = None,
                ) -> None:
                    super().__init__(rsrcmgr, pageno=pageno, laparams=laparams)

                # The first parameter is named `me` so that `self` keeps
                # referring to the enclosing PDFMinerParser instance.
                def receive_layout(me, ltpage: LTPage) -> None:
                    def render(item: LTItem) -> None:
                        if isinstance(item, LTContainer):
                            for child in item:
                                render(child)
                        elif isinstance(item, LTText):
                            text_io.write(item.get_text())
                        if isinstance(item, LTTextBox):
                            text_io.write("\n")
                        elif isinstance(item, LTImage):
                            if self.images_parser:
                                from pdfminer.image import ImageWriter

                                image_writer = ImageWriter(tempdir)
                                filename = image_writer.export_image(item)
                                blob = Blob.from_path(Path(tempdir) / filename)
                                blob.metadata["source"] = "#"
                                image_text = next(
                                    self.images_parser.lazy_parse(blob)
                                ).page_content

                                text_io.write(
                                    _format_inner_image(
                                        blob, image_text, self.images_inner_format
                                    )
                                )
                        else:
                            pass

                    render(ltpage)

            text_io = io.StringIO()
            visitor_for_all = PDFPageInterpreter(
                rsrcmgr, Visitor(rsrcmgr, laparams=LAParams())
            )
            all_content = []
            for i, page in enumerate(pages):
                text_io.truncate(0)
                text_io.seek(0)
                visitor_for_all.process_page(page)

                all_text = text_io.getvalue()
                # For legacy compatibility, strip the text.
                all_text = all_text.strip()
                if self.mode == "page":
                    text_io.truncate(0)
                    text_io.seek(0)
                    yield Document(
                        page_content=all_text,
                        metadata=_validate_metadata(doc_metadata | {"page": i}),
                    )
                else:
                    if all_text.endswith("\f"):
                        all_text = all_text[:-1]
                    all_content.append(all_text)
            if self.mode == "single":
                # Add pages_delimiter between pages.
                document_content = self.pages_delimiter.join(all_content)
                yield Document(
                    page_content=document_content,
                    metadata=_validate_metadata(doc_metadata),
                )
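# Illustrative sketch (not part of the upstream module): `PDFMinerParser.decode_text`
# handles both BOM-prefixed UTF-16BE byte strings and PDFDocEncoding values.
# `_example_decode_text` is a hypothetical demonstration and requires
# `pdfminer.six` to be installed.
def _example_decode_text() -> None:
    # UTF-16BE byte strings carry a \xfe\xff BOM and are decoded directly.
    assert PDFMinerParser.decode_text(b"\xfe\xff\x00H\x00i") == "Hi"
    # Plain ASCII strings map to themselves through PDFDocEncoding.
    assert PDFMinerParser.decode_text("plain") == "plain"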
[docs]
class PyMuPDFParser(BaseBlobParser):
    """Parse a blob from a PDF using the `PyMuPDF` library.

    This class provides methods to parse a blob from a PDF document, supporting
    various configurations such as handling password-protected PDFs, extracting
    images, and defining extraction mode. It integrates the `PyMuPDF` library for
    PDF processing and offers synchronous blob parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pymupdf

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PyMuPDFParser

            parser = PyMuPDFParser(
                # password = None,
                mode = "single",
                pages_delimiter = "\n\f",
                # images_parser = TesseractBlobParser(),
                # extract_tables = "markdown",
                # extract_tables_settings = None,
                # text_kwargs = None,
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    """

    # PyMuPDF is not thread safe.
    # See https://pymupdf.readthedocs.io/en/latest/recipes-multiprocessing.html
    _lock = threading.Lock()
[docs]
    def __init__(
        self,
        text_kwargs: Optional[dict[str, Any]] = None,
        extract_images: bool = False,
        *,
        password: Optional[str] = None,
        mode: Literal["single", "page"] = "page",
        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
        images_parser: Optional[BaseImageBlobParser] = None,
        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
        extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
        extract_tables_settings: Optional[dict[str, Any]] = None,
    ) -> None:
        """Initialize a parser based on PyMuPDF.

        Args:
            text_kwargs: Keyword arguments to pass to ``page.get_text()``.
            password: Optional password for opening encrypted PDFs.
            mode: The extraction mode, either "single" for the entire document
                or "page" for page-wise extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            extract_images: Whether to extract images from the PDF.
            images_parser: Optional image blob parser.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link
                  (`![body](#)`)
                - "html-img" = wrap the content as the `alt` text of an image tag
                  (`<img alt="{body}" src="#"/>`)
            extract_tables: Whether to extract tables in a specific format, such
                as "csv", "markdown", or "html".
            extract_tables_settings: Optional dictionary of settings for customizing
                table extraction.

        Returns:
            This method does not directly return data. Use the `parse` or
            `lazy_parse` methods to retrieve parsed documents with content and
            metadata.

        Raises:
            ValueError: If the mode is not "single" or "page".
            ValueError: If the extract_tables format is not "markdown", "html",
                or "csv".
        """
        super().__init__()
        if mode not in ["single", "page"]:
            raise ValueError("mode must be single or page")
        if extract_tables and extract_tables not in ["markdown", "html", "csv"]:
            raise ValueError("extract_tables must be markdown, html, or csv")
        self.mode = mode
        self.pages_delimiter = pages_delimiter
        self.password = password
        self.text_kwargs = text_kwargs or {}
        if extract_images and not images_parser:
            images_parser = RapidOCRBlobParser()
        self.extract_images = extract_images
        self.images_inner_format = images_inner_format
        self.images_parser = images_parser
        self.extract_tables = extract_tables
        self.extract_tables_settings = extract_tables_settings
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        # Delegate to `_lazy_parse`, which also accepts a backwards-compatible
        # `text_kwargs` override.
        return self._lazy_parse(blob)

    def _lazy_parse(
        self,
        blob: Blob,
        # text_kwargs is present for backwards compatibility.
        # Users should not use it directly.
        text_kwargs: Optional[dict[str, Any]] = None,
    ) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob.

        Insert images, if possible, between two paragraphs. In this way, a
        paragraph can be continued on the next page.

        Args:
            blob: The blob to parse.
            text_kwargs: Optional keyword arguments to pass to the `get_text` method.
                If provided at run time, it will override the default text_kwargs.

        Raises:
            ImportError: If the `pymupdf` package is not found.

        Yield:
            An iterator over the parsed documents.
        """
        try:
            import pymupdf

            text_kwargs = text_kwargs or self.text_kwargs
            if not self.extract_tables_settings:
                from pymupdf.table import (
                    DEFAULT_JOIN_TOLERANCE,
                    DEFAULT_MIN_WORDS_HORIZONTAL,
                    DEFAULT_MIN_WORDS_VERTICAL,
                    DEFAULT_SNAP_TOLERANCE,
                )

                self.extract_tables_settings = {
                    # See https://pymupdf.readthedocs.io/en/latest/page.html#Page.find_tables
                    "clip": None,
                    "vertical_strategy": "lines",
                    "horizontal_strategy": "lines",
                    "vertical_lines": None,
                    "horizontal_lines": None,
                    "snap_tolerance": DEFAULT_SNAP_TOLERANCE,
                    "snap_x_tolerance": None,
                    "snap_y_tolerance": None,
                    "join_tolerance": DEFAULT_JOIN_TOLERANCE,
                    "join_x_tolerance": None,
                    "join_y_tolerance": None,
                    "edge_min_length": 3,
                    "min_words_vertical": DEFAULT_MIN_WORDS_VERTICAL,
                    "min_words_horizontal": DEFAULT_MIN_WORDS_HORIZONTAL,
                    "intersection_tolerance": 3,
                    "intersection_x_tolerance": None,
                    "intersection_y_tolerance": None,
                    "text_tolerance": 3,
                    "text_x_tolerance": 3,
                    "text_y_tolerance": 3,
                    "strategy": None,  # offer abbreviation
                    "add_lines": None,  # optional user-specified lines
                }
        except ImportError:
            raise ImportError(
                "pymupdf package not found, please install it "
                "with `pip install pymupdf`"
            )

        with PyMuPDFParser._lock:
            with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
                if blob.data is None:  # type: ignore[attr-defined]
                    doc = pymupdf.open(file_path)
                else:
                    doc = pymupdf.open(stream=file_path, filetype="pdf")
                if doc.is_encrypted:
                    doc.authenticate(self.password)
                doc_metadata = self._extract_metadata(doc, blob)
                full_content = []
                for page in doc:
                    all_text = self._get_page_content(doc, page, text_kwargs).strip()
                    if self.mode == "page":
                        yield Document(
                            page_content=all_text,
                            metadata=_validate_metadata(
                                doc_metadata | {"page": page.number}
                            ),
                        )
                    else:
                        full_content.append(all_text)

                if self.mode == "single":
                    yield Document(
                        page_content=self.pages_delimiter.join(full_content),
                        metadata=_validate_metadata(doc_metadata),
                    )

    def _get_page_content(
        self,
        doc: pymupdf.Document,
        page: pymupdf.Page,
        text_kwargs: dict[str, Any],
    ) -> str:
        """Get the text of the page using PyMuPDF, plus any extracted images and
        tables.

        Args:
            doc: The PyMuPDF document object.
            page: The PyMuPDF page object.
            text_kwargs: Keyword arguments to pass to `page.get_text()`.

        Returns:
            str: The text content of the page.
        """
        text_from_page = page.get_text(**{**self.text_kwargs, **text_kwargs})
        images_from_page = self._extract_images_from_page(doc, page)
        tables_from_page = self._extract_tables_from_page(page)
        extras = []
        if images_from_page:
            extras.append(images_from_page)
        if tables_from_page:
            extras.append(tables_from_page)
        all_text = _merge_text_and_extras(extras, text_from_page)

        return all_text

    def _extract_metadata(self, doc: pymupdf.Document, blob: Blob) -> dict:
        """Extract metadata from the document and page.

        Args:
            doc: The PyMuPDF document object.
            blob: The blob being parsed.

        Returns:
            dict: The extracted metadata.
        """
        metadata = _purge_metadata(
            {
                **{
                    "producer": "PyMuPDF",
                    "creator": "PyMuPDF",
                    "creationdate": "",
                    "source": blob.source,  # type: ignore[attr-defined]
                    "file_path": blob.source,  # type: ignore[attr-defined]
                    "total_pages": len(doc),
                },
                **{
                    k: doc.metadata[k]
                    for k in doc.metadata
                    if isinstance(doc.metadata[k], (str, int))
                },
            }
        )
        for k in ("modDate", "creationDate"):
            if k in doc.metadata:
                metadata[k] = doc.metadata[k]
        return metadata

    def _extract_images_from_page(
        self, doc: pymupdf.Document, page: pymupdf.Page
    ) -> str:
        """Extract images from a PDF page and get the text using the images parser.

        Args:
            doc: The PyMuPDF document object.
            page: The PyMuPDF page object.

        Returns:
            str: The extracted text from the images on the page.
        """
        if not self.images_parser:
            return ""
        import pymupdf

        img_list = page.get_images()
        images = []
        for img in img_list:
            if self.images_parser:
                xref = img[0]
                pix = pymupdf.Pixmap(doc, xref)
                image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                    pix.height, pix.width, -1
                )
                image_bytes = io.BytesIO()
                numpy.save(image_bytes, image)
                blob = Blob.from_data(
                    image_bytes.getvalue(), mime_type="application/x-npy"
                )
                image_text = next(self.images_parser.lazy_parse(blob)).page_content

                images.append(
                    _format_inner_image(blob, image_text, self.images_inner_format)
                )
        return _FORMAT_IMAGE_STR.format(
            image_text=_JOIN_IMAGES.join(filter(None, images))
        )

    def _extract_tables_from_page(self, page: pymupdf.Page) -> str:
        """Extract tables from a PDF page.

        Args:
            page: The PyMuPDF page object.

        Returns:
            str: The extracted tables in the specified format.
        """
        if self.extract_tables is None:
            return ""
        import pymupdf

        tables_list = list(
            pymupdf.table.find_tables(page, **self.extract_tables_settings)
        )
        if tables_list:
            if self.extract_tables == "markdown":
                return _JOIN_TABLES.join(
                    [table.to_markdown() for table in tables_list]
                )
            elif self.extract_tables == "html":
                return _JOIN_TABLES.join(
                    [
                        table.to_pandas().to_html(
                            header=False,
                            index=False,
                            bold_rows=False,
                        )
                        for table in tables_list
                    ]
                )
            elif self.extract_tables == "csv":
                return _JOIN_TABLES.join(
                    [
                        table.to_pandas().to_csv(
                            header=False,
                            index=False,
                        )
                        for table in tables_list
                    ]
                )
            else:
                raise ValueError(
                    f"extract_tables {self.extract_tables} not implemented"
                )
        return ""
[docs]
class PyPDFium2Parser(BaseBlobParser):
    """Parse a blob from a PDF using the `PyPDFium2` library.

    This class provides methods to parse a blob from a PDF document, supporting
    various configurations such as handling password-protected PDFs, extracting
    images, and defining extraction mode. It integrates the `PyPDFium2` library
    for PDF processing and offers synchronous blob parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pypdfium2

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PyPDFium2Parser

            parser = PyPDFium2Parser(
                # password = None,
                mode = "page",
                pages_delimiter = "\n\f",
                # extract_images = True,
                # images_parser = TesseractBlobParser(),
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    """

    # PyPDFium2 is not thread safe.
    # See https://pypdfium2.readthedocs.io/en/stable/python_api.html#thread-incompatibility
    _lock = threading.Lock()
[docs]
    def __init__(
        self,
        extract_images: bool = False,
        *,
        password: Optional[str] = None,
        mode: Literal["single", "page"] = "page",
        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
        images_parser: Optional[BaseImageBlobParser] = None,
        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
    ) -> None:
        """Initialize a parser based on PyPDFium2.

        Args:
            password: Optional password for opening encrypted PDFs.
            mode: The extraction mode, either "single" for the entire document
                or "page" for page-wise extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            extract_images: Whether to extract images from the PDF.
            images_parser: Optional image blob parser.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link
                  (`![body](#)`)
                - "html-img" = wrap the content as the `alt` text of an image tag
                  (`<img alt="{body}" src="#"/>`)

        Returns:
            This method does not directly return data. Use the `parse` or
            `lazy_parse` methods to retrieve parsed documents with content and
            metadata.

        Raises:
            ValueError: If the mode is not "single" or "page".
        """
        super().__init__()
        if mode not in ["single", "page"]:
            raise ValueError("mode must be single or page")
        self.extract_images = extract_images
        if extract_images and not images_parser:
            images_parser = RapidOCRBlobParser()
        self.images_parser = images_parser
        self.images_inner_format = images_inner_format
        self.password = password
        self.mode = mode
        self.pages_delimiter = pages_delimiter
[docs]
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob.

        Insert images, if possible, between two paragraphs. In this way, a
        paragraph can be continued on the next page.

        Args:
            blob: The blob to parse.

        Raises:
            ImportError: If the `pypdfium2` package is not found.

        Yield:
            An iterator over the parsed documents.
        """
        try:
            import pypdfium2
        except ImportError:
            raise ImportError(
                "pypdfium2 package not found, please install it with"
                " `pip install pypdfium2`"
            )

        # pypdfium2 is really finicky with respect to closing things;
        # if done incorrectly, it creates seg faults.
        with PyPDFium2Parser._lock:
            with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
                pdf_reader = None
                try:
                    pdf_reader = pypdfium2.PdfDocument(
                        file_path, password=self.password, autoclose=True
                    )
                    full_content = []

                    doc_metadata = _purge_metadata(pdf_reader.get_metadata_dict())
                    doc_metadata["source"] = blob.source
                    doc_metadata["total_pages"] = len(pdf_reader)

                    for page_number, page in enumerate(pdf_reader):
                        text_page = page.get_textpage()
                        text_from_page = "\n".join(
                            text_page.get_text_range().splitlines()
                        )  # Replace \r\n
                        text_page.close()
                        image_from_page = self._extract_images_from_page(page)
                        all_text = _merge_text_and_extras(
                            [image_from_page], text_from_page
                        ).strip()
                        page.close()

                        if self.mode == "page":
                            # For legacy compatibility, add the last '\n'
                            if not all_text.endswith("\n"):
                                all_text += "\n"
                            yield Document(
                                page_content=all_text,
                                metadata=_validate_metadata(
                                    {
                                        **doc_metadata,
                                        "page": page_number,
                                    }
                                ),
                            )
                        else:
                            full_content.append(all_text)

                    if self.mode == "single":
                        yield Document(
                            page_content=self.pages_delimiter.join(full_content),
                            metadata=_validate_metadata(doc_metadata),
                        )
                finally:
                    if pdf_reader:
                        pdf_reader.close()
    def _extract_images_from_page(self, page: pypdfium2._helpers.page.PdfPage) -> str:
        """Extract images from a PDF page and get the text using the images parser.

        Args:
            page: The page object from which to extract images.

        Returns:
            str: The extracted text from the images on the page.
        """
        if not self.images_parser:
            return ""

        import pypdfium2.raw as pdfium_c

        images = list(page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,)))
        if not images:
            return ""
        str_images = []
        for image in images:
            image_bytes = io.BytesIO()
            np_image = image.get_bitmap().to_numpy()
            if np_image.size < 3:
                continue
            numpy.save(image_bytes, np_image)
            blob = Blob.from_data(image_bytes.getvalue(), mime_type="application/x-npy")
            text_from_image = next(self.images_parser.lazy_parse(blob)).page_content
            str_images.append(
                _format_inner_image(blob, text_from_image, self.images_inner_format)
            )
            image.close()
        return _FORMAT_IMAGE_STR.format(image_text=_JOIN_IMAGES.join(str_images))
[docs]
class PDFPlumberParser(BaseBlobParser):
    """Parse `PDF` with `PDFPlumber`."""
[docs]
    def __init__(
        self,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        dedupe: bool = False,
        extract_images: bool = False,
    ) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to
                ``pdfplumber.Page.extract_text()``.
            dedupe: Avoid the error of duplicate characters if `dedupe=True`.
            extract_images: Whether to extract images from the PDF.
        """
        try:
            import PIL  # noqa:F401
        except ImportError:
            raise ImportError(
                "pillow package not found, please install it with "
                "`pip install pillow`"
            )
        self.text_kwargs = text_kwargs or {}
        self.dedupe = dedupe
        self.extract_images = extract_images
[docs]
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob."""
        import pdfplumber

        with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
            doc = pdfplumber.open(file_path)  # open document

            yield from [
                Document(
                    page_content=self._process_page_content(page)
                    + "\n"
                    + self._extract_images_from_page(page),
                    metadata=dict(
                        {
                            "source": blob.source,  # type: ignore[attr-defined]
                            "file_path": blob.source,  # type: ignore[attr-defined]
                            "page": page.page_number - 1,
                            "total_pages": len(doc.pages),
                        },
                        **{
                            k: doc.metadata[k]
                            for k in doc.metadata
                            if type(doc.metadata[k]) in [str, int]
                        },
                    ),
                )
                for page in doc.pages
            ]
    def _process_page_content(self, page: pdfplumber.page.Page) -> str:
        """Process the page content based on dedupe."""
        if self.dedupe:
            return page.dedupe_chars().extract_text(**self.text_kwargs)
        return page.extract_text(**self.text_kwargs)

    def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
        """Extract images from a page and get the text with RapidOCR."""
        from PIL import Image

        if not self.extract_images:
            return ""

        images = []
        for img in page.images:
            if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
                if img["stream"]["BitsPerComponent"] == 1:
                    images.append(
                        np.array(
                            Image.frombytes(
                                "1",
                                (img["stream"]["Width"], img["stream"]["Height"]),
                                img["stream"].get_data(),
                            ).convert("L")
                        )
                    )
                else:
                    images.append(
                        np.frombuffer(
                            img["stream"].get_data(), dtype=np.uint8
                        ).reshape(img["stream"]["Height"], img["stream"]["Width"], -1)
                    )
            elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS:
                images.append(img["stream"].get_data())
            else:
                warnings.warn("Unknown PDF Filter!")

        return extract_from_images_with_rapidocr(images)
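# Illustrative sketch (not part of the upstream module): parsing a blob with
# `PDFPlumberParser`, mirroring the docstring examples of the other parsers.
# The sample path is the one used in those examples, and `pdfplumber` plus
# `pillow` must be installed.
def _example_pdfplumber_usage() -> None:
    from langchain_core.documents.base import Blob

    blob = Blob.from_path("./example_data/layout-parser-paper.pdf")
    parser = PDFPlumberParser(dedupe=True)
    for doc in parser.lazy_parse(blob):
        # One Document per page, with a 0-based "page" entry in the metadata.
        print(doc.metadata["page"], doc.page_content[:100])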
[docs]
class AmazonTextractPDFParser(BaseBlobParser):
    """Send `PDF` files to `Amazon Textract` and parse them.

    For parsing multi-page PDFs, they have to reside on S3.

    The AmazonTextractPDFLoader calls the
    [Amazon Textract Service](https://aws.amazon.com/textract/)
    to convert PDFs into a Document structure.
    Single and multi-page documents are supported, with up to 3000 pages
    and 512 MB in size.

    For the call to be successful an AWS account is required,
    similar to the
    [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
    requirements.

    Besides the AWS configuration, it is very similar to the other PDF
    loaders, while also supporting JPEG, PNG, TIFF and non-native
    PDF formats.

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader

    loader = AmazonTextractPDFLoader("example_data/alejandro_rosalez_sample-small.jpeg")
    documents = loader.load()
    ```

    One feature is the linearization of the output.
    When using the features LAYOUT, FORMS or TABLES together with Textract

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader

    # you can mix and match each of the features
    loader = AmazonTextractPDFLoader(
        "example_data/alejandro_rosalez_sample-small.jpeg",
        textract_features=["TABLES", "LAYOUT"],
    )
    documents = loader.load()
    ```

    it will generate output that formats the text in reading order and
    tries to output the information in a tabular structure or output the
    key/value pairs with a colon (key: value).
    This helps most LLMs to achieve better accuracy when
    processing these texts.

    ``Document`` objects are returned with metadata that includes the
    ``source`` and a 1-based index of the page number in ``page``.
    Note that ``page`` represents the index of the result returned from
    Textract, not necessarily the as-written page number in the document.
    """
[docs]
    def __init__(
        self,
        textract_features: Optional[Sequence[int]] = None,
        client: Optional[Any] = None,
        *,
        linearization_config: Optional[TextLinearizationConfig] = None,
    ) -> None:
        """Initialize the parser.

        Args:
            textract_features: Features to be used for extraction. Each feature
                should be passed as an int that conforms to the enum
                `Textract_Features`; see the `amazon-textract-caller` pkg.
            client: boto3 textract client.
            linearization_config: Config to be used for linearization of the output.
                Should be an instance of TextLinearizationConfig from the
                `textractor` pkg.
        """
        try:
            import textractcaller as tc
            import textractor.entities.document as textractor

            self.tc = tc
            self.textractor = textractor

            if textract_features is not None:
                self.textract_features = [
                    tc.Textract_Features(f) for f in textract_features
                ]
            else:
                self.textract_features = []

            if linearization_config is not None:
                self.linearization_config = linearization_config
            else:
                self.linearization_config = self.textractor.TextLinearizationConfig(
                    hide_figure_layout=True,
                    title_prefix="# ",
                    section_header_prefix="## ",
                    list_element_prefix="*",
                )
        except ImportError:
            raise ImportError(
                "Could not import amazon-textract-caller or "
                "amazon-textract-textractor python package. Please install it "
                "with `pip install amazon-textract-caller` & "
                "`pip install amazon-textract-textractor`."
            )

        if not client:
            try:
                import boto3

                self.boto3_textract_client = boto3.client("textract")
            except ImportError:
                raise ImportError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                )
        else:
            self.boto3_textract_client = client
[docs]
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Iterate over the blob's pages and yield a Document for each page,
        like the other parsers.

        For a multi-page document, blob.path has to be set to the S3 URI; for
        single-page documents, blob.data is used.
        """
        url_parse_result = urlparse(str(blob.path)) if blob.path else None  # type: ignore[attr-defined]
        # Either call with S3 path (multi-page) or with bytes (single-page)
        if (
            url_parse_result
            and url_parse_result.scheme == "s3"
            and url_parse_result.netloc
        ):
            textract_response_json = self.tc.call_textract(
                input_document=str(blob.path),  # type: ignore[attr-defined]
                features=self.textract_features,
                boto3_textract_client=self.boto3_textract_client,
            )
        else:
            textract_response_json = self.tc.call_textract(
                input_document=blob.as_bytes(),  # type: ignore[attr-defined]
                features=self.textract_features,
                call_mode=self.tc.Textract_Call_Mode.FORCE_SYNC,
                boto3_textract_client=self.boto3_textract_client,
            )

        document = self.textractor.Document.open(textract_response_json)

        for idx, page in enumerate(document.pages):
            yield Document(
                page_content=page.get_text(config=self.linearization_config),
                metadata={"source": blob.source, "page": idx + 1},  # type: ignore[attr-defined]
            )
[docs]
class DocumentIntelligenceParser(BaseBlobParser):
    """Loads a PDF with Azure Document Intelligence (formerly Form Recognizer)
    and chunks at character level."""
[docs]
    def __init__(self, client: Any, model: str):
        warnings.warn(
            "langchain_community.document_loaders.parsers.pdf.DocumentIntelligenceParser"
            " and langchain_community.document_loaders.pdf.DocumentIntelligenceLoader"
            " are deprecated. Please upgrade to "
            "langchain_community.document_loaders.DocumentIntelligenceLoader "
            "for any file parsing purpose using Azure Document Intelligence "
            "service."
        )
        self.client = client
        self.model = model