Source code for langchain_community.document_loaders.parsers.pdf
"""Module contains common parsers for PDFs."""from__future__importannotationsimportwarningsfromtypingimport(TYPE_CHECKING,Any,Dict,Iterable,Iterator,Mapping,Optional,Sequence,Union,)fromurllib.parseimporturlparseimportnumpyasnpfromlangchain_core.documentsimportDocumentfromlangchain_community.document_loaders.baseimportBaseBlobParserfromlangchain_community.document_loaders.blob_loadersimportBlobifTYPE_CHECKING:importfitz.fitzimportpdfminer.layoutimportpdfplumber.pageimportpypdf._pageimportpypdfium2._helpers.pagefrompypdfimportPageObjectfromtextractor.data.text_linearization_configimportTextLinearizationConfig_PDF_FILTER_WITH_LOSS=["DCTDecode","DCT","JPXDecode"]_PDF_FILTER_WITHOUT_LOSS=["LZWDecode","LZW","FlateDecode","Fl","ASCII85Decode","A85","ASCIIHexDecode","AHx","RunLengthDecode","RL","CCITTFaxDecode","CCF","JBIG2Decode",]
def extract_from_images_with_rapidocr(
    images: Sequence[Union[Iterable[np.ndarray], bytes]],
) -> str:
    """Extract text from images with RapidOCR.

    Args:
        images: Images to extract text from.

    Returns:
        Text extracted from images.

    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    """
    try:
        from rapidocr_onnxruntime import RapidOCR
    except ImportError:
        raise ImportError(
            "`rapidocr-onnxruntime` package not found, please install it with "
            "`pip install rapidocr-onnxruntime`"
        )
    ocr = RapidOCR()
    text = ""
    for img in images:
        result, _ = ocr(img)
        if result:
            result = [line[1] for line in result]
            text += "\n".join(result)
    return text
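# Usage sketch (the file name is hypothetical; requires `rapidocr-onnxruntime`):
#
#     with open("page.png", "rb") as f:
#         print(extract_from_images_with_rapidocr([f.read()]))
#
# RapidOCR accepts both raw image bytes and numpy pixel arrays, which is why
# the parsers below can pass either compressed streams or decoded pixel
# buffers.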
class PyPDFParser(BaseBlobParser):
    """Load `PDF` using `pypdf`."""
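    # This listing is missing the `__init__`. `lazy_parse` below reads
    # `self.password`, `self.extract_images`, `self.extraction_mode` and
    # `self.extraction_kwargs`, so a minimal reconstruction (argument names
    # and defaults are assumptions) would be:
    #
    #     def __init__(
    #         self,
    #         password: Optional[Union[str, bytes]] = None,
    #         extract_images: bool = False,
    #         *,
    #         extraction_mode: str = "plain",
    #         extraction_kwargs: Optional[Dict[str, Any]] = None,
    #     ) -> None:
    #         self.password = password
    #         self.extract_images = extract_images
    #         self.extraction_mode = extraction_mode
    #         self.extraction_kwargs = extraction_kwargs or {}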
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob."""
        try:
            import pypdf
        except ImportError:
            raise ImportError(
                "`pypdf` package not found, please install it with "
                "`pip install pypdf`"
            )

        def _extract_text_from_page(page: "PageObject") -> str:
            """Extract text from a page, handling the installed pypdf version."""
            if pypdf.__version__.startswith("3"):
                return page.extract_text()
            else:
                return page.extract_text(
                    extraction_mode=self.extraction_mode,
                    **self.extraction_kwargs,
                )

        with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
            pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)

            yield from [
                Document(
                    page_content=_extract_text_from_page(page=page)
                    + self._extract_images_from_page(page),
                    metadata={"source": blob.source, "page": page_number},  # type: ignore[attr-defined]
                )
                for page_number, page in enumerate(pdf_reader.pages)
            ]
    def _extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images or "/XObject" not in page["/Resources"].keys():
            return ""

        xObject = page["/Resources"]["/XObject"].get_object()  # type: ignore
        images = []
        for obj in xObject:
            if xObject[obj]["/Subtype"] == "/Image":
                if xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITHOUT_LOSS:
                    height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]

                    images.append(
                        np.frombuffer(xObject[obj].get_data(), dtype=np.uint8).reshape(
                            height, width, -1
                        )
                    )
                elif xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITH_LOSS:
                    images.append(xObject[obj].get_data())
                else:
                    warnings.warn("Unknown PDF Filter!")
        return extract_from_images_with_rapidocr(images)
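# Usage sketch for PyPDFParser (file name is hypothetical; requires `pypdf`
# and assumes the constructor sketched above):
#
#     from langchain_community.document_loaders.blob_loaders import Blob
#
#     parser = PyPDFParser(extract_images=False)
#     for doc in parser.lazy_parse(Blob.from_path("example.pdf")):
#         print(doc.metadata["page"], doc.page_content[:80])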
class PDFMinerParser(BaseBlobParser):
    """Parse `PDF` using `PDFMiner`."""
    def __init__(self, extract_images: bool = False, *, concatenate_pages: bool = True):
        """Initialize a parser based on PDFMiner.

        Args:
            extract_images: Whether to extract images from PDF.
            concatenate_pages: If True, concatenate all PDF pages into a single
                document. Otherwise, return one document per page.
        """
        self.extract_images = extract_images
        self.concatenate_pages = concatenate_pages
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob."""

        if not self.extract_images:
            try:
                from pdfminer.high_level import extract_text
            except ImportError:
                raise ImportError(
                    "`pdfminer` package not found, please install it with "
                    "`pip install pdfminer.six`"
                )

            with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
                if self.concatenate_pages:
                    text = extract_text(pdf_file_obj)
                    metadata = {"source": blob.source}  # type: ignore[attr-defined]
                    yield Document(page_content=text, metadata=metadata)
                else:
                    from pdfminer.pdfpage import PDFPage

                    pages = PDFPage.get_pages(pdf_file_obj)
                    for i, _ in enumerate(pages):
                        text = extract_text(pdf_file_obj, page_numbers=[i])
                        metadata = {"source": blob.source, "page": str(i)}  # type: ignore[attr-defined]
                        yield Document(page_content=text, metadata=metadata)
        else:
            import io

            from pdfminer.converter import PDFPageAggregator, TextConverter
            from pdfminer.layout import LAParams
            from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
            from pdfminer.pdfpage import PDFPage

            text_io = io.StringIO()
            with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
                pages = PDFPage.get_pages(pdf_file_obj)
                rsrcmgr = PDFResourceManager()
                device_for_text = TextConverter(rsrcmgr, text_io, laparams=LAParams())
                device_for_image = PDFPageAggregator(rsrcmgr, laparams=LAParams())
                interpreter_for_text = PDFPageInterpreter(rsrcmgr, device_for_text)
                interpreter_for_image = PDFPageInterpreter(rsrcmgr, device_for_image)
                for i, page in enumerate(pages):
                    interpreter_for_text.process_page(page)
                    interpreter_for_image.process_page(page)
                    content = text_io.getvalue() + self._extract_images_from_page(
                        device_for_image.get_result()
                    )
                    text_io.truncate(0)
                    text_io.seek(0)
                    metadata = {"source": blob.source, "page": str(i)}  # type: ignore[attr-defined]
                    yield Document(page_content=content, metadata=metadata)
    def _extract_images_from_page(self, page: pdfminer.layout.LTPage) -> str:
        """Extract images from page and get the text with RapidOCR."""
        import pdfminer

        def get_image(layout_object: Any) -> Any:
            if isinstance(layout_object, pdfminer.layout.LTImage):
                return layout_object
            if isinstance(layout_object, pdfminer.layout.LTContainer):
                # Note: only the first child of a container is inspected,
                # since the loop returns on its first iteration.
                for child in layout_object:
                    return get_image(child)
            else:
                return None

        images = []

        for img in list(filter(bool, map(get_image, page))):
            if img.stream["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
                images.append(
                    np.frombuffer(img.stream.get_data(), dtype=np.uint8).reshape(
                        img.stream["Height"], img.stream["Width"], -1
                    )
                )
            elif img.stream["Filter"].name in _PDF_FILTER_WITH_LOSS:
                images.append(img.stream.get_data())
            else:
                warnings.warn("Unknown PDF Filter!")
        return extract_from_images_with_rapidocr(images)
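# Usage sketch for PDFMinerParser (file name is hypothetical; requires
# `pdfminer.six`):
#
#     from langchain_community.document_loaders.blob_loaders import Blob
#
#     parser = PDFMinerParser(concatenate_pages=False)
#     for doc in parser.lazy_parse(Blob.from_path("example.pdf")):
#         print(doc.metadata["page"], len(doc.page_content))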
class PyMuPDFParser(BaseBlobParser):
    """Parse `PDF` using `PyMuPDF`."""
    def __init__(
        self,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        extract_images: bool = False,
    ) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``fitz.Page.get_text()``.
            extract_images: Whether to extract images from PDF.
        """
        self.text_kwargs = text_kwargs or {}
        self.extract_images = extract_images
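    # This listing is missing `lazy_parse`. Based on the
    # `_extract_images_from_page(doc, page)` signature below and the metadata
    # shape used by the sibling parsers, a minimal sketch (the exact metadata
    # keys and the document-opening logic are assumptions) would be:
    #
    #     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
    #         import fitz
    #
    #         with blob.as_bytes_io() as file_path:
    #             doc = (
    #                 fitz.open(file_path)  # open from a file path
    #                 if blob.data is None
    #                 else fitz.open(stream=file_path, filetype="pdf")
    #             )
    #             for page in doc:
    #                 yield Document(
    #                     page_content=page.get_text(**self.text_kwargs)
    #                     + self._extract_images_from_page(doc, page),
    #                     metadata={"source": blob.source, "page": page.number},
    #                 )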
    def _extract_images_from_page(
        self, doc: fitz.fitz.Document, page: fitz.fitz.Page
    ) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images:
            return ""
        import fitz

        img_list = page.get_images()
        imgs = []
        for img in img_list:
            xref = img[0]
            pix = fitz.Pixmap(doc, xref)
            imgs.append(
                np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                    pix.height, pix.width, -1
                )
            )
        return extract_from_images_with_rapidocr(imgs)
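# Usage sketch for PyMuPDFParser (file name is hypothetical; requires PyMuPDF,
# installed as `pymupdf` and imported as `fitz`, and assumes the `lazy_parse`
# sketched above):
#
#     from langchain_community.document_loaders.blob_loaders import Blob
#
#     parser = PyMuPDFParser(extract_images=True)
#     docs = list(parser.lazy_parse(Blob.from_path("example.pdf")))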
class PyPDFium2Parser(BaseBlobParser):
    """Parse `PDF` with `PyPDFium2`."""
    def __init__(self, extract_images: bool = False) -> None:
        """Initialize the parser."""
        try:
            import pypdfium2  # noqa:F401
        except ImportError:
            raise ImportError(
                "pypdfium2 package not found, please install it with"
                " `pip install pypdfium2`"
            )
        self.extract_images = extract_images
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob."""
        import pypdfium2

        # pypdfium2 is really finicky with respect to closing things;
        # closing them incorrectly can cause segfaults.
        with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
            pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
            try:
                for page_number, page in enumerate(pdf_reader):
                    text_page = page.get_textpage()
                    content = text_page.get_text_range()
                    text_page.close()
                    content += "\n" + self._extract_images_from_page(page)
                    page.close()
                    metadata = {"source": blob.source, "page": page_number}  # type: ignore[attr-defined]
                    yield Document(page_content=content, metadata=metadata)
            finally:
                pdf_reader.close()
    def _extract_images_from_page(self, page: pypdfium2._helpers.page.PdfPage) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images:
            return ""

        import pypdfium2.raw as pdfium_c

        images = list(page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,)))

        images = list(map(lambda x: x.get_bitmap().to_numpy(), images))
        return extract_from_images_with_rapidocr(images)
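# Usage sketch for PyPDFium2Parser (file name is hypothetical; requires
# `pypdfium2`):
#
#     from langchain_community.document_loaders.blob_loaders import Blob
#
#     parser = PyPDFium2Parser()
#     for doc in parser.lazy_parse(Blob.from_path("example.pdf")):
#         print(doc.metadata["page"], doc.page_content[:80])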
class PDFPlumberParser(BaseBlobParser):
    """Parse `PDF` with `PDFPlumber`."""
    def __init__(
        self,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        dedupe: bool = False,
        extract_images: bool = False,
    ) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to
                ``pdfplumber.Page.extract_text()``.
            dedupe: Avoid duplicate-character errors by deduplicating
                characters when ``dedupe=True``.
            extract_images: Whether to extract images from PDF.
        """
        self.text_kwargs = text_kwargs or {}
        self.dedupe = dedupe
        self.extract_images = extract_images
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob."""
        import pdfplumber

        with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
            doc = pdfplumber.open(file_path)  # open document

            yield from [
                Document(
                    page_content=self._process_page_content(page)
                    + "\n"
                    + self._extract_images_from_page(page),
                    metadata=dict(
                        {
                            "source": blob.source,  # type: ignore[attr-defined]
                            "file_path": blob.source,  # type: ignore[attr-defined]
                            "page": page.page_number - 1,
                            "total_pages": len(doc.pages),
                        },
                        **{
                            k: doc.metadata[k]
                            for k in doc.metadata
                            if type(doc.metadata[k]) in [str, int]
                        },
                    ),
                )
                for page in doc.pages
            ]
    def _process_page_content(self, page: pdfplumber.page.Page) -> str:
        """Process the page content based on dedupe."""
        if self.dedupe:
            return page.dedupe_chars().extract_text(**self.text_kwargs)
        return page.extract_text(**self.text_kwargs)

    def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images:
            return ""

        images = []
        for img in page.images:
            if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
                images.append(
                    np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
                        img["stream"]["Height"], img["stream"]["Width"], -1
                    )
                )
            elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS:
                images.append(img["stream"].get_data())
            else:
                warnings.warn("Unknown PDF Filter!")

        return extract_from_images_with_rapidocr(images)
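# Usage sketch for PDFPlumberParser (file name is hypothetical; requires
# `pdfplumber`):
#
#     from langchain_community.document_loaders.blob_loaders import Blob
#
#     parser = PDFPlumberParser(dedupe=True)
#     for doc in parser.lazy_parse(Blob.from_path("example.pdf")):
#         print(doc.metadata["page"], "/", doc.metadata["total_pages"])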
class AmazonTextractPDFParser(BaseBlobParser):
    """Send `PDF` files to `Amazon Textract` and parse them.

    For parsing multi-page PDFs, they have to reside on S3.

    The AmazonTextractPDFLoader calls the
    [Amazon Textract Service](https://aws.amazon.com/textract/)
    to convert PDFs into a Document structure.
    Single and multi-page documents are supported with up to 3000 pages
    and 512 MB of size.

    For the call to be successful an AWS account is required,
    similar to the
    [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
    requirements.

    Besides the AWS configuration, it is very similar to the other PDF
    loaders, while also supporting JPEG, PNG and TIFF and non-native
    PDF formats.

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader

    loader = AmazonTextractPDFLoader("example_data/alejandro_rosalez_sample-small.jpeg")
    documents = loader.load()
    ```

    One feature is the linearization of the output.
    When using the features LAYOUT, FORMS or TABLES together with Textract

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader

    # you can mix and match each of the features
    loader = AmazonTextractPDFLoader(
        "example_data/alejandro_rosalez_sample-small.jpeg",
        textract_features=["TABLES", "LAYOUT"],
    )
    documents = loader.load()
    ```

    it will generate output that formats the text in reading order and
    tries to output the information in a tabular structure or output the
    key/value pairs with a colon (key: value). This helps most LLMs to
    achieve better accuracy when processing these texts.
    """
    def __init__(
        self,
        textract_features: Optional[Sequence[int]] = None,
        client: Optional[Any] = None,
        *,
        linearization_config: Optional["TextLinearizationConfig"] = None,
    ) -> None:
        """Initializes the parser.

        Args:
            textract_features: Features to be used for extraction, each feature
                should be passed as an int that conforms to the enum
                `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client
            linearization_config: Config to be used for linearization of the output
                should be an instance of TextLinearizationConfig from
                the `textractor` pkg
        """

        try:
            import textractcaller as tc
            import textractor.entities.document as textractor

            self.tc = tc
            self.textractor = textractor

            if textract_features is not None:
                self.textract_features = [
                    tc.Textract_Features(f) for f in textract_features
                ]
            else:
                self.textract_features = []

            if linearization_config is not None:
                self.linearization_config = linearization_config
            else:
                self.linearization_config = self.textractor.TextLinearizationConfig(
                    hide_figure_layout=True,
                    title_prefix="# ",
                    section_header_prefix="## ",
                    list_element_prefix="*",
                )
        except ImportError:
            raise ImportError(
                "Could not import amazon-textract-caller or "
                "amazon-textract-textractor python package. Please install it "
                "with `pip install amazon-textract-caller` & "
                "`pip install amazon-textract-textractor`."
            )

        if not client:
            try:
                import boto3

                self.boto3_textract_client = boto3.client("textract")
            except ImportError:
                raise ImportError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                )
        else:
            self.boto3_textract_client = client
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Iterates over the Blob pages and returns an Iterator with a Document
        for each page, like the other parsers. For a multi-page document,
        blob.path has to be set to the S3 URI; for single-page docs the
        blob.data is taken.
        """

        url_parse_result = urlparse(str(blob.path)) if blob.path else None  # type: ignore[attr-defined]
        # Either call with S3 path (multi-page) or with bytes (single-page)
        if (
            url_parse_result
            and url_parse_result.scheme == "s3"
            and url_parse_result.netloc
        ):
            textract_response_json = self.tc.call_textract(
                input_document=str(blob.path),  # type: ignore[attr-defined]
                features=self.textract_features,
                boto3_textract_client=self.boto3_textract_client,
            )
        else:
            textract_response_json = self.tc.call_textract(
                input_document=blob.as_bytes(),  # type: ignore[attr-defined]
                features=self.textract_features,
                call_mode=self.tc.Textract_Call_Mode.FORCE_SYNC,
                boto3_textract_client=self.boto3_textract_client,
            )

        document = self.textractor.Document.open(textract_response_json)

        for idx, page in enumerate(document.pages):
            yield Document(
                page_content=page.get_text(config=self.linearization_config),
                metadata={"source": blob.source, "page": idx + 1},  # type: ignore[attr-defined]
            )
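# Usage sketch for AmazonTextractPDFParser (bucket and key are hypothetical;
# requires `amazon-textract-caller`, `amazon-textract-textractor` and
# configured AWS credentials; multi-page PDFs must reside on S3):
#
#     import textractcaller as tc
#     from langchain_community.document_loaders.blob_loaders import Blob
#
#     parser = AmazonTextractPDFParser(
#         textract_features=[tc.Textract_Features.LAYOUT.value],
#     )
#     blob = Blob(data=None, path="s3://my-bucket/example.pdf")
#     for doc in parser.lazy_parse(blob):
#         print(doc.metadata["page"])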
class DocumentIntelligenceParser(BaseBlobParser):
    """Loads a PDF with Azure Document Intelligence
    (formerly Form Recognizer) and chunks at character level."""
    def __init__(self, client: Any, model: str):
        warnings.warn(
            "langchain_community.document_loaders.parsers.pdf.DocumentIntelligenceParser "
            "and langchain_community.document_loaders.pdf.DocumentIntelligenceLoader"
            " are deprecated. Please upgrade to "
            "langchain_community.document_loaders.DocumentIntelligenceLoader "
            "for any file parsing purpose using Azure Document Intelligence "
            "service."
        )
        self.client = client
        self.model = model