"""Module contains common parsers for PDFs."""
from __future__ import annotations
import warnings
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterable,
Iterator,
Mapping,
Optional,
Sequence,
Union,
)
from urllib.parse import urlparse
import numpy as np
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
if TYPE_CHECKING:
import fitz.fitz
import pdfminer.layout
import pdfplumber.page
import pypdf._page
import pypdfium2._helpers.page
from pypdf import PageObject
from textractor.data.text_linearization_config import TextLinearizationConfig
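# PDF stream filters, grouped by whether decoding preserves raw pixel data.
# For the lossless filters below, ``get_data()`` returns uncompressed pixel
# bytes that can be reshaped into a (height, width, channels) array; for the
# lossy filters (DCTDecode is JPEG, JPXDecode is JPEG 2000), it returns an
# encoded image file that the OCR step decodes directly.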
_PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"]
_PDF_FILTER_WITHOUT_LOSS = [
"LZWDecode",
"LZW",
"FlateDecode",
"Fl",
"ASCII85Decode",
"A85",
"ASCIIHexDecode",
"AHx",
"RunLengthDecode",
"RL",
"CCITTFaxDecode",
"CCF",
"JBIG2Decode",
]
class PyPDFParser(BaseBlobParser):
"""Load `PDF` using `pypdf`"""
def __init__(
self,
password: Optional[Union[str, bytes]] = None,
extract_images: bool = False,
*,
extraction_mode: str = "plain",
extraction_kwargs: Optional[Dict[str, Any]] = None,
):
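        """Initialize a parser based on ``pypdf``.
        Args:
            password: Password for an encrypted PDF, if any.
            extract_images: Whether to run OCR on images embedded in the PDF.
            extraction_mode: Text extraction mode passed to
                ``page.extract_text()`` when the installed pypdf is newer
                than 3.x.
            extraction_kwargs: Extra keyword arguments forwarded to
                ``page.extract_text()`` when the installed pypdf is newer
                than 3.x.
        """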
self.password = password
self.extract_images = extract_images
self.extraction_mode = extraction_mode
self.extraction_kwargs = extraction_kwargs or {}
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
"""Lazily parse the blob."""
try:
import pypdf
except ImportError:
raise ImportError(
"`pypdf` package not found, please install it with "
"`pip install pypdf`"
)
        def _extract_text_from_page(page: "PageObject") -> str:
            """Extract text from a page, using the call signature that
            matches the installed pypdf version."""
if pypdf.__version__.startswith("3"):
return page.extract_text()
else:
return page.extract_text(
extraction_mode=self.extraction_mode, # type: ignore[arg-type]
**self.extraction_kwargs, # type: ignore[arg-type]
)
        with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
            pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
            for page_number, page in enumerate(pdf_reader.pages):
                yield Document(
                    page_content=_extract_text_from_page(page=page)
                    + self._extract_images_from_page(page),
                    metadata={"source": blob.source, "page": page_number},  # type: ignore[attr-defined]
                )
def _extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
"""Extract images from page and get the text with RapidOCR."""
if not self.extract_images or "/XObject" not in page["/Resources"].keys(): # type: ignore[attr-defined]
return ""
xObject = page["/Resources"]["/XObject"].get_object() # type: ignore
images = []
for obj in xObject:
if xObject[obj]["/Subtype"] == "/Image":
if xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITHOUT_LOSS:
height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]
images.append(
np.frombuffer(xObject[obj].get_data(), dtype=np.uint8).reshape(
height, width, -1
)
)
elif xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITH_LOSS:
images.append(xObject[obj].get_data())
else:
warnings.warn("Unknown PDF Filter!")
return extract_from_images_with_rapidocr(images)
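# A minimal usage sketch for PyPDFParser. "example.pdf" is a placeholder
# file name, not part of the library:
#
#     from langchain_community.document_loaders.blob_loaders import Blob
#
#     parser = PyPDFParser(extract_images=False)
#     for doc in parser.lazy_parse(Blob.from_path("example.pdf")):
#         print(doc.metadata["page"], doc.page_content[:80])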
class PDFMinerParser(BaseBlobParser):
"""Parse `PDF` using `PDFMiner`."""
def __init__(self, extract_images: bool = False, *, concatenate_pages: bool = True):
"""Initialize a parser based on PDFMiner.
Args:
extract_images: Whether to extract images from PDF.
            concatenate_pages: If True, concatenate all PDF pages into a single
                document. Otherwise, return one document per page.
"""
self.extract_images = extract_images
self.concatenate_pages = concatenate_pages
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
"""Lazily parse the blob."""
if not self.extract_images:
try:
from pdfminer.high_level import extract_text
except ImportError:
raise ImportError(
"`pdfminer` package not found, please install it with "
"`pip install pdfminer.six`"
)
with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined]
if self.concatenate_pages:
text = extract_text(pdf_file_obj)
metadata = {"source": blob.source} # type: ignore[attr-defined]
yield Document(page_content=text, metadata=metadata)
else:
from pdfminer.pdfpage import PDFPage
pages = PDFPage.get_pages(pdf_file_obj)
for i, _ in enumerate(pages):
text = extract_text(pdf_file_obj, page_numbers=[i])
metadata = {"source": blob.source, "page": str(i)} # type: ignore[attr-defined]
yield Document(page_content=text, metadata=metadata)
else:
import io
from pdfminer.converter import PDFPageAggregator, TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
text_io = io.StringIO()
with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined]
pages = PDFPage.get_pages(pdf_file_obj)
rsrcmgr = PDFResourceManager()
device_for_text = TextConverter(rsrcmgr, text_io, laparams=LAParams())
device_for_image = PDFPageAggregator(rsrcmgr, laparams=LAParams())
interpreter_for_text = PDFPageInterpreter(rsrcmgr, device_for_text)
interpreter_for_image = PDFPageInterpreter(rsrcmgr, device_for_image)
for i, page in enumerate(pages):
interpreter_for_text.process_page(page)
interpreter_for_image.process_page(page)
content = text_io.getvalue() + self._extract_images_from_page(
device_for_image.get_result()
)
text_io.truncate(0)
text_io.seek(0)
metadata = {"source": blob.source, "page": str(i)} # type: ignore[attr-defined]
yield Document(page_content=content, metadata=metadata)
def _extract_images_from_page(self, page: pdfminer.layout.LTPage) -> str:
"""Extract images from page and get the text with RapidOCR."""
import pdfminer
        def get_image(layout_object: Any) -> Any:
            if isinstance(layout_object, pdfminer.layout.LTImage):
                return layout_object
            if isinstance(layout_object, pdfminer.layout.LTContainer):
                # Recurse into every child of the container so that images
                # nested deeper in the layout tree are found.
                for child in layout_object:
                    image = get_image(child)
                    if image is not None:
                        return image
            return None
images = []
for img in filter(bool, map(get_image, page)):
img_filter = img.stream["Filter"]
if isinstance(img_filter, list):
filter_names = [f.name for f in img_filter]
else:
filter_names = [img_filter.name]
without_loss = any(
name in _PDF_FILTER_WITHOUT_LOSS for name in filter_names
)
with_loss = any(name in _PDF_FILTER_WITH_LOSS for name in filter_names)
non_matching = {name for name in filter_names} - {
*_PDF_FILTER_WITHOUT_LOSS,
*_PDF_FILTER_WITH_LOSS,
}
if without_loss and with_loss:
warnings.warn(
"Image has both lossy and lossless filters. Defaulting to lossless"
)
if non_matching:
warnings.warn(f"Unknown PDF Filter(s): {non_matching}")
if without_loss:
images.append(
np.frombuffer(img.stream.get_data(), dtype=np.uint8).reshape(
img.stream["Height"], img.stream["Width"], -1
)
)
elif with_loss:
images.append(img.stream.get_data())
return extract_from_images_with_rapidocr(images)
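# A minimal usage sketch for PDFMinerParser; "example.pdf" is a placeholder:
#
#     from langchain_community.document_loaders.blob_loaders import Blob
#
#     parser = PDFMinerParser(concatenate_pages=False)  # one Document per page
#     for doc in parser.lazy_parse(Blob.from_path("example.pdf")):
#         print(doc.metadata["page"], len(doc.page_content))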
class PyMuPDFParser(BaseBlobParser):
"""Parse `PDF` using `PyMuPDF`."""
def __init__(
self,
text_kwargs: Optional[Mapping[str, Any]] = None,
extract_images: bool = False,
) -> None:
"""Initialize the parser.
Args:
text_kwargs: Keyword arguments to pass to ``fitz.Page.get_text()``.
"""
self.text_kwargs = text_kwargs or {}
self.extract_images = extract_images
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
"""Lazily parse the blob."""
import fitz
with blob.as_bytes_io() as file_path: # type: ignore[attr-defined]
if blob.data is None: # type: ignore[attr-defined]
doc = fitz.open(file_path)
else:
doc = fitz.open(stream=file_path, filetype="pdf")
            for page in doc:
                yield Document(
                    page_content=self._get_page_content(doc, page, blob),
                    metadata=self._extract_metadata(doc, page, blob),
                )
def _get_page_content(
self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob
) -> str:
"""
Get the text of the page using PyMuPDF and RapidOCR and issue a warning
if it is empty.
"""
content = page.get_text(**self.text_kwargs) + self._extract_images_from_page(
doc, page
)
if not content:
            warnings.warn(
                f"Empty content on page {page.number} "
                f"of document {blob.source}"
            )
return content
def _extract_metadata(
self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob
) -> dict:
"""Extract metadata from the document and page."""
return dict(
{
"source": blob.source, # type: ignore[attr-defined]
"file_path": blob.source, # type: ignore[attr-defined]
"page": page.number,
"total_pages": len(doc),
},
**{
k: doc.metadata[k]
for k in doc.metadata
if isinstance(doc.metadata[k], (str, int))
},
)
def _extract_images_from_page(
self, doc: fitz.fitz.Document, page: fitz.fitz.Page
) -> str:
"""Extract images from page and get the text with RapidOCR."""
if not self.extract_images:
return ""
import fitz
img_list = page.get_images()
imgs = []
for img in img_list:
xref = img[0]
pix = fitz.Pixmap(doc, xref)
imgs.append(
np.frombuffer(pix.samples, dtype=np.uint8).reshape(
pix.height, pix.width, -1
)
)
return extract_from_images_with_rapidocr(imgs)
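# A minimal usage sketch for PyMuPDFParser. The ``sort`` flag is one of the
# options ``fitz.Page.get_text()`` accepts; "example.pdf" is a placeholder:
#
#     from langchain_community.document_loaders.blob_loaders import Blob
#
#     parser = PyMuPDFParser(text_kwargs={"sort": True})
#     for doc in parser.lazy_parse(Blob.from_path("example.pdf")):
#         print(doc.metadata["page"], "/", doc.metadata["total_pages"])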
class PyPDFium2Parser(BaseBlobParser):
"""Parse `PDF` with `PyPDFium2`."""
def __init__(self, extract_images: bool = False) -> None:
"""Initialize the parser."""
try:
import pypdfium2 # noqa:F401
except ImportError:
raise ImportError(
"pypdfium2 package not found, please install it with"
" `pip install pypdfium2`"
)
self.extract_images = extract_images
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
"""Lazily parse the blob."""
import pypdfium2
        # pypdfium2 is finicky about closing resources; closing them in the
        # wrong order can cause segfaults.
with blob.as_bytes_io() as file_path: # type: ignore[attr-defined]
pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
try:
for page_number, page in enumerate(pdf_reader):
text_page = page.get_textpage()
content = text_page.get_text_range()
text_page.close()
content += "\n" + self._extract_images_from_page(page)
page.close()
metadata = {"source": blob.source, "page": page_number} # type: ignore[attr-defined]
yield Document(page_content=content, metadata=metadata)
finally:
pdf_reader.close()
def _extract_images_from_page(self, page: pypdfium2._helpers.page.PdfPage) -> str:
"""Extract images from page and get the text with RapidOCR."""
if not self.extract_images:
return ""
import pypdfium2.raw as pdfium_c
images = list(page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,)))
images = list(map(lambda x: x.get_bitmap().to_numpy(), images))
return extract_from_images_with_rapidocr(images)
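# A minimal usage sketch for PyPDFium2Parser; "example.pdf" is a placeholder:
#
#     from langchain_community.document_loaders.blob_loaders import Blob
#
#     parser = PyPDFium2Parser(extract_images=False)
#     for doc in parser.lazy_parse(Blob.from_path("example.pdf")):
#         print(doc.metadata["page"], doc.page_content[:80])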
class PDFPlumberParser(BaseBlobParser):
"""Parse `PDF` with `PDFPlumber`."""
def __init__(
self,
text_kwargs: Optional[Mapping[str, Any]] = None,
dedupe: bool = False,
extract_images: bool = False,
) -> None:
"""Initialize the parser.
        Args:
            text_kwargs: Keyword arguments to pass to
                ``pdfplumber.Page.extract_text()``.
            dedupe: If True, remove duplicated characters from the extracted
                text to avoid doubled-character artifacts.
            extract_images: Whether to run OCR on images embedded in the PDF.
        """
self.text_kwargs = text_kwargs or {}
self.dedupe = dedupe
self.extract_images = extract_images
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
"""Lazily parse the blob."""
import pdfplumber
with blob.as_bytes_io() as file_path: # type: ignore[attr-defined]
doc = pdfplumber.open(file_path) # open document
            for page in doc.pages:
                yield Document(
                    page_content=self._process_page_content(page)
                    + "\n"
                    + self._extract_images_from_page(page),
                    metadata=dict(
                        {
                            "source": blob.source,  # type: ignore[attr-defined]
                            "file_path": blob.source,  # type: ignore[attr-defined]
                            "page": page.page_number - 1,
                            "total_pages": len(doc.pages),
                        },
                        **{
                            k: doc.metadata[k]
                            for k in doc.metadata
                            if type(doc.metadata[k]) in [str, int]
                        },
                    ),
                )
def _process_page_content(self, page: pdfplumber.page.Page) -> str:
"""Process the page content based on dedupe."""
if self.dedupe:
return page.dedupe_chars().extract_text(**self.text_kwargs)
return page.extract_text(**self.text_kwargs)
def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
"""Extract images from page and get the text with RapidOCR."""
if not self.extract_images:
return ""
images = []
for img in page.images:
if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
images.append(
np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
img["stream"]["Height"], img["stream"]["Width"], -1
)
)
elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS:
images.append(img["stream"].get_data())
else:
warnings.warn("Unknown PDF Filter!")
return extract_from_images_with_rapidocr(images)
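# A minimal usage sketch for PDFPlumberParser; "example.pdf" is a placeholder:
#
#     from langchain_community.document_loaders.blob_loaders import Blob
#
#     parser = PDFPlumberParser(dedupe=True)
#     for doc in parser.lazy_parse(Blob.from_path("example.pdf")):
#         print(doc.metadata["page"], doc.page_content[:80])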
class DocumentIntelligenceParser(BaseBlobParser):
"""Loads a PDF with Azure Document Intelligence
(formerly Form Recognizer) and chunks at character level."""
def __init__(self, client: Any, model: str):
warnings.warn(
"langchain_community.document_loaders.parsers.pdf.DocumentIntelligenceParser"
"and langchain_community.document_loaders.pdf.DocumentIntelligenceLoader"
" are deprecated. Please upgrade to "
"langchain_community.document_loaders.DocumentIntelligenceLoader "
"for any file parsing purpose using Azure Document Intelligence "
"service."
)
self.client = client
self.model = model
def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]: # type: ignore[valid-type]
for p in result.pages:
content = " ".join([line.content for line in p.lines])
d = Document(
page_content=content,
metadata={
"source": blob.source, # type: ignore[attr-defined]
"page": p.page_number,
},
)
yield d
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
"""Lazily parse the blob."""
with blob.as_bytes_io() as file_obj: # type: ignore[attr-defined]
poller = self.client.begin_analyze_document(self.model, file_obj)
result = poller.result()
docs = self._generate_docs(blob, result)
yield from docs
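# A minimal usage sketch for the deprecated DocumentIntelligenceParser. The
# client construction below assumes the ``azure-ai-formrecognizer`` SDK;
# endpoint, key, and file name are placeholders:
#
#     from azure.ai.formrecognizer import DocumentAnalysisClient
#     from azure.core.credentials import AzureKeyCredential
#     from langchain_community.document_loaders.blob_loaders import Blob
#
#     client = DocumentAnalysisClient(
#         endpoint="https://<resource>.cognitiveservices.azure.com/",
#         credential=AzureKeyCredential("<key>"),
#     )
#     parser = DocumentIntelligenceParser(client=client, model="prebuilt-document")
#     for doc in parser.lazy_parse(Blob.from_path("example.pdf")):
#         print(doc.metadata["page"], doc.page_content[:80])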