import json
import logging
import os
import re
import tempfile
import time
from abc import ABC
from io import StringIO
from pathlib import Path
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterator,
List,
Mapping,
Optional,
Sequence,
Union,
)
from urllib.parse import urlparse
import requests
from langchain_core.documents import Document
from langchain_core.utils import get_from_dict_or_env
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.dedoc import DedocBaseLoader
from langchain_community.document_loaders.parsers.pdf import (
AmazonTextractPDFParser,
DocumentIntelligenceParser,
PDFMinerParser,
PDFPlumberParser,
PyMuPDFParser,
PyPDFium2Parser,
PyPDFParser,
)
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
if TYPE_CHECKING:
from textractor.data.text_linearization_config import TextLinearizationConfig
# Name the logger after the module's import path rather than its filesystem
# path, so it takes part in the dotted logger hierarchy and can be configured
# through its parent package loggers.
logger = logging.getLogger(__name__)
[docs]
class UnstructuredPDFLoader(UnstructuredFileLoader):
    """Load `PDF` files using `Unstructured`.

    Two modes are available: "single" and "elements". In "single" mode the
    whole document comes back as one langchain Document object. In "elements"
    mode the unstructured library splits the document into elements such as
    Title and NarrativeText. Extra unstructured kwargs may be passed after
    ``mode`` to apply different unstructured settings.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredPDFLoader

    loader = UnstructuredPDFLoader(
        "example.pdf", mode="elements", strategy="fast",
    )
    docs = loader.load()

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf
    """

    def _get_elements(self) -> List:
        """Partition the PDF at ``self.file_path`` with ``partition_pdf``."""
        # Imported lazily so `unstructured` is only required when actually used.
        from unstructured.partition.pdf import partition_pdf

        partition_options = self.unstructured_kwargs
        elements = partition_pdf(filename=self.file_path, **partition_options)  # type: ignore[arg-type]
        return elements
[docs]
class BasePDFLoader(BaseLoader, ABC):
    """Base Loader class for `PDF` files.

    If the file is a web path, it will download it to a temporary file, use it, then
    clean up the temporary file after completion.
    """

    def __init__(self, file_path: Union[str, Path], *, headers: Optional[Dict] = None):
        """Initialize with a file path.

        Args:
            file_path: Either a local, S3 or web path to a PDF file.
            headers: Headers to use for GET request to download a file from a web path.

        Raises:
            ValueError: If downloading a web path returns a non-200 status code,
                or if ``file_path`` is neither an existing file nor a URL.
        """
        self.file_path = str(file_path)
        self.web_path = None
        self.headers = headers
        if "~" in self.file_path:
            self.file_path = os.path.expanduser(self.file_path)

        # If the file is a web path or S3, download it to a temporary file, and use that
        if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
            self.temp_dir = tempfile.TemporaryDirectory()
            url_path = urlparse(self.file_path).path
            # Take the extension from the URL *path* only, so a query string or
            # fragment (e.g. "file.pdf?token=abc") never ends up in the name of
            # the temporary file.
            _, suffix = os.path.splitext(url_path)
            if self._is_s3_presigned_url(self.file_path):
                # For presigned S3 URLs the last path segment is used as the suffix.
                suffix = url_path.split("/")[-1]
            temp_pdf = os.path.join(self.temp_dir.name, f"tmp{suffix}")
            self.web_path = self.file_path
            # `s3://` URLs are not downloaded here; `file_path` keeps the URL.
            if not self._is_s3_url(self.file_path):
                r = requests.get(self.file_path, headers=self.headers)
                if r.status_code != 200:
                    raise ValueError(
                        "Check the url of your file; returned status code %s"
                        % r.status_code
                    )

                with open(temp_pdf, mode="wb") as f:
                    f.write(r.content)
                self.file_path = str(temp_pdf)
        elif not os.path.isfile(self.file_path):
            raise ValueError("File path %s is not a valid file or url" % self.file_path)

    def __del__(self) -> None:
        """Remove the temporary download directory, if one was created."""
        if hasattr(self, "temp_dir"):
            self.temp_dir.cleanup()

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Check if the url is valid (has both a scheme and a network location)."""
        try:
            parsed = urlparse(url)
        except ValueError:
            # Treat an unparsable URL as "not a URL", matching the S3 helpers below.
            return False
        return bool(parsed.netloc) and bool(parsed.scheme)

    @staticmethod
    def _is_s3_url(url: str) -> bool:
        """Check if the url uses the ``s3`` scheme and names a bucket."""
        try:
            result = urlparse(url)
        except ValueError:
            return False
        return bool(result.scheme == "s3" and result.netloc)

    @staticmethod
    def _is_s3_presigned_url(url: str) -> bool:
        """Check if the url is a presigned S3 url."""
        try:
            result = urlparse(url)
            return bool(re.search(r"\.s3\.amazonaws\.com$", result.netloc))
        except ValueError:
            return False

    @property
    def source(self) -> str:
        """Return the original web path when one was given, else the file path."""
        return self.web_path if self.web_path is not None else self.file_path
[docs]
class OnlinePDFLoader(BasePDFLoader):
    """Load online `PDF`."""

    def load(self) -> List[Document]:
        """Load documents by delegating to an `UnstructuredPDFLoader`."""
        local_path = str(self.file_path)
        delegate = UnstructuredPDFLoader(local_path)
        documents = delegate.load()
        return documents
[docs]
class PyPDFLoader(BasePDFLoader):
    """
    PyPDFLoader document loader integration

    Setup:
        Install ``langchain-community``.

        .. code-block:: bash

            pip install -U langchain-community

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import PyPDFLoader

            loader = PyPDFLoader(
                file_path = "./example_data/layout-parser-paper.pdf",
                password = "my-password",
                extract_images = True,
                # headers = None
                # extraction_mode = "plain",
                # extraction_kwargs = None,
            )

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            # async variant:
            # docs_lazy = await loader.alazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            LayoutParser : A Unified Toolkit for Deep
            Learning Based Document Image Analysis
            Zejiang Shen1( ), R
            {'source': './example_data/layout-parser-paper.pdf', 'page': 0}

    Async load:
        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            LayoutParser : A Unified Toolkit for Deep
            Learning Based Document Image Analysis
            Zejiang Shen1( ), R
            {'source': './example_data/layout-parser-paper.pdf', 'page': 0}
    """  # noqa: E501

    def __init__(
        self,
        file_path: str,
        password: Optional[Union[str, bytes]] = None,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
        *,
        extraction_mode: str = "plain",
        extraction_kwargs: Optional[Dict] = None,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: Path passed on to ``BasePDFLoader.__init__``.
            password: Forwarded to ``PyPDFParser``.
            headers: Forwarded to ``BasePDFLoader.__init__``.
            extract_images: Forwarded to ``PyPDFParser``.
            extraction_mode: Forwarded to ``PyPDFParser``.
            extraction_kwargs: Forwarded to ``PyPDFParser``.

        Raises:
            ImportError: If the ``pypdf`` package is not installed.
        """
        try:
            import pypdf  # noqa:F401
        except ImportError:
            raise ImportError(
                "pypdf package not found, please install it with `pip install pypdf`"
            )
        super().__init__(file_path, headers=headers)
        self.parser = PyPDFParser(
            password=password,
            extract_images=extract_images,
            extraction_mode=extraction_mode,
            extraction_kwargs=extraction_kwargs,
        )

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load given path as pages."""
        if self.web_path:
            # Read through a context manager so the file handle is closed
            # deterministically instead of being left to garbage collection.
            with open(self.file_path, "rb") as pdf_file:
                blob = Blob.from_data(pdf_file.read(), path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.parse(blob)
[docs]
class PyPDFium2Loader(BasePDFLoader):
    """Load `PDF` using `pypdfium2` and chunks at character level."""

    def __init__(
        self,
        file_path: str,
        *,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
    ):
        """Initialize with a file path.

        Args:
            file_path: Path passed on to ``BasePDFLoader.__init__``.
            headers: Forwarded to ``BasePDFLoader.__init__``.
            extract_images: Forwarded to ``PyPDFium2Parser``.
        """
        super().__init__(file_path, headers=headers)
        self.parser = PyPDFium2Parser(extract_images=extract_images)

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load given path as pages."""
        if self.web_path:
            # Read through a context manager so the file handle is closed
            # deterministically instead of being left to garbage collection.
            with open(self.file_path, "rb") as pdf_file:
                blob = Blob.from_data(pdf_file.read(), path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.parse(blob)
[docs]
class PyPDFDirectoryLoader(BaseLoader):
    """Load a directory with `PDF` files using `pypdf` and chunks at character level.

    Loader also stores page numbers in metadata.
    """

    def __init__(
        self,
        path: Union[str, Path],
        glob: str = "**/[!.]*.pdf",
        silent_errors: bool = False,
        load_hidden: bool = False,
        recursive: bool = False,
        extract_images: bool = False,
    ):
        """Store the directory to scan and the options used by `load`.

        Args:
            path: Directory that is searched for PDF files.
            glob: Glob pattern applied below ``path``.
            silent_errors: If True, log per-file errors as warnings instead of
                raising them.
            load_hidden: If True, also load files whose relative path has a
                component starting with ".".
            recursive: If True, use ``Path.rglob`` instead of ``Path.glob``.
            extract_images: Forwarded to each ``PyPDFLoader``.
        """
        self.path = path
        self.glob = glob
        self.silent_errors = silent_errors
        self.load_hidden = load_hidden
        self.recursive = recursive
        self.extract_images = extract_images

    @staticmethod
    def _is_visible(path: Path) -> bool:
        """Return False if any component of ``path`` starts with a dot."""
        for part in path.parts:
            if part.startswith("."):
                return False
        return True

    def load(self) -> List[Document]:
        """Load every matching PDF below ``self.path`` into Documents."""
        root = Path(self.path)
        collected = []
        if self.recursive:
            candidates = root.rglob(self.glob)
        else:
            candidates = root.glob(self.glob)

        for pdf_path in candidates:
            if not pdf_path.is_file():
                continue
            # Skip hidden entries unless the caller asked for them explicitly.
            if not self._is_visible(pdf_path.relative_to(root)) and not self.load_hidden:
                continue
            try:
                pages = PyPDFLoader(
                    str(pdf_path), extract_images=self.extract_images
                ).load()
                for page in pages:
                    page.metadata["source"] = str(pdf_path)
                collected.extend(pages)
            except Exception as e:
                if not self.silent_errors:
                    raise e
                logger.warning(e)
        return collected
[docs]
class PDFMinerLoader(BasePDFLoader):
    """Load `PDF` files using `PDFMiner`."""

    def __init__(
        self,
        file_path: str,
        *,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
        concatenate_pages: bool = True,
    ) -> None:
        """Initialize with file path.

        Args:
            file_path: Path passed on to ``BasePDFLoader.__init__``.
            headers: Forwarded to ``BasePDFLoader.__init__``.
            extract_images: Whether to extract images from PDF.
            concatenate_pages: If True, concatenate all PDF pages into a single
                document. Otherwise, return one document per page.

        Raises:
            ImportError: If ``pdfminer`` cannot be imported.
        """
        try:
            from pdfminer.high_level import extract_text  # noqa:F401
        except ImportError:
            raise ImportError(
                "`pdfminer` package not found, please install it with "
                "`pip install pdfminer.six`"
            )

        super().__init__(file_path, headers=headers)
        self.parser = PDFMinerParser(
            extract_images=extract_images, concatenate_pages=concatenate_pages
        )

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily load documents."""
        if self.web_path:
            # Read through a context manager so the file handle is closed
            # deterministically instead of being left to garbage collection.
            with open(self.file_path, "rb") as pdf_file:
                blob = Blob.from_data(pdf_file.read(), path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.parse(blob)
[docs]
class PDFMinerPDFasHTMLLoader(BasePDFLoader):
    """Load `PDF` files as HTML content using `PDFMiner`."""

    def __init__(self, file_path: str, *, headers: Optional[Dict] = None):
        """Initialize with a file path.

        Raises:
            ImportError: If ``pdfminer`` cannot be imported.
        """
        # Fail early, with an install hint, when pdfminer is unavailable.
        try:
            from pdfminer.high_level import extract_text_to_fp  # noqa:F401
        except ImportError:
            raise ImportError(
                "`pdfminer` package not found, please install it with "
                "`pip install pdfminer.six`"
            )

        super().__init__(file_path, headers=headers)

    def lazy_load(self) -> Iterator[Document]:
        """Yield one Document holding the PDF rendered as HTML."""
        from pdfminer.high_level import extract_text_to_fp
        from pdfminer.layout import LAParams
        from pdfminer.utils import open_filename

        html_buffer = StringIO()
        with open_filename(self.file_path, "rb") as pdf_stream:
            extract_text_to_fp(
                pdf_stream,
                html_buffer,
                codec="",
                laparams=LAParams(),
                output_type="html",
            )

        # Report the original web path as the source when there is one.
        if self.web_path is not None:
            origin = self.web_path
        else:
            origin = self.file_path
        html_text = html_buffer.getvalue()
        yield Document(page_content=html_text, metadata={"source": origin})
[docs]
class PyMuPDFLoader(BasePDFLoader):
    """Load `PDF` files using `PyMuPDF`."""

    def __init__(
        self,
        file_path: str,
        *,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
        **kwargs: Any,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: Path passed on to ``BasePDFLoader.__init__``.
            headers: Forwarded to ``BasePDFLoader.__init__``.
            extract_images: Forwarded to ``PyMuPDFParser`` at load time.
            **kwargs: Stored and passed to ``PyMuPDFParser`` as ``text_kwargs``.

        Raises:
            ImportError: If the ``fitz`` module (PyMuPDF) cannot be imported.
        """
        try:
            import fitz  # noqa:F401
        except ImportError:
            raise ImportError(
                "`PyMuPDF` package not found, please install it with "
                "`pip install pymupdf`"
            )
        super().__init__(file_path, headers=headers)
        self.extract_images = extract_images
        self.text_kwargs = kwargs

    def _lazy_load(self, **kwargs: Any) -> Iterator[Document]:
        """Parse the PDF lazily, merging runtime ``kwargs`` over the stored ones."""
        if kwargs:
            logger.warning(
                f"Received runtime arguments {kwargs}. Passing runtime args to `load`"
                f" is deprecated. Please pass arguments during initialization instead."
            )

        # Runtime kwargs take precedence over those given at initialization.
        text_kwargs = {**self.text_kwargs, **kwargs}
        parser = PyMuPDFParser(
            text_kwargs=text_kwargs, extract_images=self.extract_images
        )
        if self.web_path:
            # Read through a context manager so the file handle is closed
            # deterministically instead of being left to garbage collection.
            with open(self.file_path, "rb") as pdf_file:
                blob = Blob.from_data(pdf_file.read(), path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from parser.lazy_parse(blob)

    def load(self, **kwargs: Any) -> List[Document]:
        """Load all documents eagerly; runtime ``kwargs`` are deprecated."""
        return list(self._lazy_load(**kwargs))

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load documents using the arguments given at initialization."""
        yield from self._lazy_load()
# MathpixPDFLoader implementation taken largely from Daniel Gross's:
# https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21
[docs]
class MathpixPDFLoader(BasePDFLoader):
    """Load `PDF` files using `Mathpix` service."""

    def __init__(
        self,
        file_path: str,
        processed_file_format: str = "md",
        max_wait_time_seconds: int = 500,
        should_clean_pdf: bool = False,
        extra_request_data: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: a file for loading.
            processed_file_format: a format of the processed file. Default is "md".
            max_wait_time_seconds: a maximum time to wait for the response from
                the server. Default is 500.
            should_clean_pdf: a flag to clean the PDF file. Default is False.
            extra_request_data: Additional request data.
            **kwargs: additional keyword arguments. ``mathpix_api_key`` and
                ``mathpix_api_id`` are looked up here (or in the
                ``MATHPIX_API_KEY`` / ``MATHPIX_API_ID`` environment variables);
                the remaining entries are passed to ``BasePDFLoader.__init__``.
        """
        self.mathpix_api_key = get_from_dict_or_env(
            kwargs, "mathpix_api_key", "MATHPIX_API_KEY"
        )
        self.mathpix_api_id = get_from_dict_or_env(
            kwargs, "mathpix_api_id", "MATHPIX_API_ID"
        )

        # The base class isn't expecting these and doesn't collect **kwargs
        kwargs.pop("mathpix_api_key", None)
        kwargs.pop("mathpix_api_id", None)

        super().__init__(file_path, **kwargs)
        self.processed_file_format = processed_file_format
        self.extra_request_data = (
            extra_request_data if extra_request_data is not None else {}
        )
        self.max_wait_time_seconds = max_wait_time_seconds
        self.should_clean_pdf = should_clean_pdf

    @property
    def _mathpix_headers(self) -> Dict[str, str]:
        """Authentication headers sent with every Mathpix request."""
        return {"app_id": self.mathpix_api_id, "app_key": self.mathpix_api_key}

    @property
    def url(self) -> str:
        """Base URL of the Mathpix PDF endpoint."""
        return "https://api.mathpix.com/v3/pdf"

    @property
    def data(self) -> dict:
        """Form data for the upload request, with options JSON-encoded."""
        options = {
            "conversion_formats": {self.processed_file_format: True},
            **self.extra_request_data,
        }
        return {"options_json": json.dumps(options)}

    def send_pdf(self) -> str:
        """Upload the PDF to Mathpix.

        Returns:
            The ``pdf_id`` found in the response.

        Raises:
            ValueError: If the response contains an ``error`` or no ``pdf_id``.
        """
        with open(self.file_path, "rb") as f:
            files = {"file": f}
            response = requests.post(
                self.url, headers=self._mathpix_headers, files=files, data=self.data
            )
        response_data = response.json()
        if "error" in response_data:
            raise ValueError(f"Mathpix request failed: {response_data['error']}")
        if "pdf_id" in response_data:
            pdf_id = response_data["pdf_id"]
            return pdf_id
        else:
            raise ValueError("Unable to send PDF to Mathpix.")

    def wait_for_processing(self, pdf_id: str) -> None:
        """Wait for processing to complete.

        Polls the status endpoint, sleeping 5 seconds between attempts, for up
        to ``max_wait_time_seconds``.

        Args:
            pdf_id: a PDF id.

        Returns: None

        Raises:
            ValueError: If the response reports an error or an "error" status.
            TimeoutError: If processing does not complete within the allowed time.
        """
        url = self.url + "/" + pdf_id
        for _ in range(0, self.max_wait_time_seconds, 5):
            response = requests.get(url, headers=self._mathpix_headers)
            response_data = response.json()

            # This indicates an error with the request (e.g. auth problems)
            error = response_data.get("error", None)
            error_info = response_data.get("error_info", None)

            if error is not None:
                error_msg = f"Unable to retrieve PDF from Mathpix: {error}"

                if error_info is not None:
                    error_msg += f" ({error_info['id']})"

                raise ValueError(error_msg)

            status = response_data.get("status", None)

            if status == "completed":
                return
            elif status == "error":
                # This indicates an error with the PDF processing
                raise ValueError("Unable to retrieve PDF from Mathpix")
            else:
                print(f"Status: {status}, waiting for processing to complete")  # noqa: T201
                time.sleep(5)
        # Give the caller something actionable instead of a bare TimeoutError.
        raise TimeoutError(
            f"Mathpix did not finish processing PDF {pdf_id} within "
            f"{self.max_wait_time_seconds} seconds"
        )

    def get_processed_pdf(self, pdf_id: str) -> str:
        """Wait for processing, then download the converted document.

        Args:
            pdf_id: a PDF id.

        Returns:
            The response body decoded as UTF-8.
        """
        self.wait_for_processing(pdf_id)
        url = f"{self.url}/{pdf_id}.{self.processed_file_format}"
        response = requests.get(url, headers=self._mathpix_headers)
        return response.content.decode("utf-8")

    @staticmethod
    def clean_pdf(contents: str) -> str:
        """Clean the PDF file.

        Args:
            contents: a PDF file contents.

        Returns:
            The contents without lines starting with ``![]``, with
            ``\\section{`` rewritten to ``# `` and the escaping backslash
            removed before ``$``, ``%``, ``(`` and ``)``.
        """
        contents = "\n".join(
            [line for line in contents.split("\n") if not line.startswith("![]")]
        )
        # replace \section{Title} with # Title
        # NOTE(review): this removes every "}" in the text, not only the one
        # closing a \section{...}; confirm that this is intended.
        contents = contents.replace("\\section{", "# ").replace("}", "")
        # replace the "\" slash that Mathpix adds to escape $, %, (, etc.
        contents = (
            contents.replace(r"\$", "$")
            .replace(r"\%", "%")
            .replace(r"\(", "(")
            .replace(r"\)", ")")
        )
        return contents

    def load(self) -> List[Document]:
        """Upload the PDF, wait for conversion and return it as one Document."""
        pdf_id = self.send_pdf()
        contents = self.get_processed_pdf(pdf_id)
        if self.should_clean_pdf:
            contents = self.clean_pdf(contents)
        metadata = {"source": self.source, "file_path": self.source, "pdf_id": pdf_id}
        return [Document(page_content=contents, metadata=metadata)]
[docs]
class PDFPlumberLoader(BasePDFLoader):
    """Load `PDF` files using `pdfplumber`."""

    def __init__(
        self,
        file_path: str,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        dedupe: bool = False,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: Path passed on to ``BasePDFLoader.__init__``.
            text_kwargs: Forwarded to ``PDFPlumberParser``; defaults to ``{}``.
            dedupe: Forwarded to ``PDFPlumberParser``.
            headers: Forwarded to ``BasePDFLoader.__init__``.
            extract_images: Forwarded to ``PDFPlumberParser``.

        Raises:
            ImportError: If the ``pdfplumber`` package is not installed.
        """
        try:
            import pdfplumber  # noqa:F401
        except ImportError:
            raise ImportError(
                "pdfplumber package not found, please install it with "
                "`pip install pdfplumber`"
            )

        super().__init__(file_path, headers=headers)
        self.text_kwargs = text_kwargs or {}
        self.dedupe = dedupe
        self.extract_images = extract_images

    def load(self) -> List[Document]:
        """Load file."""
        parser = PDFPlumberParser(
            text_kwargs=self.text_kwargs,
            dedupe=self.dedupe,
            extract_images=self.extract_images,
        )
        if self.web_path:
            # Read through a context manager so the file handle is closed
            # deterministically instead of being left to garbage collection.
            with open(self.file_path, "rb") as pdf_file:
                blob = Blob.from_data(pdf_file.read(), path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        return parser.parse(blob)
[docs]
class DedocPDFLoader(DedocBaseLoader):
    """
    DedocPDFLoader document loader integration to load PDF files using `dedoc`.
    The file loader can automatically detect the correctness of a textual layer in the
    PDF document.
    Note that `__init__` method supports parameters that differ from ones of
    DedocBaseLoader.

    Setup:
        Install ``dedoc`` package.

        .. code-block:: bash

            pip install -U dedoc

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocPDFLoader

            loader = DedocPDFLoader(
                file_path="example.pdf",
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Parameters used for document parsing via `dedoc`
        (https://dedoc.readthedocs.io/en/latest/parameters/pdf_handling.html):

        with_attachments: enable attached files extraction
        recursion_deep_attachments: recursion level for attached files extraction,
            works only when with_attachments==True
        pdf_with_text_layer: type of handler for parsing, available options
            ["true", "false", "tabby", "auto", "auto_tabby" (default)]
        language: language of the document for PDF without a textual layer,
            available options ["eng", "rus", "rus+eng" (default)], the list of
            languages can be extended, please see
            https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
        pages: page slice to define the reading range for parsing
        is_one_column_document: detect number of columns for PDF without a textual
            layer, available options ["true", "false", "auto" (default)]
        document_orientation: fix document orientation (90, 180, 270 degrees) for PDF
            without a textual layer, available options ["auto" (default), "no_change"]
        need_header_footer_analysis: remove headers and footers from the output result
        need_binarization: clean pages background (binarize) for PDF without a textual
            layer
        need_pdf_table_analysis: parse tables for PDF without a textual layer
    """

    def _make_config(self) -> dict:
        """Build the config via dedoc's ``make_manager_pdf_config`` helper."""
        # Imported lazily so `dedoc` is only required when actually used.
        from dedoc.utils.langchain import make_manager_pdf_config

        manager_config = make_manager_pdf_config(
            file_path=self.file_path,
            parsing_params=self.parsing_parameters,
            split=self.split,
        )
        return manager_config
[docs]
class DocumentIntelligenceLoader(BasePDFLoader):
    """Load a PDF with Azure Document Intelligence"""

    def __init__(
        self,
        file_path: str,
        client: Any,
        model: str = "prebuilt-document",
        headers: Optional[Dict] = None,
    ) -> None:
        """
        Set up file processing with Azure Document Intelligence
        (formerly Form Recognizer).

        A DocumentIntelligenceParser is created here and later used to parse
        files through the Azure Document Intelligence API. The load method
        generates a Document node including metadata (source blob and page number)
        for each page.

        Parameters:
        -----------
        file_path : str
            The path to the file that needs to be parsed.
        client: Any
            A DocumentAnalysisClient to perform the analysis of the blob
        model : str
            The model name or ID to be used for form recognition in Azure.
        headers : Optional[Dict]
            Forwarded to ``BasePDFLoader.__init__``.

        Examples:
        ---------
        >>> obj = DocumentIntelligenceLoader(
        ...     file_path="path/to/file",
        ...     client=client,
        ...     model="prebuilt-document"
        ... )
        """
        azure_parser = DocumentIntelligenceParser(client=client, model=model)
        self.parser = azure_parser
        super().__init__(file_path, headers=headers)

    def load(self) -> List[Document]:
        """Return every page produced by `lazy_load` as a list."""
        pages = list(self.lazy_load())
        return pages

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Parse the file at ``self.file_path`` and yield its pages one by one."""
        pdf_blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        parsed_pages = self.parser.parse(pdf_blob)
        yield from parsed_pages
[docs]
class ZeroxPDFLoader(BasePDFLoader):
    """
    Document loader utilizing Zerox library:
    https://github.com/getomni-ai/zerox

    Zerox converts PDF document to series of images (page-wise) and
    uses vision-capable LLM model to generate Markdown representation.

    Zerox utilizes async operations. Therefore when using this loader
    inside Jupyter Notebook (or any environment running async)
    you will need to:
    ```python
        import nest_asyncio
        nest_asyncio.apply()
    ```
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        model: str = "gpt-4o-mini",
        **zerox_kwargs: Any,
    ) -> None:
        """
        Initialize the parser with arguments to be passed to the zerox function.
        Make sure to set necessary environment variables such as API key, endpoint, etc.
        Check zerox documentation for list of necessary environment variables for
        any given model.

        Args:
            file_path:
                Path or url of the pdf file
            model:
                Vision capable model to use. Defaults to "gpt-4o-mini".
                Hosted models are passed in format "<provider>/<model>"
                Examples: "azure/gpt-4o-mini", "vertex_ai/gemini-1.5-flash-001"
                          See more details in zerox documentation.
            **zerox_kwargs:
                Arguments specific to the zerox function.
                see detailed list of arguments here in zerox repository:
                https://github.com/getomni-ai/zerox/blob/main/py_zerox/pyzerox/core/zerox.py#L25
        """  # noqa: E501
        # The docstring must be the first statement of the function; placed
        # after this call it would be a no-op string and `__doc__` would be None.
        super().__init__(file_path=file_path)
        self.zerox_kwargs = zerox_kwargs
        self.model = model

    def lazy_load(self) -> Iterator[Document]:
        """
        Loads documents from pdf utilizing zerox library:
        https://github.com/getomni-ai/zerox

        Returns:
            Iterator[Document]: An iterator over parsed Document instances.
        """
        import asyncio

        from pyzerox import zerox

        # Directly call asyncio.run to execute zerox synchronously
        zerox_output = asyncio.run(
            zerox(file_path=self.file_path, model=self.model, **self.zerox_kwargs)
        )

        pages = zerox_output.pages
        if not pages:
            # Nothing was parsed, so there is nothing to yield.
            return

        # Convert zerox output to Document instances and yield them.
        # The page number of the last page is reported as the page count.
        num_pages = pages[-1].page
        for page in pages:
            yield Document(
                page_content=page.content,
                metadata={
                    "source": self.source,
                    "page": page.page,
                    "num_pages": num_pages,
                },
            )
# Legacy alias kept only for backwards compatibility: `PagedPDFSplitter` is the
# same class object as `PyPDFLoader`. New code should use `PyPDFLoader` instead.
PagedPDFSplitter = PyPDFLoader