import html
import json
import os
from abc import ABC, abstractmethod
from typing import (
Dict,
Iterator,
Optional,
Tuple,
Union,
)
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
class DedocBaseLoader(BaseLoader, ABC):
    """
    Base Loader that uses `dedoc` (https://dedoc.readthedocs.io).

    Loader enables extracting text, tables and attached files from the given file:
        * `Text` can be split by pages, `dedoc` tree nodes, textual lines
            (according to the `split` parameter).
        * `Attached files` (when with_attachments=True)
            are split according to the `split` parameter.
            For attachments, langchain Document object has an additional metadata
            field `type`="attachment".
        * `Tables` (when with_tables=True) are not split - each table corresponds
            to one langchain Document object.
            For tables, Document object has additional metadata fields
            `type`="table" and `text_as_html` with table HTML representation.
    """

    def __init__(
        self,
        file_path: str,
        *,
        split: str = "document",
        with_tables: bool = True,
        with_attachments: Union[str, bool] = False,
        recursion_deep_attachments: int = 10,
        pdf_with_text_layer: str = "auto_tabby",
        language: str = "rus+eng",
        pages: str = ":",
        is_one_column_document: str = "auto",
        document_orientation: str = "auto",
        need_header_footer_analysis: Union[str, bool] = False,
        need_binarization: Union[str, bool] = False,
        need_pdf_table_analysis: Union[str, bool] = True,
        delimiter: Optional[str] = None,
        encoding: Optional[str] = None,
    ) -> None:
        """
        Initialize with file path and parsing parameters.

        Args:
            file_path: path to the file for processing
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document text is returned as a single langchain
                    Document object (don't split)
                "page": split document text into pages (works for PDF, DJVU,
                    PPTX, PPT, ODP)
                "node": split document text into tree nodes (title nodes, list
                    item nodes, raw text nodes)
                "line": split document text into lines
            with_tables: add tables to the result - each table is returned as a
                single langchain Document object

            Parameters used for document parsing via `dedoc`
                (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer
                    and images, available options
                    ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF
                    documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270
                    degrees) for PDF without a textual layer and images,
                    available options ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the
                    output result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF
                    without a textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual
                    layer and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV

        Raises:
            ValueError: if `split` is not one of
                "document", "page", "node", "line".
        """
        # Everything except `file_path`, `split` and `with_tables` is forwarded
        # to `dedoc` as a parsing parameter. The dict is spelled out explicitly
        # (rather than derived from `locals()`) so that it cannot silently pick
        # up unrelated local variables.
        self.parsing_parameters = {
            "with_attachments": with_attachments,
            "recursion_deep_attachments": recursion_deep_attachments,
            "pdf_with_text_layer": pdf_with_text_layer,
            "language": language,
            "pages": pages,
            "is_one_column_document": is_one_column_document,
            "document_orientation": document_orientation,
            "need_header_footer_analysis": need_header_footer_analysis,
            "need_binarization": need_binarization,
            "need_pdf_table_analysis": need_pdf_table_analysis,
            "delimiter": delimiter,
            "encoding": encoding,
        }

        self.valid_split_values = {"document", "page", "node", "line"}
        if split not in self.valid_split_values:
            raise ValueError(
                f"Got {split} for `split`, but should be one of "
                f"`{self.valid_split_values}`"
            )
        self.split = split
        self.with_tables = with_tables
        self.file_path = file_path

        # Node splitting walks the document tree, every other mode works on the
        # flat ("linear") list of lines.
        structure_type = "tree" if self.split == "node" else "linear"
        self.parsing_parameters["structure_type"] = structure_type
        self.parsing_parameters["need_content_analysis"] = with_attachments

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily load documents.

        Parses `self.file_path` with a local `DedocManager` and yields the
        resulting langchain Documents according to `self.split`.

        Raises:
            ImportError: if the `dedoc` package is not installed.
        """
        import tempfile

        try:
            from dedoc import DedocManager
        except ImportError as e:
            raise ImportError(
                "`dedoc` package not found, please install it with `pip install dedoc`"
            ) from e

        dedoc_manager = DedocManager(manager_config=self._make_config())
        dedoc_manager.config["logger"].disabled = True

        # Attachments are extracted into a temporary directory that is removed
        # once all documents have been yielded.
        with tempfile.TemporaryDirectory() as tmpdir:
            document_tree = dedoc_manager.parse(
                file_path=self.file_path,
                parameters={**self.parsing_parameters, "attachments_dir": tmpdir},
            )
            yield from self._split_document(
                document_tree=document_tree.to_api_schema().dict(), split=self.split
            )

    @abstractmethod
    def _make_config(self) -> dict:
        """
        Make configuration for DedocManager according to the file extension and
        parsing parameters.
        """
        pass

    def _json2txt(self, paragraph: dict) -> str:
        """
        Get text (recursively) of the document tree node.

        The node's own text comes first, followed by the texts of its
        subparagraphs, all separated by newlines.
        """
        subparagraphs_text = "\n".join(
            self._json2txt(subparagraph)
            for subparagraph in paragraph["subparagraphs"]
        )
        if subparagraphs_text:
            return f"{paragraph['text']}\n{subparagraphs_text}"
        return paragraph["text"]

    def _parse_subparagraphs(
        self, document_tree: dict, document_metadata: dict
    ) -> Iterator[Document]:
        """
        Parse recursively document tree obtained by `dedoc`.

        Yields one Document per leaf node (a node without subparagraphs); the
        node metadata is merged over `document_metadata`.
        """
        subparagraphs = document_tree["subparagraphs"]
        if subparagraphs:
            for subparagraph in subparagraphs:
                yield from self._parse_subparagraphs(
                    document_tree=subparagraph, document_metadata=document_metadata
                )
        else:
            yield Document(
                page_content=document_tree["text"],
                metadata={**document_metadata, **document_tree["metadata"]},
            )

    def _split_by_pages(
        self, nodes: list, document_metadata: dict
    ) -> Iterator[Document]:
        """
        Group consecutive nodes sharing a `page_id` into one Document per page.

        Yields nothing when `nodes` is empty (a document without text nodes).
        """
        if not nodes:
            return

        page_id = nodes[0]["metadata"]["page_id"]
        page_parts = []
        for node in nodes:
            node_page_id = node["metadata"]["page_id"]
            if node_page_id != page_id:
                # The page changed: flush the text gathered for the previous one.
                yield Document(
                    page_content="".join(page_parts),
                    metadata={**document_metadata, "page_id": page_id},
                )
                page_id = node_page_id
                page_parts = []
            page_parts.append(self._json2txt(node))

        yield Document(
            page_content="".join(page_parts),
            metadata={**document_metadata, "page_id": page_id},
        )

    def _split_document(
        self,
        document_tree: dict,
        split: str,
        additional_metadata: Optional[dict] = None,
    ) -> Iterator[Document]:
        """
        Split document into parts according to the `split` parameter.

        Text parts are yielded first, then tables (when `self.with_tables` is
        set), then the parts of every attachment (recursively, with the same
        `split` value and the extra metadata field `type`="attachment").

        Args:
            document_tree: parsed document as a dictionary with the keys
                "metadata", "content" ("structure" and "tables") and
                "attachments"
            split: one of "document", "page", "node", "line"
            additional_metadata: extra fields merged over the document metadata

        Raises:
            ValueError: if `split` is not a supported value.
        """
        document_metadata = document_tree["metadata"]
        if additional_metadata:
            document_metadata = {**document_metadata, **additional_metadata}

        if split == "document":
            text = self._json2txt(paragraph=document_tree["content"]["structure"])
            yield Document(page_content=text, metadata=document_metadata)

        elif split == "page":
            yield from self._split_by_pages(
                nodes=document_tree["content"]["structure"]["subparagraphs"],
                document_metadata=document_metadata,
            )

        elif split == "line":
            for node in document_tree["content"]["structure"]["subparagraphs"]:
                yield Document(
                    page_content=self._json2txt(node),
                    metadata={**document_metadata, **node["metadata"]},
                )

        elif split == "node":
            yield from self._parse_subparagraphs(
                document_tree=document_tree["content"]["structure"],
                document_metadata=document_metadata,
            )

        else:
            raise ValueError(
                f"Got {split} for `split`, but should be one of "
                f"`{self.valid_split_values}`"
            )

        if self.with_tables:
            for table in document_tree["content"]["tables"]:
                table_text, table_html = self._get_table(table)
                yield Document(
                    page_content=table_text,
                    metadata={
                        **table["metadata"],
                        "type": "table",
                        "text_as_html": table_html,
                    },
                )

        for attachment in document_tree["attachments"]:
            # Attachments are split the same way as the document that holds them.
            yield from self._split_document(
                document_tree=attachment,
                split=split,
                additional_metadata={"type": "attachment"},
            )

    def _get_table(self, table: dict) -> Tuple[str, str]:
        """
        Get text and HTML representation of the table.

        Returns:
            A `(text, html)` tuple. In the text form every cell is followed by
            a tab and every row by a newline; in the HTML form the cell text is
            escaped and cells flagged as invisible are hidden with
            `display: none`.
        """
        text_rows = []
        html_parts = [
            '<table border="1" style="border-collapse: collapse; width: 100%;'
            '">\n<tbody>\n'
        ]

        for row in table["cells"]:
            text_cells = []
            html_parts.append("<tr>\n")
            for cell in row:
                line_texts = [line["text"] for line in cell["lines"]]
                text_cells.append(" ".join(line_texts) + "\t")

                cell_html_text = html.escape("\n".join(line_texts))
                hidden_style = ' style="display: none" ' if cell["invisible"] else ""
                colspan = cell["colspan"]
                rowspan = cell["rowspan"]
                html_parts.append(
                    f'<td{hidden_style} colspan="{colspan}" rowspan='
                    f'"{rowspan}">{cell_html_text}</td>\n'
                )
            html_parts.append("</tr>\n")
            text_rows.append("".join(text_cells) + "\n")

        html_parts.append("</tbody>\n</table>")
        return "".join(text_rows), "".join(html_parts)
class DedocFileLoader(DedocBaseLoader):
    """
    Document loader that parses local files with the `dedoc` library.

    The file type is detected automatically (the file must have the correct
    extension). The list of supported file types is given at
    https://dedoc.readthedocs.io/en/latest/index.html#id1.
    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        Install ``dedoc`` package.

        .. code-block:: bash

            pip install -U dedoc

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocFileLoader

            loader = DedocFileLoader(
                file_path="example.pdf",
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    """

    def _make_config(self) -> dict:
        """Build the DedocManager configuration via `dedoc`'s langchain helper."""
        from dedoc.utils.langchain import make_manager_config

        config_kwargs = {
            "file_path": self.file_path,
            "parsing_params": self.parsing_parameters,
            "split": self.split,
        }
        return make_manager_config(**config_kwargs)
class DedocAPIFileLoader(DedocBaseLoader):
    """
    Load files using `dedoc` API.

    The file loader automatically detects the file type (even with the wrong
    extension). By default, the loader makes a call to the locally hosted `dedoc`
    API. More information about `dedoc` API can be found in `dedoc` documentation:
    https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api.html

    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        You don't need to install `dedoc` library for using this loader.
        Instead, the `dedoc` API needs to be run.
        You may use Docker container for this purpose.
        Please see `dedoc` documentation for more details:
        https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker

        .. code-block:: bash

            docker pull dedocproject/dedoc
            docker run -p 1231:1231 dedocproject/dedoc

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocAPIFileLoader

            loader = DedocAPIFileLoader(
                file_path="example.pdf",
                # url=...,
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    """

    def __init__(
        self,
        file_path: str,
        *,
        url: str = "http://0.0.0.0:1231",
        split: str = "document",
        with_tables: bool = True,
        with_attachments: Union[str, bool] = False,
        recursion_deep_attachments: int = 10,
        pdf_with_text_layer: str = "auto_tabby",
        language: str = "rus+eng",
        pages: str = ":",
        is_one_column_document: str = "auto",
        document_orientation: str = "auto",
        need_header_footer_analysis: Union[str, bool] = False,
        need_binarization: Union[str, bool] = False,
        need_pdf_table_analysis: Union[str, bool] = True,
        delimiter: Optional[str] = None,
        encoding: Optional[str] = None,
    ) -> None:
        """Initialize with file path, API url and parsing parameters.

        Args:
            file_path: path to the file for processing
            url: URL to call `dedoc` API
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document is returned as a single langchain Document
                    object (don't split)
                "page": split document into pages (works for PDF, DJVU, PPTX,
                    PPT, ODP)
                "node": split document into tree nodes (title nodes, list item
                    nodes, raw text nodes)
                "line": split document into lines
            with_tables: add tables to the result - each table is returned as a
                single langchain Document object

            Parameters used for document parsing via `dedoc`
                (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer
                    and images, available options
                    ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF
                    documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270
                    degrees) for PDF without a textual layer and images,
                    available options ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the
                    output result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF
                    without a textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual
                    layer and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV
        """
        super().__init__(
            file_path=file_path,
            split=split,
            with_tables=with_tables,
            with_attachments=with_attachments,
            recursion_deep_attachments=recursion_deep_attachments,
            pdf_with_text_layer=pdf_with_text_layer,
            language=language,
            pages=pages,
            is_one_column_document=is_one_column_document,
            document_orientation=document_orientation,
            need_header_footer_analysis=need_header_footer_analysis,
            need_binarization=need_binarization,
            need_pdf_table_analysis=need_pdf_table_analysis,
            delimiter=delimiter,
            encoding=encoding,
        )
        self.url = url
        # The response body is decoded with `json.loads` in `_send_file`.
        self.parsing_parameters["return_format"] = "json"

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load documents.

        Uploads the file to the `dedoc` API and yields the resulting langchain
        Documents according to `self.split`.

        Raises:
            ValueError: if the API answers with a status code other than 200.
        """
        doc_tree = self._send_file(
            url=self.url, file_path=self.file_path, parameters=self.parsing_parameters
        )

        yield from self._split_document(document_tree=doc_tree, split=self.split)

    def _make_config(self) -> dict:
        """Return an empty config: no local DedocManager is created here."""
        return {}

    def _send_file(
        self, url: str, file_path: str, parameters: dict
    ) -> Dict[str, Union[list, dict, str]]:
        """Send POST-request to `dedoc` API and return the results.

        Args:
            url: base URL of the `dedoc` API (a trailing slash is tolerated)
            file_path: path to the file to upload
            parameters: parsing parameters sent as form fields

        Returns:
            The response body decoded from JSON.

        Raises:
            ValueError: if the response status code is not 200.
        """
        import requests

        file_name = os.path.basename(file_path)
        # Strip trailing slashes so that "http://host:1231/" does not turn into
        # "http://host:1231//upload".
        upload_url = f"{url.rstrip('/')}/upload"

        with open(file_path, "rb") as file:
            files = {"file": (file_name, file)}
            r = requests.post(upload_url, files=files, data=parameters)

        if r.status_code != 200:
            raise ValueError(f"Error during file handling: {r.content.decode()}")

        result = json.loads(r.content.decode())
        return result