Source code for langchain_community.document_loaders.dedoc

import html
import json
import os
from abc import ABC, abstractmethod
from typing import (
    Dict,
    Iterator,
    Optional,
    Tuple,
    Union,
)

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader


class DedocBaseLoader(BaseLoader, ABC):
    """
    Base Loader that uses `dedoc` (https://dedoc.readthedocs.io).

    Loader enables extracting text, tables and attached files from the given file:
        * `Text` can be split by pages, `dedoc` tree nodes, textual lines
          (according to the `split` parameter).
        * `Attached files` (when with_attachments=True)
          are split according to the `split` parameter.
          For attachments, langchain Document object has an additional metadata field
          `type`="attachment".
        * `Tables` (when with_tables=True) are not split -
          each table corresponds to one langchain Document object.
          For tables, Document object has additional metadata fields `type`="table"
          and `text_as_html` with table HTML representation.
    """
    def __init__(
        self,
        file_path: str,
        *,
        split: str = "document",
        with_tables: bool = True,
        with_attachments: Union[str, bool] = False,
        recursion_deep_attachments: int = 10,
        pdf_with_text_layer: str = "auto_tabby",
        language: str = "rus+eng",
        pages: str = ":",
        is_one_column_document: str = "auto",
        document_orientation: str = "auto",
        need_header_footer_analysis: Union[str, bool] = False,
        need_binarization: Union[str, bool] = False,
        need_pdf_table_analysis: Union[str, bool] = True,
        delimiter: Optional[str] = None,
        encoding: Optional[str] = None,
    ) -> None:
        """
        Initialize with file path and parsing parameters.

        Args:
            file_path: path to the file for processing
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document text is returned as a single langchain Document
                    object (don't split)
                "page": split document text into pages
                    (works for PDF, DJVU, PPTX, PPT, ODP)
                "node": split document text into tree nodes
                    (title nodes, list item nodes, raw text nodes)
                "line": split document text into lines
            with_tables: add tables to the result - each table is returned as a single
                langchain Document object

            Parameters used for document parsing via `dedoc`
                (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer and
                    images, available options ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270 degrees)
                    for PDF without a textual layer and images, available options
                    ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the output
                    result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF without
                    a textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual layer
                    and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV
        """
        self.parsing_parameters = {
            key: value
            for key, value in locals().items()
            if key not in {"self", "file_path", "split", "with_tables"}
        }
        self.valid_split_values = {"document", "page", "node", "line"}
        if split not in self.valid_split_values:
            raise ValueError(
                f"Got {split} for `split`, but should be one of "
                f"`{self.valid_split_values}`"
            )
        self.split = split
        self.with_tables = with_tables
        self.file_path = file_path

        structure_type = "tree" if self.split == "node" else "linear"
        self.parsing_parameters["structure_type"] = structure_type
        self.parsing_parameters["need_content_analysis"] = with_attachments
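    # Illustrative note (not part of the original module): with default arguments,
    # `self.parsing_parameters` is built from `locals()` and then extended, so it
    # looks roughly like this:
    #
    #     {
    #         "with_attachments": False,
    #         "recursion_deep_attachments": 10,
    #         "pdf_with_text_layer": "auto_tabby",
    #         "language": "rus+eng",
    #         "pages": ":",
    #         "is_one_column_document": "auto",
    #         "document_orientation": "auto",
    #         "need_header_footer_analysis": False,
    #         "need_binarization": False,
    #         "need_pdf_table_analysis": True,
    #         "delimiter": None,
    #         "encoding": None,
    #         "structure_type": "linear",      # "tree" when split == "node"
    #         "need_content_analysis": False,  # mirrors with_attachments
    #     }
    #
    # `self`, `file_path`, `split` and `with_tables` are excluded by the dict
    # comprehension and stored as separate attributes instead.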
    def lazy_load(self) -> Iterator[Document]:
        """Lazily load documents."""
        import tempfile

        try:
            from dedoc import DedocManager
        except ImportError:
            raise ImportError(
                "`dedoc` package not found, please install it with `pip install dedoc`"
            )

        dedoc_manager = DedocManager(manager_config=self._make_config())
        dedoc_manager.config["logger"].disabled = True

        with tempfile.TemporaryDirectory() as tmpdir:
            document_tree = dedoc_manager.parse(
                file_path=self.file_path,
                parameters={**self.parsing_parameters, "attachments_dir": tmpdir},
            )

        yield from self._split_document(
            document_tree=document_tree.to_api_schema().dict(), split=self.split
        )
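    # Sketch (assumption inferred from the keys accessed below, simplified): the dict
    # produced by `document_tree.to_api_schema().dict()` and consumed by
    # `_split_document` has roughly this shape:
    #
    #     {
    #         "content": {
    #             "structure": {          # root tree node
    #                 "text": "...",
    #                 "metadata": {"page_id": 0, ...},
    #                 "subparagraphs": [  # child nodes of the same shape, recursive
    #                     {"text": "...", "metadata": {...}, "subparagraphs": [...]},
    #                 ],
    #             },
    #             "tables": [
    #                 {"metadata": {...}, "cells": [[{"lines": [...], ...}, ...], ...]},
    #             ],
    #         },
    #         "metadata": {"file_name": "...", "file_type": "...", ...},
    #         "attachments": [...],       # same top-level shape, handled recursively
    #     }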
    @abstractmethod
    def _make_config(self) -> dict:
        """
        Make configuration for DedocManager according to the file extension
        and parsing parameters.
        """
        pass

    def _json2txt(self, paragraph: dict) -> str:
        """Get text (recursively) of the document tree node."""
        subparagraphs_text = "\n".join(
            [
                self._json2txt(subparagraph)
                for subparagraph in paragraph["subparagraphs"]
            ]
        )
        text = (
            f"{paragraph['text']}\n{subparagraphs_text}"
            if subparagraphs_text
            else paragraph["text"]
        )
        return text

    def _parse_subparagraphs(
        self, document_tree: dict, document_metadata: dict
    ) -> Iterator[Document]:
        """Parse recursively document tree obtained by `dedoc`."""
        if len(document_tree["subparagraphs"]) > 0:
            for subparagraph in document_tree["subparagraphs"]:
                yield from self._parse_subparagraphs(
                    document_tree=subparagraph, document_metadata=document_metadata
                )
        else:
            yield Document(
                page_content=document_tree["text"],
                metadata={**document_metadata, **document_tree["metadata"]},
            )

    def _split_document(
        self,
        document_tree: dict,
        split: str,
        additional_metadata: Optional[dict] = None,
    ) -> Iterator[Document]:
        """Split document into parts according to the `split` parameter."""
        document_metadata = document_tree["metadata"]
        if additional_metadata:
            document_metadata = {**document_metadata, **additional_metadata}

        if split == "document":
            text = self._json2txt(paragraph=document_tree["content"]["structure"])
            yield Document(page_content=text, metadata=document_metadata)

        elif split == "page":
            nodes = document_tree["content"]["structure"]["subparagraphs"]
            page_id = nodes[0]["metadata"]["page_id"]
            page_text = ""

            for node in nodes:
                if node["metadata"]["page_id"] == page_id:
                    page_text += self._json2txt(node)
                else:
                    yield Document(
                        page_content=page_text,
                        metadata={**document_metadata, "page_id": page_id},
                    )
                    page_id = node["metadata"]["page_id"]
                    page_text = self._json2txt(node)

            yield Document(
                page_content=page_text,
                metadata={**document_metadata, "page_id": page_id},
            )

        elif split == "line":
            for node in document_tree["content"]["structure"]["subparagraphs"]:
                line_metadata = node["metadata"]
                yield Document(
                    page_content=self._json2txt(node),
                    metadata={**document_metadata, **line_metadata},
                )

        elif split == "node":
            yield from self._parse_subparagraphs(
                document_tree=document_tree["content"]["structure"],
                document_metadata=document_metadata,
            )

        else:
            raise ValueError(
                f"Got {split} for `split`, but should be one of "
                f"`{self.valid_split_values}`"
            )

        if self.with_tables:
            for table in document_tree["content"]["tables"]:
                table_text, table_html = self._get_table(table)
                yield Document(
                    page_content=table_text,
                    metadata={
                        **table["metadata"],
                        "type": "table",
                        "text_as_html": table_html,
                    },
                )

        for attachment in document_tree["attachments"]:
            yield from self._split_document(
                document_tree=attachment,
                split=self.split,
                additional_metadata={"type": "attachment"},
            )

    def _get_table(self, table: dict) -> Tuple[str, str]:
        """Get text and HTML representation of the table."""
        table_text = ""
        for row in table["cells"]:
            for cell in row:
                table_text += " ".join(line["text"] for line in cell["lines"])
                table_text += "\t"
            table_text += "\n"

        table_html = (
            '<table border="1" style="border-collapse: collapse; width: 100%;'
            '">\n<tbody>\n'
        )
        for row in table["cells"]:
            table_html += "<tr>\n"
            for cell in row:
                cell_text = "\n".join(line["text"] for line in cell["lines"])
                cell_text = html.escape(cell_text)
                table_html += "<td"
                if cell["invisible"]:
                    table_html += ' style="display: none" '
                table_html += (
                    f' colspan="{cell["colspan"]}" rowspan='
                    f'"{cell["rowspan"]}">{cell_text}</td>\n'
                )
            table_html += "</tr>\n"
        table_html += "</tbody>\n</table>"

        return table_text, table_html
class DedocFileLoader(DedocBaseLoader):
    """
    DedocFileLoader document loader integration to load files using `dedoc`.

    The file loader automatically detects the file type (with the correct extension).
    The list of supported file types is given at
    https://dedoc.readthedocs.io/en/latest/index.html#id1.
    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        Install ``dedoc`` package.

        .. code-block:: bash

            pip install -U dedoc

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocFileLoader

            loader = DedocFileLoader(
                file_path="example.pdf",
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    """

    def _make_config(self) -> dict:
        from dedoc.utils.langchain import make_manager_config

        return make_manager_config(
            file_path=self.file_path,
            parsing_params=self.parsing_parameters,
            split=self.split,
        )
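# Usage sketch (illustrative, not part of the original module): the hypothetical
# helper below shows how the `split`, `with_tables` and `with_attachments` options
# documented above surface in the returned Documents. It assumes `dedoc` is installed
# and that a file "example.pdf" exists in the working directory.
def _example_split_and_tables() -> None:
    from langchain_community.document_loaders import DedocFileLoader

    loader = DedocFileLoader(
        "example.pdf",
        split="page",           # one Document per page, with "page_id" in metadata
        with_tables=True,       # tables become extra Documents with type="table"
        with_attachments=True,  # attachments become Documents with type="attachment"
    )
    for doc in loader.lazy_load():
        kind = doc.metadata.get("type", "text")
        if kind == "table":
            print(doc.metadata["text_as_html"])  # HTML built by _get_table
        else:
            print(kind, doc.metadata.get("page_id"), doc.page_content[:80])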
class DedocAPIFileLoader(DedocBaseLoader):
    """
    Load files using `dedoc` API.

    The file loader automatically detects the file type (even with the wrong extension).
    By default, the loader makes a call to the locally hosted `dedoc` API.
    More information about `dedoc` API can be found in `dedoc` documentation:
    https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api.html

    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        You don't need to install `dedoc` library for using this loader.
        Instead, the `dedoc` API needs to be run.
        You may use Docker container for this purpose.
        Please see `dedoc` documentation for more details:
        https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker

        .. code-block:: bash

            docker pull dedocproject/dedoc
            docker run -p 1231:1231 dedocproject/dedoc

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocAPIFileLoader

            loader = DedocAPIFileLoader(
                file_path="example.pdf",
                # url=...,
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    """
    def __init__(
        self,
        file_path: str,
        *,
        url: str = "http://0.0.0.0:1231",
        split: str = "document",
        with_tables: bool = True,
        with_attachments: Union[str, bool] = False,
        recursion_deep_attachments: int = 10,
        pdf_with_text_layer: str = "auto_tabby",
        language: str = "rus+eng",
        pages: str = ":",
        is_one_column_document: str = "auto",
        document_orientation: str = "auto",
        need_header_footer_analysis: Union[str, bool] = False,
        need_binarization: Union[str, bool] = False,
        need_pdf_table_analysis: Union[str, bool] = True,
        delimiter: Optional[str] = None,
        encoding: Optional[str] = None,
    ) -> None:
        """Initialize with file path, API url and parsing parameters.

        Args:
            file_path: path to the file for processing
            url: URL to call `dedoc` API
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document is returned as a single langchain Document
                    object (don't split)
                "page": split document into pages
                    (works for PDF, DJVU, PPTX, PPT, ODP)
                "node": split document into tree nodes
                    (title nodes, list item nodes, raw text nodes)
                "line": split document into lines
            with_tables: add tables to the result - each table is returned as a single
                langchain Document object

            Parameters used for document parsing via `dedoc`
                (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer and
                    images, available options ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270 degrees)
                    for PDF without a textual layer and images, available options
                    ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the output
                    result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF without
                    a textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual layer
                    and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV
        """
        super().__init__(
            file_path=file_path,
            split=split,
            with_tables=with_tables,
            with_attachments=with_attachments,
            recursion_deep_attachments=recursion_deep_attachments,
            pdf_with_text_layer=pdf_with_text_layer,
            language=language,
            pages=pages,
            is_one_column_document=is_one_column_document,
            document_orientation=document_orientation,
            need_header_footer_analysis=need_header_footer_analysis,
            need_binarization=need_binarization,
            need_pdf_table_analysis=need_pdf_table_analysis,
            delimiter=delimiter,
            encoding=encoding,
        )
        self.url = url
        self.parsing_parameters["return_format"] = "json"
    def lazy_load(self) -> Iterator[Document]:
        """Lazily load documents."""
        doc_tree = self._send_file(
            url=self.url, file_path=self.file_path, parameters=self.parsing_parameters
        )
        yield from self._split_document(document_tree=doc_tree, split=self.split)
    def _make_config(self) -> dict:
        return {}

    def _send_file(
        self, url: str, file_path: str, parameters: dict
    ) -> Dict[str, Union[list, dict, str]]:
        """Send POST-request to `dedoc` API and return the results."""
        import requests

        file_name = os.path.basename(file_path)
        with open(file_path, "rb") as file:
            files = {"file": (file_name, file)}
            r = requests.post(f"{url}/upload", files=files, data=parameters)

        if r.status_code != 200:
            raise ValueError(f"Error during file handling: {r.content.decode()}")

        result = json.loads(r.content.decode())
        return result
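# Usage sketch (illustrative, not part of the original module): the hypothetical
# helper below shows how `_send_file` above maps onto a plain `requests` call against
# a locally running `dedoc` API container, and how the same result is obtained through
# the loader. It assumes the API is reachable at http://0.0.0.0:1231 and that a file
# "example.pdf" exists in the working directory.
def _example_api_loader() -> None:
    import requests

    from langchain_community.document_loaders import DedocAPIFileLoader

    # Equivalent raw call: POST the file to the `/upload` endpoint with the parsing
    # parameters passed as form data (this is what `_send_file` does internally).
    with open("example.pdf", "rb") as file:
        response = requests.post(
            "http://0.0.0.0:1231/upload",
            files={"file": ("example.pdf", file)},
            data={"return_format": "json", "pdf_with_text_layer": "auto_tabby"},
        )
    response.raise_for_status()

    # The loader wraps the same call and splits the returned tree into Documents.
    loader = DedocAPIFileLoader("example.pdf", url="http://0.0.0.0:1231", split="line")
    for doc in loader.lazy_load():
        print(doc.metadata.get("page_id"), doc.page_content)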