class DedocBaseLoader(BaseLoader, ABC):
    """
    Base Loader that uses `dedoc` (https://dedoc.readthedocs.io).

    Loader enables extracting text, tables and attached files from the given file:
        * `Text` can be split by pages, `dedoc` tree nodes, textual lines
            (according to the `split` parameter).
        * `Attached files` (when with_attachments=True)
            are split according to the `split` parameter.
            For attachments, langchain Document object has an additional metadata
            field `type`="attachment".
        * `Tables` (when with_tables=True) are not split - each table corresponds
            to one langchain Document object.
            For tables, Document object has additional metadata fields
            `type`="table" and `text_as_html` with table HTML representation.
    """

    def __init__(
        self,
        file_path: str,
        *,
        split: str = "document",
        with_tables: bool = True,
        with_attachments: Union[str, bool] = False,
        recursion_deep_attachments: int = 10,
        pdf_with_text_layer: str = "auto_tabby",
        language: str = "rus+eng",
        pages: str = ":",
        is_one_column_document: str = "auto",
        document_orientation: str = "auto",
        need_header_footer_analysis: Union[str, bool] = False,
        need_binarization: Union[str, bool] = False,
        need_pdf_table_analysis: Union[str, bool] = True,
        delimiter: Optional[str] = None,
        encoding: Optional[str] = None,
    ) -> None:
        """
        Initialize with file path and parsing parameters.

        Args:
            file_path: path to the file for processing
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document text is returned as a single langchain Document
                    object (don't split)
                "page": split document text into pages (works for PDF, DJVU, PPTX, PPT,
                    ODP)
                "node": split document text into tree nodes (title nodes, list item
                    nodes, raw text nodes)
                "line": split document text into lines
            with_tables: add tables to the result - each table is returned as a single
                langchain Document object

            Parameters used for document parsing via `dedoc`
                (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer and
                    images, available options ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270 degrees)
                    for PDF without a textual layer and images, available options
                    ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the output
                    result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF without a
                    textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual layer
                    and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV

        Raises:
            ValueError: if `split` is not one of "document", "page", "node", "line".
        """
        # Listed explicitly (instead of being harvested from `locals()`) so that a
        # local variable added to this method can never leak into the parameters.
        # `file_path`, `split` and `with_tables` are deliberately not included.
        self.parsing_parameters = {
            "with_attachments": with_attachments,
            "recursion_deep_attachments": recursion_deep_attachments,
            "pdf_with_text_layer": pdf_with_text_layer,
            "language": language,
            "pages": pages,
            "is_one_column_document": is_one_column_document,
            "document_orientation": document_orientation,
            "need_header_footer_analysis": need_header_footer_analysis,
            "need_binarization": need_binarization,
            "need_pdf_table_analysis": need_pdf_table_analysis,
            "delimiter": delimiter,
            "encoding": encoding,
        }
        self.valid_split_values = {"document", "page", "node", "line"}
        if split not in self.valid_split_values:
            raise ValueError(
                f"Got {split} for `split`, but should be one of "
                f"`{self.valid_split_values}`"
            )
        self.split = split
        self.with_tables = with_tables
        self.file_path = file_path

        # Only the "node" split walks a tree; every other mode uses a linear structure.
        structure_type = "tree" if self.split == "node" else "linear"
        self.parsing_parameters["structure_type"] = structure_type
        self.parsing_parameters["need_content_analysis"] = with_attachments

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily load documents.

        Parses `self.file_path` with a `DedocManager` configured by `_make_config()`
        and yields the resulting langchain Document objects.

        Raises:
            ImportError: if the `dedoc` package is not installed.
        """
        import tempfile

        try:
            from dedoc import DedocManager
        except ImportError as err:
            raise ImportError(
                "`dedoc` package not found, please install it with `pip install dedoc`"
            ) from err

        dedoc_manager = DedocManager(manager_config=self._make_config())
        dedoc_manager.config["logger"].disabled = True

        # The temporary directory is passed to dedoc as `attachments_dir`. The parsed
        # result is converted to a plain dict while the directory still exists, and
        # documents are yielded only afterwards, so no temporary directory is kept
        # open while the consumer iterates over the generator.
        with tempfile.TemporaryDirectory() as tmpdir:
            parsed_document = dedoc_manager.parse(
                file_path=self.file_path,
                parameters={**self.parsing_parameters, "attachments_dir": tmpdir},
            )
            document_tree = parsed_document.to_api_schema().dict()

        yield from self._split_document(document_tree=document_tree, split=self.split)

    @abstractmethod
    def _make_config(self) -> dict:
        """
        Make configuration for DedocManager according to the file extension and
        parsing parameters.
        """
        pass

    def _json2txt(self, paragraph: dict) -> str:
        """
        Get text (recursively) of the document tree node.

        The node's own text comes first, followed by the texts of its subparagraphs
        joined with newlines. If the subparagraphs contribute no text, only the node's
        own text is returned.
        """
        subparagraphs_text = "\n".join(
            self._json2txt(subparagraph) for subparagraph in paragraph["subparagraphs"]
        )
        if subparagraphs_text:
            return f"{paragraph['text']}\n{subparagraphs_text}"
        return paragraph["text"]

    def _parse_subparagraphs(
        self, document_tree: dict, document_metadata: dict
    ) -> Iterator[Document]:
        """
        Parse recursively document tree obtained by `dedoc`.

        Yields one Document per leaf node (a node without subparagraphs). The leaf's
        own metadata is merged over `document_metadata`.
        """
        subparagraphs = document_tree["subparagraphs"]
        if subparagraphs:
            for subparagraph in subparagraphs:
                yield from self._parse_subparagraphs(
                    document_tree=subparagraph, document_metadata=document_metadata
                )
        else:
            yield Document(
                page_content=document_tree["text"],
                metadata={**document_metadata, **document_tree["metadata"]},
            )

    def _split_document(
        self,
        document_tree: dict,
        split: str,
        additional_metadata: Optional[dict] = None,
    ) -> Iterator[Document]:
        """
        Split document into parts according to the `split` parameter.

        Args:
            document_tree: parsed document as a dict with the keys `metadata`,
                `content` (holding `structure` and `tables`) and `attachments`
            split: one of "document", "page", "node", "line"
            additional_metadata: extra metadata merged over the document metadata

        Yields:
            Text parts first, then tables (when `self.with_tables` is true), then the
            parts of every attachment, produced recursively with the same `split`.

        Raises:
            ValueError: if `split` is not a supported value.
        """
        document_metadata = document_tree["metadata"]
        if additional_metadata:
            document_metadata = {**document_metadata, **additional_metadata}

        structure = document_tree["content"]["structure"]

        if split == "document":
            yield Document(
                page_content=self._json2txt(paragraph=structure),
                metadata=document_metadata,
            )

        elif split == "page":
            nodes = structure["subparagraphs"]
            # A document without top-level nodes has no pages to emit; guarding here
            # avoids an IndexError on `nodes[0]`.
            if nodes:
                page_id = nodes[0]["metadata"]["page_id"]
                page_parts = []
                for node in nodes:
                    node_page_id = node["metadata"]["page_id"]
                    if node_page_id != page_id:
                        # Page boundary: flush what was collected for the previous page.
                        yield Document(
                            page_content="".join(page_parts),
                            metadata={**document_metadata, "page_id": page_id},
                        )
                        page_id = node_page_id
                        page_parts = []
                    page_parts.append(self._json2txt(node))
                yield Document(
                    page_content="".join(page_parts),
                    metadata={**document_metadata, "page_id": page_id},
                )

        elif split == "line":
            for node in structure["subparagraphs"]:
                yield Document(
                    page_content=self._json2txt(node),
                    metadata={**document_metadata, **node["metadata"]},
                )

        elif split == "node":
            yield from self._parse_subparagraphs(
                document_tree=structure,
                document_metadata=document_metadata,
            )

        else:
            raise ValueError(
                f"Got {split} for `split`, but should be one of "
                f"`{self.valid_split_values}`"
            )

        if self.with_tables:
            for table in document_tree["content"]["tables"]:
                table_text, table_html = self._get_table(table)
                # Table documents carry the table's own metadata only; the document
                # metadata is not merged in.
                yield Document(
                    page_content=table_text,
                    metadata={
                        **table["metadata"],
                        "type": "table",
                        "text_as_html": table_html,
                    },
                )

        for attachment in document_tree["attachments"]:
            # Propagate the `split` argument of this call (not `self.split`) so that
            # the recursion always honours the mode it was invoked with.
            yield from self._split_document(
                document_tree=attachment,
                split=split,
                additional_metadata={"type": "attachment"},
            )

    def _get_table(self, table: dict) -> Tuple[str, str]:
        """
        Get text and HTML representation of the table.

        Args:
            table: dict with a `cells` list of rows; every cell has `lines` (each with
                a `text`), `invisible`, `colspan` and `rowspan`

        Returns:
            A `(text, html)` tuple. In the text form every cell is followed by a tab
            and every row by a newline; lines inside a cell are joined with a space.
            In the HTML form lines inside a cell are joined with a newline and escaped.
        """
        text_rows = []
        for row in table["cells"]:
            row_text = "".join(
                " ".join(line["text"] for line in cell["lines"]) + "\t" for cell in row
            )
            text_rows.append(row_text + "\n")
        table_text = "".join(text_rows)

        html_parts = [
            '<table border="1" style="border-collapse: collapse; width: 100%;'
            '">\n<tbody>\n'
        ]
        for row in table["cells"]:
            html_parts.append("<tr>\n")
            for cell in row:
                cell_text = html.escape(
                    "\n".join(line["text"] for line in cell["lines"])
                )
                hidden = ' style="display: none" ' if cell["invisible"] else ""
                colspan = cell["colspan"]
                rowspan = cell["rowspan"]
                html_parts.append(
                    f'<td{hidden} colspan="{colspan}" rowspan='
                    f'"{rowspan}">{cell_text}</td>\n'
                )
            html_parts.append("</tr>\n")
        html_parts.append("</tbody>\n</table>")
        table_html = "".join(html_parts)

        return table_text, table_html
class DedocFileLoader(DedocBaseLoader):
    """
    DedocFileLoader document loader integration to load files using `dedoc`.

    The file loader automatically detects the file type (with the correct extension).
    The list of supported file types is given at
    https://dedoc.readthedocs.io/en/latest/index.html#id1.
    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        Install ``dedoc`` package.

        .. code-block:: bash

            pip install -U dedoc

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocFileLoader

            loader = DedocFileLoader(
                file_path="example.pdf",
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    """

    def _make_config(self) -> dict:
        """
        Make configuration for DedocManager.

        Delegates to `make_manager_config` from `dedoc.utils.langchain`, passing the
        loader's file path, parsing parameters and split mode.
        """
        # Function-scope import: `dedoc` is imported only when this method is called.
        from dedoc.utils.langchain import make_manager_config

        config_kwargs = {
            "file_path": self.file_path,
            "parsing_params": self.parsing_parameters,
            "split": self.split,
        }
        return make_manager_config(**config_kwargs)
class DedocAPIFileLoader(DedocBaseLoader):
    """
    Load files using `dedoc` API.
    The file loader automatically detects the file type (even with the wrong extension).
    By default, the loader makes a call to the locally hosted `dedoc` API.
    More information about `dedoc` API can be found in `dedoc` documentation:
        https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api.html

    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        You don't need to install `dedoc` library for using this loader.
        Instead, the `dedoc` API needs to be run.
        You may use Docker container for this purpose.
        Please see `dedoc` documentation for more details:
            https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker

        .. code-block:: bash

            docker pull dedocproject/dedoc
            docker run -p 1231:1231 dedocproject/dedoc

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocAPIFileLoader

            loader = DedocAPIFileLoader(
                file_path="example.pdf",
                # url=...,
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    """

    def __init__(
        self,
        file_path: str,
        *,
        url: str = "http://0.0.0.0:1231",
        split: str = "document",
        with_tables: bool = True,
        with_attachments: Union[str, bool] = False,
        recursion_deep_attachments: int = 10,
        pdf_with_text_layer: str = "auto_tabby",
        language: str = "rus+eng",
        pages: str = ":",
        is_one_column_document: str = "auto",
        document_orientation: str = "auto",
        need_header_footer_analysis: Union[str, bool] = False,
        need_binarization: Union[str, bool] = False,
        need_pdf_table_analysis: Union[str, bool] = True,
        delimiter: Optional[str] = None,
        encoding: Optional[str] = None,
    ) -> None:
        """Initialize with file path, API url and parsing parameters.

        Args:
            file_path: path to the file for processing
            url: URL to call `dedoc` API
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document is returned as a single langchain Document object
                    (don't split)
                "page": split document into pages (works for PDF, DJVU, PPTX, PPT, ODP)
                "node": split document into tree nodes (title nodes, list item nodes,
                    raw text nodes)
                "line": split document into lines
            with_tables: add tables to the result - each table is returned as a single
                langchain Document object

            Parameters used for document parsing via `dedoc`
                (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer and
                    images, available options ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270 degrees)
                    for PDF without a textual layer and images, available options
                    ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the output
                    result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF without a
                    textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual layer
                    and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV
        """
        super().__init__(
            file_path=file_path,
            split=split,
            with_tables=with_tables,
            with_attachments=with_attachments,
            recursion_deep_attachments=recursion_deep_attachments,
            pdf_with_text_layer=pdf_with_text_layer,
            language=language,
            pages=pages,
            is_one_column_document=is_one_column_document,
            document_orientation=document_orientation,
            need_header_footer_analysis=need_header_footer_analysis,
            need_binarization=need_binarization,
            need_pdf_table_analysis=need_pdf_table_analysis,
            delimiter=delimiter,
            encoding=encoding,
        )
        self.url = url
        # Added on top of the parameters collected by the base class.
        self.parsing_parameters["return_format"] = "json"

    def _make_config(self) -> dict:
        """Return an empty configuration dict."""
        return {}

    def _send_file(
        self, url: str, file_path: str, parameters: dict
    ) -> Dict[str, Union[list, dict, str]]:
        """
        Send POST-request to `dedoc` API and return the results.

        Args:
            url: base URL of the `dedoc` API; a trailing "/" is tolerated
            file_path: path to the file that is uploaded
            parameters: form fields sent together with the file

        Returns:
            The decoded JSON body of the response.

        Raises:
            ValueError: if the response status code is not 200.
        """
        import requests

        # Strip trailing slashes so that "http://host:1231/" does not turn into
        # "http://host:1231//upload".
        upload_url = f"{url.rstrip('/')}/upload"
        file_name = os.path.basename(file_path)

        with open(file_path, "rb") as file:
            files = {"file": (file_name, file)}
            r = requests.post(upload_url, files=files, data=parameters)

        if r.status_code != 200:
            raise ValueError(f"Error during file handling: {r.content.decode()}")

        result = json.loads(r.content.decode())
        return result