Source code for langchain_community.document_loaders.word_document
"""Loads word documents."""importosimporttempfilefromabcimportABCfrompathlibimportPathfromtypingimportList,Unionfromurllib.parseimporturlparseimportrequestsfromlangchain_core.documentsimportDocumentfromlangchain_community.document_loaders.baseimportBaseLoaderfromlangchain_community.document_loaders.unstructuredimportUnstructuredFileLoader
class Docx2txtLoader(BaseLoader, ABC):
    """Load `DOCX` file using `docx2txt` and chunks at character level.

    Defaults to check for local file, but if the file is a web path, it will
    download it to a temporary file, and use that, then clean up the temporary
    file after completion
    """

    def __init__(self, file_path: Union[str, Path]):
        """Initialize with file path.

        Args:
            file_path: Path of a local DOCX file (a ``~`` is expanded to the
                user's home directory) or a URL to download the file from.

        Raises:
            ValueError: If the URL download does not return HTTP 200, or if
                the path is neither an existing file nor a URL.
        """
        self.file_path = str(file_path)
        if "~" in self.file_path:
            self.file_path = os.path.expanduser(self.file_path)

        # If the file is a web path, download it to a temporary file, and use that
        if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
            r = requests.get(self.file_path)

            if r.status_code != 200:
                raise ValueError(
                    "Check the url of your file; returned status code %s"
                    % r.status_code
                )

            self.web_path = self.file_path
            self.temp_file = tempfile.NamedTemporaryFile()
            self.temp_file.write(r.content)
            # load() reopens the file by name, so the buffered bytes must be
            # pushed to disk first; otherwise the reader may see a truncated
            # (or empty) document.
            self.temp_file.flush()
            self.file_path = self.temp_file.name
        elif not os.path.isfile(self.file_path):
            raise ValueError("File path %s is not a valid file or url" % self.file_path)

    def __del__(self) -> None:
        """Close the temporary download, if one was created.

        Closing a ``NamedTemporaryFile`` created with the default arguments
        also removes it from disk.
        """
        # __init__ may have raised before temp_file was assigned.
        temp_file = getattr(self, "temp_file", None)
        if temp_file is not None:
            temp_file.close()

    def load(self) -> List[Document]:
        """Load given path as single page.

        Returns:
            A one-element list holding a ``Document`` whose content is the text
            extracted by ``docx2txt`` and whose ``source`` metadata is
            ``self.file_path`` (the temporary file path for downloaded files).
        """
        # Imported lazily so the module can be imported without docx2txt.
        import docx2txt

        return [
            Document(
                page_content=docx2txt.process(self.file_path),
                metadata={"source": self.file_path},
            )
        ]

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Check if the url is valid.

        A string counts as a URL when it parses with both a scheme and a
        network location.
        """
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)
class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
    """Load `Microsoft Word` file using `Unstructured`.

    Works with both .docx and .doc files.
    You can run the loader in one of two modes: "single" and "elements".
    If you use "single" mode, the document will be returned as a single
    langchain Document object. If you use "elements" mode, the unstructured
    library will split the document into elements such as Title and NarrativeText.
    You can pass in additional unstructured kwargs after mode to apply
    different unstructured settings.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredWordDocumentLoader

    loader = UnstructuredWordDocumentLoader(
        "example.docx", mode="elements", strategy="fast",
    )
    docs = loader.load()

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition-docx
    """

    def _get_elements(self) -> List:
        """Partition the Word file at ``self.file_path`` into elements.

        Legacy ``.doc`` files go to ``partition_doc``; everything else goes to
        ``partition_docx``. ``self.unstructured_kwargs`` is forwarded to the
        partitioner.

        Raises:
            ValueError: If the file is a ``.doc`` file and the installed
                unstructured version is older than 0.4.11.
        """
        import re

        from unstructured.__version__ import __version__ as __unstructured_version__
        from unstructured.file_utils.filetype import FileType, detect_filetype

        # Keep only the leading digits of each dotted component and stop at the
        # first component without any, so versions such as "0.10.15-dev3" or
        # "0.10.0.dev1" do not crash int(). A pre-release is therefore treated
        # like the release it precedes.
        version_parts = []
        for part in __unstructured_version__.split("."):
            leading_digits = re.match(r"\d+", part)
            if leading_digits is None:
                break
            version_parts.append(int(leading_digits.group()))
        unstructured_version = tuple(version_parts)

        # NOTE(MthwRobinson) - magic will raise an import error if the libmagic
        # system dependency isn't installed. If it's not installed, we'll just
        # check the file extension
        try:
            import magic  # noqa: F401

            is_doc = detect_filetype(self.file_path) == FileType.DOC
        except ImportError:
            _, extension = os.path.splitext(str(self.file_path))
            # Compare case-insensitively so "REPORT.DOC" is still recognised.
            is_doc = extension.lower() == ".doc"

        if is_doc and unstructured_version < (0, 4, 11):
            raise ValueError(
                f"You are on unstructured version {__unstructured_version__}. "
                "Partitioning .doc files is only supported in unstructured>=0.4.11. "
                "Please upgrade the unstructured package and try again."
            )

        if is_doc:
            from unstructured.partition.doc import partition_doc

            return partition_doc(filename=self.file_path, **self.unstructured_kwargs)
        else:
            from unstructured.partition.docx import partition_docx

            return partition_docx(filename=self.file_path, **self.unstructured_kwargs)