Source code for langchain_community.document_loaders.word_document
"""Loads word documents."""importosimporttempfilefromabcimportABCfrompathlibimportPathfromtypingimportAny,List,Unionfromurllib.parseimporturlparseimportrequestsfromlangchain_core.documentsimportDocumentfromlangchain_community.document_loaders.baseimportBaseLoaderfromlangchain_community.document_loaders.unstructuredimport(UnstructuredFileLoader,validate_unstructured_version,)
class Docx2txtLoader(BaseLoader, ABC):
    """Load a `DOCX` file using `docx2txt` as a single document.

    The path is first checked against the local filesystem. If it is not a
    local file but parses as a URL, the file is downloaded into a temporary
    file and that copy is loaded instead; the temporary file is removed when
    the loader is garbage collected.
    """

    def __init__(self, file_path: Union[str, Path]):
        """Initialize with file path.

        Args:
            file_path: Local path (``~`` is expanded) or URL of the DOCX file.

        Raises:
            ValueError: If the path is neither an existing file nor a URL, or
                if downloading the URL returns a status code other than 200.
        """
        self.file_path = str(file_path)
        # Kept untouched so `load` reports what the caller passed in, not the
        # expanded path or the temporary download location.
        self.original_file_path = self.file_path
        if "~" in self.file_path:
            self.file_path = os.path.expanduser(self.file_path)

        # If the file is a web path, download it to a temporary file, and use that
        if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
            r = requests.get(self.file_path)

            if r.status_code != 200:
                raise ValueError(
                    "Check the url of your file; returned status code %s"
                    % r.status_code
                )

            self.web_path = self.file_path
            self.temp_file = tempfile.NamedTemporaryFile()
            self.temp_file.write(r.content)
            # The file is reopened by name in `load`; without a flush the
            # buffered bytes may not be on disk yet and the reader would see a
            # truncated (or empty) document.
            self.temp_file.flush()
            self.file_path = self.temp_file.name
        elif not os.path.isfile(self.file_path):
            raise ValueError("File path %s is not a valid file or url" % self.file_path)

    def __del__(self) -> None:
        """Close the temporary download, if any, which also deletes it."""
        # `__init__` may have raised before `temp_file` was assigned.
        temp_file = getattr(self, "temp_file", None)
        if temp_file is not None:
            temp_file.close()

    def load(self) -> List[Document]:
        """Load given path as single page.

        Returns:
            A one-element list whose document holds the text extracted by
            `docx2txt`, with the originally supplied path as ``source``.
        """
        # Imported lazily so the module can be imported without docx2txt.
        import docx2txt

        return [
            Document(
                page_content=docx2txt.process(self.file_path),
                metadata={"source": self.original_file_path},
            )
        ]

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Check if the url is valid, i.e. has both a scheme and a netloc."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)
class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
    """Load `Microsoft Word` file using `Unstructured`.

    Works with both .docx and .doc files.
    You can run the loader in one of two modes: "single" and "elements".
    If you use "single" mode, the document will be returned as a single
    langchain Document object. If you use "elements" mode, the unstructured
    library will split the document into elements such as Title and
    NarrativeText. You can pass in additional unstructured kwargs after mode
    to apply different unstructured settings.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredWordDocumentLoader

    loader = UnstructuredWordDocumentLoader(
        "example.docx", mode="elements", strategy="fast",
    )
    docs = loader.load()

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition-docx
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        mode: str = "single",
        **unstructured_kwargs: Any,
    ):
        """Initialize with file path, mode and partitioning options.

        Args:
            file_path: The path to the Word file to load.
            mode: The mode to use when loading the file. "single" returns one
                document, "elements" returns one document per element.
                Default is "single". The parent loader is what validates this
                value and may accept further modes.
            **unstructured_kwargs: Any kwargs to pass to the unstructured.
        """
        file_path = str(file_path)
        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Partition the file with the `.doc` or `.docx` partitioner.

        Returns:
            The elements returned by ``partition_doc`` for legacy `.doc` files
            and by ``partition_docx`` otherwise.
        """
        from unstructured.file_utils.filetype import FileType, detect_filetype

        # NOTE(MthwRobinson) - magic will raise an import error if the libmagic
        # system dependency isn't installed. If it's not installed, we'll just
        # check the file extension
        try:
            import magic  # noqa: F401

            is_doc = detect_filetype(self.file_path) == FileType.DOC  # type: ignore[arg-type]
        except ImportError:
            _, extension = os.path.splitext(str(self.file_path))
            # Compare case-insensitively so "REPORT.DOC" is not sent to the
            # .docx partitioner.
            is_doc = extension.lower() == ".doc"

        if is_doc:
            # Presumably enforces a minimum unstructured version for .doc
            # support; confirm against validate_unstructured_version.
            validate_unstructured_version("0.4.11")

            from unstructured.partition.doc import partition_doc

            return partition_doc(filename=self.file_path, **self.unstructured_kwargs)  # type: ignore[arg-type]

        from unstructured.partition.docx import partition_docx

        return partition_docx(filename=self.file_path, **self.unstructured_kwargs)  # type: ignore[arg-type]