class MsWordParser(BaseBlobParser):
    """Parse Microsoft Word documents from a blob."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Parse a Microsoft Word document into a Document iterator.

        The blob's MIME type selects which ``unstructured`` partition function
        is used. The string forms of all returned elements are joined with
        blank lines into the content of a single Document.

        Args:
            blob: The blob to parse. Its ``mimetype`` must be one of the keys
                of the dispatch table below.

        Returns:
            An iterator that yields one Document whose ``page_content`` is the
            joined element text and whose metadata holds the blob's ``source``.

        Raises:
            ImportError: If the ``unstructured`` partition modules cannot be
                imported.
            ValueError: If the blob's MIME type has no registered partition
                function.
        """
        # Imported lazily so that ``unstructured`` is only needed when a Word
        # document is actually parsed.
        try:
            from unstructured.partition.doc import partition_doc
            from unstructured.partition.docx import partition_docx
        except ImportError as e:
            raise ImportError(
                "Could not import unstructured, please install with `pip install "
                "unstructured`."
            ) from e

        # Single source of truth for the supported MIME types: the keys of this
        # table are exactly the types the parser accepts.
        mime_type_parser = {
            "application/msword": partition_doc,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": (
                partition_docx
            ),
        }

        partition = mime_type_parser.get(blob.mimetype)  # type: ignore[attr-defined]
        if partition is None:
            raise ValueError("This blob type is not supported for this parser.")

        # Build the full text while the byte stream is open, so the stream can
        # be closed before control is handed back to the consumer.
        with blob.as_bytes_io() as word_document:  # type: ignore[attr-defined]
            elements = partition(file=word_document)
            text = "\n\n".join([str(el) for el in elements])

        metadata = {"source": blob.source}  # type: ignore[attr-defined]
        yield Document(page_content=text, metadata=metadata)