Source code for langchain_community.document_loaders.parsers.generic
"""Code for generic / auxiliary parsers.This module contains some logic to help assemble more sophisticated parsers."""fromtypingimportIterator,Mapping,Optionalfromlangchain_core.documentsimportDocumentfromlangchain_community.document_loaders.baseimportBaseBlobParserfromlangchain_community.document_loaders.blob_loaders.schemaimportBlob
class MimeTypeBasedParser(BaseBlobParser):
    """Blob parser that dispatches to another parser based on the blob's mime-type.

    Intended for simple pipelines in which the mime-type alone is enough to
    decide how a blob should be parsed. Register one parser per mime-type and,
    optionally, a fallback parser for every mime-type that is not registered.

    Example:

        .. code-block:: python

            from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser

            parser = MimeTypeBasedParser(
                handlers={
                    "application/pdf": ...,
                },
                fallback_parser=...,
            )
    """  # noqa: E501

    def __init__(
        self,
        handlers: Mapping[str, BaseBlobParser],
        *,
        fallback_parser: Optional[BaseBlobParser] = None,
    ) -> None:
        """Configure the mime-type dispatch table.

        Args:
            handlers: Mapping from a mime-type string to the parser whose
                ``lazy_parse`` is invoked for blobs of that mime-type.
            fallback_parser: Parser used for blobs whose mime-type is not a
                key of ``handlers``. When omitted, parsing such a blob raises
                a ValueError.
        """
        self.handlers = handlers
        self.fallback_parser = fallback_parser

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily yield the documents produced by the parser matching the blob.

        This is a generator, so the checks below (and any ValueError they
        raise) only run once iteration starts.

        Args:
            blob: The blob to parse; its ``mimetype`` selects the parser.

        Yields:
            The documents produced by the selected parser's ``lazy_parse``.

        Raises:
            ValueError: If the blob has no mime-type, or if its mime-type is
                not registered and no fallback parser was configured.
        """
        mimetype = blob.mimetype

        # Guard: without a mime-type there is nothing to dispatch on.
        if mimetype is None:
            raise ValueError(f"{blob} does not have a mimetype.")

        # A registered mime-type always wins over the fallback parser.
        if mimetype in self.handlers:
            yield from self.handlers[mimetype].lazy_parse(blob)
            return

        fallback = self.fallback_parser
        if fallback is None:
            raise ValueError(f"Unsupported mime type: {mimetype}")

        yield from fallback.lazy_parse(blob)