class BaseImageBlobParser(BaseBlobParser):
    """Abstract base class for parsing image blobs into text.

    Subclasses implement ``_analyze_image``; ``lazy_parse`` decodes the blob
    into a PIL image, delegates to ``_analyze_image`` and wraps the returned
    text in a ``Document``.
    """

    @abstractmethod
    def _analyze_image(self, img: "Image") -> str:
        """Analyze an image and extract its textual content.

        Args:
            img: The image to be analyzed.

        Returns:
            The extracted text content.
        """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse a blob and yield a Document with the parsed content.

        Blobs whose mimetype is ``application/x-npy`` are loaded with
        ``numpy.load`` and converted with ``Image.fromarray``; every other
        blob is opened directly with ``Image.open``.

        Args:
            blob: The blob to be parsed.

        Yields:
            A single Document whose ``page_content`` is the text returned by
            ``_analyze_image`` and whose metadata is the blob metadata with
            ``"source"`` set to ``blob.source``.

        Raises:
            ImportError: If the ``Pillow`` package is not installed.
        """
        # Only the Pillow import is guarded. Wrapping the whole parse would
        # re-label an ImportError raised by a subclass's ``_analyze_image``
        # (e.g. a missing OCR backend) as a missing-Pillow error.
        try:
            from PIL import Image as Img
        except ImportError as err:
            raise ImportError(
                "`Pillow` package not found, please install it with "
                "`pip install Pillow`"
            ) from err

        with blob.as_bytes_io() as buf:
            if blob.mimetype == "application/x-npy":
                img = Img.fromarray(numpy.load(buf))
            else:
                img = Img.open(buf)
            content = self._analyze_image(img)
            # Escape newlines so the extracted text stays on one log line.
            logger.debug("Image text: %s", content.replace("\n", "\\n"))
            yield Document(
                page_content=content,
                metadata={**blob.metadata, "source": blob.source},
            )
class RapidOCRBlobParser(BaseImageBlobParser):
    """Image parser that extracts text with the RapidOCR library.

    Attributes:
        ocr: The RapidOCR instance used for OCR; ``None`` until the first
            image is analyzed.
    """

    def __init__(self) -> None:
        """Initialize the parser without creating the OCR engine yet."""
        super().__init__()
        self.ocr = None

    def _analyze_image(self, img: "Image") -> str:
        """Run RapidOCR on an image and return the recognized text.

        The RapidOCR engine is created on first use and kept on ``self.ocr``
        for subsequent calls.

        Args:
            img: The image to be analyzed.

        Returns:
            The second field of every OCR result entry, joined with newlines
            and stripped; an empty string when the engine returns no result.

        Raises:
            ImportError: If ``rapidocr-onnxruntime`` cannot be imported.
        """
        if not self.ocr:
            try:
                from rapidocr_onnxruntime import RapidOCR

                self.ocr = RapidOCR()
            except ImportError:
                raise ImportError(
                    "`rapidocr-onnxruntime` package not found, please install it with "
                    "`pip install rapidocr-onnxruntime`"
                )

        recognized, _ = self.ocr(np.array(img))  # type: ignore
        if not recognized:
            return ""
        lines = [entry[1] for entry in recognized]
        return "\n".join(lines).strip()
class TesseractBlobParser(BaseImageBlobParser):
    """Parser for extracting text from images using the Tesseract OCR library.

    Attributes:
        langs: The language codes passed to Tesseract, joined with ``+``.
    """

    def __init__(
        self,
        *,
        langs: Iterable[str] = ("eng",),
    ) -> None:
        """Initialize the TesseractBlobParser.

        Args:
            langs: The languages to use for OCR. A single string is accepted
                and treated as one language spec rather than being split into
                its characters.
        """
        super().__init__()
        # ``str`` is itself an ``Iterable[str]``: ``list("eng")`` would give
        # ``["e", "n", "g"]`` and produce the invalid spec ``"e+n+g"``.
        self.langs = [langs] if isinstance(langs, str) else list(langs)

    def _analyze_image(self, img: "Image") -> str:
        """Analyze an image and extract text using Tesseract OCR.

        Args:
            img: The image to be analyzed.

        Returns:
            The text returned by ``pytesseract.image_to_string``, stripped of
            leading and trailing whitespace.

        Raises:
            ImportError: If the ``pytesseract`` package is not installed.
        """
        try:
            import pytesseract
        except ImportError as err:
            raise ImportError(
                "`pytesseract` package not found, please install it with "
                "`pip install pytesseract`"
            ) from err
        # Tesseract takes multiple languages as a single "+"-joined string.
        return pytesseract.image_to_string(img, lang="+".join(self.langs)).strip()
# Default prompt sent with each image by the LLM-based parser. The three
# fragments below are concatenated as-is; the first two end with a newline.
_PROMPT_IMAGES_TO_DESCRIPTION: str = "".join(
    (
        # Instruction 1: retrieval-oriented summary.
        "You are an assistant tasked with summarizing images for retrieval. "
        "1. These summaries will be embedded and used to retrieve the raw image. "
        "Give a concise summary of the image that is well optimized for retrieval\n",
        # Instruction 2: full text extraction.
        "2. extract all the text from the image. "
        "Do not exclude any content from the page.\n",
        # Output format constraints.
        "Format answer in markdown without explanatory text "
        "and without markdown delimiter ``` at the beginning. ",
    )
)
class LLMImageBlobParser(BaseImageBlobParser):
    """Parser for analyzing images using a language model (LLM).

    Attributes:
        model (BaseChatModel): The language model to use for analysis.
        prompt (str): The prompt to provide to the language model.
    """

    def __init__(
        self,
        *,
        model: BaseChatModel,
        prompt: str = _PROMPT_IMAGES_TO_DESCRIPTION,
    ) -> None:
        """Initialize the LLMImageBlobParser.

        Args:
            model: The language model to use for analysis.
            prompt: The prompt to provide to the language model.
        """
        super().__init__()
        self.model = model
        self.prompt = prompt

    def _analyze_image(self, img: "Image") -> str:
        """Analyze an image using the provided language model.

        The image is serialized as PNG, base64-encoded and sent to the model
        as a data URL next to the text prompt in a single ``HumanMessage``.

        Args:
            img: The image to be analyzed.

        Returns:
            The content of the model's reply.

        Raises:
            AssertionError: If the reply content is not a string.
        """
        image_bytes = io.BytesIO()
        img.save(image_bytes, format="PNG")
        img_base64 = base64.b64encode(image_bytes.getvalue()).decode("utf-8")
        msg = self.model.invoke(
            [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            # NOTE(review): ``format`` here resolves to the
                            # builtin function, so a ``{format}`` placeholder
                            # in a custom prompt would render its repr, and
                            # literal braces in a prompt must be doubled.
                            # Kept as-is to preserve behavior; confirm intent.
                            "text": self.prompt.format(format=format),
                        },
                        {
                            "type": "image_url",
                            # The payload is PNG-encoded above, so the data
                            # URL must declare image/png, not image/jpeg.
                            "image_url": {
                                "url": f"data:image/png;base64,{img_base64}"
                            },
                        },
                    ]
                )
            ]
        )
        result = msg.content
        assert isinstance(result, str)
        return result