Source code for langchain_community.document_loaders.parsers.doc_intelligence

import logging
from typing import Any, Iterator, List, Optional

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob

logger = logging.getLogger(__name__)


class AzureAIDocumentIntelligenceParser(BaseBlobParser):
    """Loads a PDF with Azure Document Intelligence (formerly Form Recognizer)."""
    def __init__(
        self,
        api_endpoint: str,
        api_key: str,
        api_version: Optional[str] = None,
        api_model: str = "prebuilt-layout",
        mode: str = "markdown",
        analysis_features: Optional[List[str]] = None,
    ):
        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.ai.documentintelligence.models import DocumentAnalysisFeature
        from azure.core.credentials import AzureKeyCredential

        kwargs = {}
        if api_version is not None:
            kwargs["api_version"] = api_version

        if analysis_features is not None:
            # Only a subset of the service's add-on capabilities is supported
            # here; warn (but proceed) when the caller requests anything else.
            _SUPPORTED_FEATURES = [
                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,
            ]

            analysis_features = [
                DocumentAnalysisFeature(feature) for feature in analysis_features
            ]
            if any(
                feature not in _SUPPORTED_FEATURES for feature in analysis_features
            ):
                logger.warning(
                    "The current supported features are: "
                    f"{[f.value for f in _SUPPORTED_FEATURES]}. "
                    "Using other features may result in unexpected behavior."
                )

        self.client = DocumentIntelligenceClient(
            endpoint=api_endpoint,
            credential=AzureKeyCredential(api_key),
            headers={"x-ms-useragent": "langchain-parser/1.0.0"},
            features=analysis_features,
            **kwargs,
        )
        self.api_model = api_model
        self.mode = mode
        assert self.mode in ["single", "page", "markdown"]
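    # A minimal instantiation sketch. The endpoint and key below are
    # placeholders (assumptions for illustration), not values from this
    # module; any Azure AI Document Intelligence resource would do:
    #
    #     parser = AzureAIDocumentIntelligenceParser(
    #         api_endpoint="https://<resource>.cognitiveservices.azure.com/",
    #         api_key="<api-key>",
    #         api_model="prebuilt-layout",
    #         mode="markdown",
    #     )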
    def _generate_docs_page(self, result: Any) -> Iterator[Document]:
        # One Document per analyzed page, with that page's lines joined
        # by single spaces and the page number kept in the metadata.
        for p in result.pages:
            content = " ".join([line.content for line in p.lines])

            d = Document(
                page_content=content,
                metadata={
                    "page": p.page_number,
                },
            )
            yield d

    def _generate_docs_single(self, result: Any) -> Iterator[Document]:
        # A single Document holding the whole result, with the raw
        # analysis result attached as metadata.
        yield Document(page_content=result.content, metadata=result.as_dict())
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        with blob.as_bytes_io() as file_obj:
            poller = self.client.begin_analyze_document(
                self.api_model,
                file_obj,
                content_type="application/octet-stream",
                output_content_format="markdown"
                if self.mode == "markdown"
                else "text",
            )
            result = poller.result()

            if self.mode in ["single", "markdown"]:
                yield from self._generate_docs_single(result)
            elif self.mode in ["page"]:
                yield from self._generate_docs_page(result)
            else:
                raise ValueError(f"Invalid mode: {self.mode}")
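    # Usage sketch for lazy_parse, assuming a local PDF; Blob.from_path is
    # the standard langchain_core constructor for file-backed blobs and
    # "sample.pdf" is a placeholder path:
    #
    #     docs = list(parser.lazy_parse(Blob.from_path("sample.pdf")))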
    def parse_url(self, url: str) -> Iterator[Document]:
        """Parse a document from a URL with Azure Document Intelligence."""
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest

        poller = self.client.begin_analyze_document(
            self.api_model,
            AnalyzeDocumentRequest(url_source=url),
            # content_type="application/octet-stream",
            output_content_format="markdown"
            if self.mode == "markdown"
            else "text",
        )
        result = poller.result()

        if self.mode in ["single", "markdown"]:
            yield from self._generate_docs_single(result)
        elif self.mode in ["page"]:
            yield from self._generate_docs_page(result)
        else:
            raise ValueError(f"Invalid mode: {self.mode}")
    def parse_bytes(self, bytes_source: bytes) -> Iterator[Document]:
        """Parse a document passed in as raw bytes."""
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest

        poller = self.client.begin_analyze_document(
            self.api_model,
            analyze_request=AnalyzeDocumentRequest(bytes_source=bytes_source),
            # content_type="application/octet-stream",
            output_content_format="markdown"
            if self.mode == "markdown"
            else "text",
        )
        result = poller.result()

        if self.mode in ["single", "markdown"]:
            yield from self._generate_docs_single(result)
        elif self.mode in ["page"]:
            yield from self._generate_docs_page(result)
        else:
            raise ValueError(f"Invalid mode: {self.mode}")
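# Usage sketches for the URL and bytes entry points. The URL and file name
# are placeholders; the URL must be reachable by the Azure service itself,
# since AnalyzeDocumentRequest(url_source=...) makes the service fetch it:
#
#     docs = list(parser.parse_url("https://example.com/sample.pdf"))
#
#     with open("sample.pdf", "rb") as f:
#         docs = list(parser.parse_bytes(f.read()))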