def parse_output(data: dict, output_format: OutputFormat) -> str:
    """Extract one rendering of an element from its ``content`` mapping.

    Args:
        data (dict): An element dictionary; its ``"content"`` entry is a
            mapping keyed by output format name.
        output_format (OutputFormat): Which rendering to return; one of
            ``"text"``, ``"html"`` or ``"markdown"``.

    Returns:
        str: The value stored under ``data["content"][output_format]``.

    Raises:
        ValueError: If ``output_format`` is not one of the known formats.
    """
    # Look up "content" first so a malformed element fails before the
    # format is even considered.
    renderings = data["content"]
    for known_format in ("text", "html", "markdown"):
        if output_format == known_format:
            return renderings[known_format]
    raise ValueError(f"Invalid output type: {output_format}")
def get_from_param_or_env(
    key: str,
    param: Optional[str] = None,
    env_key: Optional[str] = None,
    default: Optional[str] = None,
) -> str:
    """Resolve a setting from an explicit argument, the environment, or a default.

    Lookup order: ``param`` (when not ``None``), then a non-empty value of
    the environment variable ``env_key``, then ``default`` (when not
    ``None``).

    Args:
        key (str): Name of the setting, used only in the error message.
        param (Optional[str]): Explicitly supplied value.
        env_key (Optional[str]): Name of the environment variable to consult.
        default (Optional[str]): Fallback value.

    Returns:
        str: The first value found following the lookup order above.

    Raises:
        ValueError: If none of the three sources provides a value.
    """
    if param is not None:
        return param

    # An unset and an empty environment variable are both treated as absent.
    from_env = os.environ.get(env_key) if env_key else None
    if from_env:
        return from_env

    if default is not None:
        return default

    raise ValueError(
        f"Did not find {key}, please add an environment variable"
        f" `{env_key}` which contains it, or pass"
        f" `{key}` as a named parameter."
    )
class UpstageDocumentParseParser(BaseBlobParser):
    """Upstage Document Parse Parser.

    To use, you should have the environment variable `UPSTAGE_API_KEY`
    set with your API key or pass it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_upstage import UpstageDocumentParseParser

            loader = UpstageDocumentParseParser(split="page", output_format="text")
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        base_url: str = DOCUMENT_PARSE_BASE_URL,
        model: str = DOCUMENT_PARSE_DEFAULT_MODEL,
        split: SplitType = "none",
        ocr: OCR = "auto",
        output_format: OutputFormat = "html",
        coordinates: bool = True,
        base64_encoding: Optional[List[Category]] = None,
    ):
        """
        Initializes an instance of the Upstage class.

        Args:
            api_key (str, optional): The API key for accessing the Upstage API.
                                     Defaults to None, in which case it will be
                                     fetched from the environment variable
                                     `UPSTAGE_API_KEY`.
            base_url (str, optional): The base URL for accessing the Upstage API.
            model (str): The model to be used for the document parse.
                         Defaults to "document-parse".
            split (SplitType, optional): The type of splitting to be applied.
                                         Defaults to "none" (no splitting).
            ocr (OCRMode, optional): Extract text from images in the document
                                     using OCR. If the value is "force", OCR is
                                     used to extract text from an image. If the
                                     value is "auto", text is extracted from a
                                     PDF. (An error will occur if the value is
                                     "auto" and the input is NOT in PDF format)
            output_format (OutputFormat, optional): Format of the inference
                                                    results.
            coordinates (bool, optional): Whether to include the coordinates of
                                          the OCR in the output.
            base64_encoding (List[Category], optional): The category of the
                                                        elements to be encoded
                                                        in base64. Defaults to
                                                        an empty list.
        """
        self.api_key = get_from_param_or_env(
            "UPSTAGE_API_KEY",
            api_key,
            "UPSTAGE_API_KEY",
            os.environ.get("UPSTAGE_API_KEY"),
        )
        self.base_url = base_url
        self.model = model
        self.split = split
        self.ocr = ocr
        self.output_format = output_format
        self.coordinates = coordinates
        # Copy the argument so instances never share (or alias) one list.
        self.base64_encoding = (
            list(base64_encoding) if base64_encoding is not None else []
        )

    def _get_response(self, files: Dict) -> List:
        """
        Sends a POST request to the API endpoint with the provided files and
        returns the parsed elements.

        Args:
            files (dict): A dictionary containing the files to be sent in the
                          request.

        Returns:
            List: The value of the "elements" key of the JSON response, or an
                  empty list when that key is absent.

        Raises:
            ValueError: If there is an error in the API call. The original
                        exception is attached as ``__cause__``.
        """
        headers = {
            "Authorization": f"Bearer {self.api_key}",
        }
        # List-valued options are sent in their Python ``repr``-style string
        # form, e.g. "['html']".
        payload = {
            "ocr": self.ocr,
            "model": self.model,
            "output_formats": f"['{self.output_format}']",
            "coordinates": self.coordinates,
            "base64_encoding": f"{self.base64_encoding}",
        }
        try:
            response = requests.post(
                self.base_url,
                headers=headers,
                files=files,
                data=payload,
            )
            response.raise_for_status()
            return response.json().get("elements", [])
        except requests.HTTPError as e:
            raise ValueError(f"HTTP error: {e.response.text}") from e
        except json.JSONDecodeError as e:
            # Must precede RequestException: recent versions of requests raise
            # a JSONDecodeError that subclasses both, and a decode failure
            # should not be reported as a transport failure.
            raise ValueError(f"Failed to decode JSON response: {e}") from e
        except requests.RequestException as e:
            # Handle any other request-related exceptions
            raise ValueError(f"Failed to send request: {e}") from e
        except Exception as e:
            # Callers rely on ValueError as the single failure type.
            raise ValueError(f"An error occurred: {e}") from e

    def _split_and_request(
        self,
        full_docs: PdfReader,
        start_page: int,
        num_pages: int = DEFAULT_NUM_PAGES,
    ) -> List:
        """
        Extracts a range of pages from the full pdf document and sends them
        to the server.

        Args:
            full_docs (PdfReader): The full document to be split and requested.
            start_page (int): The zero-based index of the first page to send.
            num_pages (int, optional): The maximum number of pages to send.
                                       Defaults to DEFAULT_NUM_PAGES.

        Returns:
            List: The elements returned by the server for the page range.
        """
        # Clamp the end of the range so the last batch may be shorter.
        end_page = min(start_page + num_pages, full_docs.get_num_pages())

        merger = PdfWriter()
        merger.append(full_docs, pages=(start_page, end_page))

        with io.BytesIO() as buffer:
            merger.write(buffer)
            buffer.seek(0)
            response = self._get_response({"document": buffer})

        return response

    def _element_document(self, elements: Dict, start_page: int = 0) -> Document:
        """
        Converts a single element into a Document object.

        Args:
            elements (Dict): The element to convert.
            start_page (int): Offset added to the element's page number.
                              This number starts from zero.

        Returns:
            Document: A Document holding the element's content in the
                      configured output format, with its id, page and
                      category (and, when enabled and present, coordinates
                      and base64 encoding) as metadata.
        """
        metadata = {
            "id": elements["id"],
            "page": elements["page"] + start_page,
            "category": elements["category"],
        }

        if self.coordinates and elements.get("coordinates"):
            metadata["coordinates"] = elements.get("coordinates")

        if self.base64_encoding and elements.get("base64_encoding"):
            metadata["base64_encoding"] = elements.get("base64_encoding")

        return Document(
            page_content=parse_output(elements, self.output_format),
            metadata=metadata,
        )

    def _page_document(self, elements: List, start_page: int = 0) -> List[Document]:
        """
        Combines elements with the same page number into a single Document
        object.

        Args:
            elements (List): A list of elements containing page numbers.
            start_page (int): Offset added to each page number.
                              This number starts from zero.

        Returns:
            List[Document]: One Document per distinct page, in ascending page
                            order. Each page's content is the space-joined
                            content of its elements in their original order.
        """
        # Group in a single pass instead of re-scanning every element per page.
        elements_by_page: Dict[Any, List] = {}
        for element in elements:
            elements_by_page.setdefault(element["page"], []).append(element)

        docs = []
        for page in sorted(elements_by_page):
            group = elements_by_page[page]
            page_content = " ".join(
                parse_output(element, self.output_format) for element in group
            )

            metadata: Dict[str, Any] = {
                "page": page + start_page,
            }

            if self.base64_encoding:
                metadata["base64_encodings"] = [
                    element.get("base64_encoding")
                    for element in group
                    if element.get("base64_encoding") is not None
                ]

            if self.coordinates:
                metadata["coordinates"] = [
                    element.get("coordinates")
                    for element in group
                    if element.get("coordinates") is not None
                ]

            docs.append(
                Document(
                    page_content=page_content,
                    metadata=metadata,
                )
            )

        return docs

    def _iter_element_batches(
        self,
        blob: Blob,
        full_docs: Optional[PdfReader],
        number_of_pages: int,
        num_pages: int,
    ) -> Iterator[Any]:
        """
        Yields ``(start_page, elements)`` pairs, one per API request.

        For a PDF (``full_docs`` is not None) the document is requested in
        consecutive batches of at most ``num_pages`` pages. For any other
        input the file at ``blob.path`` is sent in a single request and
        ``start_page`` is 0.

        Raises:
            ValueError: If the input is not a PDF and ``blob.path`` is empty,
                        or if a request fails.
        """
        if full_docs is not None:
            for start_page in range(0, number_of_pages, num_pages):
                yield (
                    start_page,
                    self._split_and_request(full_docs, start_page, num_pages),
                )
            return

        if not blob.path:
            raise ValueError("Blob path is required for non-PDF files.")

        with open(blob.path, "rb") as f:
            elements = self._get_response({"document": f})
        yield 0, elements

    def lazy_parse(self, blob: Blob, is_batch: bool = False) -> Iterator[Document]:
        """
        Lazily parses a document and yields Document objects based on the
        specified split type.

        Args:
            blob (Blob): The input document blob to parse.
            is_batch (bool, optional): Whether to parse the document in batches.
                                       Defaults to False (single page parsing).
                                       Ignored when split is "none", which
                                       always requests DEFAULT_NUM_PAGES pages
                                       at a time.

        Yields:
            Document: The parsed document object.

        Raises:
            ValueError: If the file cannot be read, a request fails, or an
                        invalid split type is provided.
        """
        num_pages = DEFAULT_NUM_PAGES if is_batch else 1

        full_docs: Optional[PdfReader]
        try:
            full_docs = PdfReader(str(blob.path))
            number_of_pages = full_docs.get_num_pages()
        except PdfReadError:
            # Not a readable PDF: the file is sent as a single one-page
            # document instead.
            full_docs = None
            number_of_pages = 1
        except Exception as e:
            raise ValueError(f"Failed to read PDF file: {e}") from e

        if self.split == "none":
            contents = []
            base64_encodings = []
            coordinates = []

            # The whole document becomes one Document, so always use the
            # largest batch size regardless of ``is_batch``.
            for _, elements in self._iter_element_batches(
                blob, full_docs, number_of_pages, DEFAULT_NUM_PAGES
            ):
                for element in elements:
                    contents.append(parse_output(element, self.output_format))

                    if self.base64_encoding:
                        base64_encoding = element.get("base64_encoding")
                        if base64_encoding is not None:
                            base64_encodings.append(base64_encoding)

                    if self.coordinates:
                        coordinate = element.get("coordinates")
                        if coordinate is not None:
                            coordinates.append(coordinate)

            metadata: Dict[str, Any] = {
                "total_pages": number_of_pages,
            }
            if self.coordinates:
                metadata["coordinates"] = coordinates
            if self.base64_encoding:
                metadata["base64_encodings"] = base64_encodings

            yield Document(
                page_content="".join(contents),
                metadata=metadata,
            )

        elif self.split == "element":
            for start_page, elements in self._iter_element_batches(
                blob, full_docs, number_of_pages, num_pages
            ):
                for element in elements:
                    yield self._element_document(element, start_page)

        elif self.split == "page":
            for start_page, elements in self._iter_element_batches(
                blob, full_docs, number_of_pages, num_pages
            ):
                yield from self._page_document(elements, start_page)

        else:
            raise ValueError(f"Invalid split type: {self.split}")