def validate_file_path(file_path: Union[str, Path, List[str], List[Path]]) -> None:
    """Ensure that every given path exists on the filesystem.

    Args:
        file_path (Union[str, Path, List[str], List[Path]]): A single path,
            or a list of paths, to check.

    Raises:
        FileNotFoundError: If the path, or any path in the list, does not
            exist.
    """
    # Scalar case first: a single path either exists or we raise.
    if not isinstance(file_path, list):
        if os.path.exists(file_path):
            return
        raise FileNotFoundError(f"File not found: {file_path}")

    # List case: check each entry in order; the first missing one raises.
    for candidate in file_path:
        validate_file_path(candidate)
def get_from_param_or_env(
    key: str,
    param: Optional[str] = None,
    env_key: Optional[str] = None,
    default: Optional[str] = None,
) -> str:
    """Resolve a setting from an explicit argument, the environment, or a default.

    Lookup order:
        1. ``param``, when it is not ``None`` (an empty string is accepted).
        2. The environment variable named ``env_key``, when it is set to a
           non-empty value.
        3. ``default``, when it is not ``None``.

    Args:
        key (str): Name of the setting, used only in the error message.
        param (str, optional): Explicitly supplied value.
        env_key (str, optional): Name of the environment variable to consult.
        default (str, optional): Fallback value.

    Returns:
        str: The first value found following the lookup order above.

    Raises:
        ValueError: If none of the three sources yields a value.
    """
    if param is not None:
        return param

    # An unset or empty environment variable is treated as "not provided".
    env_value = os.environ.get(env_key) if env_key else None
    if env_value:
        return env_value

    if default is not None:
        return default

    raise ValueError(
        f"Did not find {key}, please add an environment variable"
        f" `{env_key}` which contains it, or pass"
        f" `{key}` as a named parameter."
    )
class UpstageDocumentParseLoader(BaseLoader):
    """Upstage Document Parse Loader.

    To use, you should have the environment variable `UPSTAGE_API_KEY`
    set with your API key or pass it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_upstage import UpstageDocumentParseLoader

            file_path = "/PATH/TO/YOUR/FILE.pdf"
            loader = UpstageDocumentParseLoader(
                file_path, split="page", output_format="text"
            )
    """

    def __init__(
        self,
        file_path: Union[str, Path, List[str], List[Path]],
        split: SplitType = "none",
        api_key: Optional[str] = None,
        base_url: str = DOCUMENT_PARSE_BASE_URL,
        model: str = DOCUMENT_PARSE_DEFAULT_MODEL,
        ocr: OCR = "auto",
        output_format: OutputFormat = "html",
        coordinates: bool = True,
        base64_encoding: List[Category] = [],
    ):
        """Initializes an instance of the Upstage document parse loader.

        Args:
            file_path (Union[str, Path, List[str], List[Path]]): The path (or
                list of paths) to the document(s) to be loaded.
            split (SplitType, optional): The type of splitting to be applied.
                Defaults to "none" (no splitting).
            api_key (str, optional): The API key for accessing the Upstage
                API. Defaults to None, in which case it will be fetched from
                the environment variable `UPSTAGE_API_KEY`.
            base_url (str, optional): The base URL for accessing the Upstage
                API. Defaults to `DOCUMENT_PARSE_BASE_URL`.
            model (str): The model to be used for the document parse.
                Defaults to `DOCUMENT_PARSE_DEFAULT_MODEL`.
            ocr (OCR, optional): Extract text from images in the document
                using OCR. If the value is "force", OCR is used to extract
                text from an image. If the value is "auto", text is extracted
                from a PDF. (An error will occur if the value is "auto" and
                the input is NOT in PDF format.)
            output_format (OutputFormat, optional): Format of the inference
                results.
            coordinates (bool, optional): Whether to include the coordinates
                of the OCR in the output.
            base64_encoding (List[Category], optional): The category of the
                elements to be encoded in base64. The list is copied, so
                later changes to the caller's list do not affect the loader.

        Raises:
            ValueError: If no API key is passed and `UPSTAGE_API_KEY` is not
                set in the environment.
            FileNotFoundError: If any of the given paths does not exist.
        """
        self.file_path = file_path
        self.split = split
        # NOTE(review): the env value is also passed as the fallback default,
        # so an *empty* `UPSTAGE_API_KEY` is accepted as "" instead of raising.
        # Kept as-is to preserve behavior; confirm whether that is intended.
        self.api_key = get_from_param_or_env(
            "UPSTAGE_API_KEY",
            api_key,
            "UPSTAGE_API_KEY",
            os.environ.get("UPSTAGE_API_KEY"),
        )
        self.base_url = base_url
        self.model = model
        self.ocr = ocr
        self.output_format = output_format
        self.coordinates = coordinates
        # Copy the argument: the signature's default `[]` is a single shared
        # object, so storing it directly would let a mutation through one
        # instance leak into every other instance built with the default.
        self.base64_encoding = list(base64_encoding)

        self.parser = UpstageDocumentParseParser(
            api_key=self.api_key,
            base_url=self.base_url,
            model=self.model,
            split=self.split,
            ocr=self.ocr,
            output_format=self.output_format,
            coordinates=self.coordinates,
            base64_encoding=self.base64_encoding,
        )

        validate_file_path(self.file_path)

    def load(self) -> List[Document]:
        """Loads and parses the document using the UpstageDocumentParseParser.

        Returns:
            A list of Document objects representing the parsed layout
            analysis. For a list of paths, the results are concatenated in
            the order the paths were given.
        """
        # Treat a single path as a one-element list so both cases share one
        # code path.
        if isinstance(self.file_path, list):
            paths = self.file_path
        else:
            paths = [self.file_path]

        # NOTE(review): `is_batch=True` is used here even for a single path,
        # whereas `lazy_load` omits it in that case — confirm which is
        # intended against the parser.
        result: List[Document] = []
        for path in paths:
            blob = Blob.from_path(path)
            result.extend(self.parser.lazy_parse(blob, is_batch=True))
        return result

    def lazy_load(self) -> Iterator[Document]:
        """Lazily loads and parses the document using the
        UpstageDocumentParseParser.

        Returns:
            An iterator of Document objects representing the parsed layout
            analysis.
        """
        if isinstance(self.file_path, list):
            for path in self.file_path:
                blob = Blob.from_path(path)
                yield from self.parser.lazy_parse(blob, is_batch=True)
        else:
            # A single path is parsed with the parser's default `is_batch`.
            blob = Blob.from_path(self.file_path)
            yield from self.parser.lazy_parse(blob)

    def merge_and_split(
        self, documents: List[Document], splitter: Optional[object] = None
    ) -> List[Document]:
        """Merges the page content and metadata of multiple documents into a
        single document, or splits the documents using a custom splitter.

        Args:
            documents (list): A list of Document objects to be merged and
                split.
            splitter (object, optional): An optional splitter object that
                implements the `split_documents` method. If provided, the
                documents will be split using this splitter. Defaults to
                None, in which case the documents are merged.

        Returns:
            list: A list of Document objects. If no splitter is provided, the
            list holds a single Document whose content is the page contents
            joined by a space and whose metadata maps each key to the list of
            values seen for that key, in document order. If a splitter is
            provided, the result of `splitter.split_documents(documents)` is
            returned.

        Raises:
            AssertionError: If a splitter is provided but it does not
                implement the `split_documents` method.
        """
        if splitter is None:
            merged_content = " ".join(doc.page_content for doc in documents)

            # Collect every value per metadata key, in document order.
            metadatas: Dict[str, Any] = {}
            for doc in documents:
                for key, value in doc.metadata.items():
                    metadatas.setdefault(key, []).append(value)

            return [Document(page_content=merged_content, metadata=metadatas)]

        # Raise explicitly instead of using `assert`, which is stripped under
        # `python -O`; the exception type and message are unchanged.
        if not hasattr(splitter, "split_documents"):
            raise AssertionError("splitter must implement split_documents method")
        return splitter.split_documents(documents)