class LLMSherpaFileLoader(BaseLoader):
    """Load Documents using `LLMSherpa`.

    LLMSherpaFileLoader uses LayoutPDFReader, which is part of the LLMSherpa
    library. This tool is designed to parse PDFs while preserving their layout
    information, which is often lost when using most PDF to text parsers.

    Examples
    --------
    from langchain_community.document_loaders.llmsherpa import LLMSherpaFileLoader

    loader = LLMSherpaFileLoader(
        "example.pdf",
        strategy="chunks",
        llmsherpa_api_url="http://localhost:5010/api/parseDocument?renderFormat=all",
    )
    docs = loader.load()
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        new_indent_parser: bool = True,
        apply_ocr: bool = True,
        strategy: str = "chunks",
        llmsherpa_api_url: str = DEFAULT_API,
    ):
        """Initialize with a file path.

        Args:
            file_path: Path of the file to load; stored as a string in
                ``self.file_path``.
            new_indent_parser: When true, ``useNewIndentParser=true`` is added
                to the API URL query if it is not already there.
            apply_ocr: When true, ``applyOcr=yes`` is added to the API URL
                query if it is not already there.
            strategy: One of ``"sections"``, ``"chunks"``, ``"html"`` or
                ``"text"``.
            llmsherpa_api_url: URL of the LLMSherpa ``parseDocument`` endpoint.

        Raises:
            ImportError: If the ``llmsherpa`` package is not installed.
            ValueError: If ``strategy`` is not one of the accepted values, or
                if ``llmsherpa_api_url`` is not a valid LLMSherpa URL.
        """
        try:
            import llmsherpa  # noqa:F401
        except ImportError as exc:
            # Chain the original error so the underlying cause stays visible.
            raise ImportError(
                "llmsherpa package not found, please install it with "
                "`pip install llmsherpa`"
            ) from exc

        _valid_strategies = ["sections", "chunks", "html", "text"]
        if strategy not in _valid_strategies:
            raise ValueError(
                f"Got {strategy} for `strategy`, "
                f"but should be one of `{_valid_strategies}`"
            )

        # Generic URL check first (scheme + host), then the LLMSherpa-specific
        # path check and query completion.
        if not self._is_valid_url(llmsherpa_api_url):
            raise ValueError(f"Invalid URL: {llmsherpa_api_url}")
        self.url = self._validate_llmsherpa_url(
            url=llmsherpa_api_url,
            new_indent_parser=new_indent_parser,
            apply_ocr=apply_ocr,
        )

        self.strategy = strategy
        self.file_path = str(file_path)

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Check if the url is valid, i.e. has both a scheme and a netloc."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

    @staticmethod
    def _validate_llmsherpa_url(
        url: str, new_indent_parser: bool = True, apply_ocr: bool = True
    ) -> str:
        """Check if the llmsherpa url is valid and complete its query string.

        Args:
            url: Candidate LLMSherpa API URL.
            new_indent_parser: Add ``useNewIndentParser=true`` when missing.
            apply_ocr: Add ``applyOcr=yes`` when missing.

        Returns:
            ``url`` unchanged when no parameter has to be added; otherwise the
            URL with the missing parameters appended to its query component.

        Raises:
            ValueError: If the URL path contains neither
                ``/api/parseDocument`` nor
                ``/api/document/developer/parseDocument``.
        """
        parsed = urlparse(url)
        if ("/api/parseDocument" not in parsed.path) and (
            "/api/document/developer/parseDocument" not in parsed.path
        ):
            raise ValueError(f"Invalid LLMSherpa URL: {url}")

        missing = []
        if "renderFormat=all" not in parsed.query:
            missing.append("renderFormat=all")
        if new_indent_parser and "useNewIndentParser=true" not in parsed.query:
            missing.append("useNewIndentParser=true")
        if apply_ocr and "applyOcr=yes" not in parsed.query:
            missing.append("applyOcr=yes")

        if not missing:
            return url

        # Extend the query component instead of concatenating onto the raw
        # string: this picks the right separator when the URL already carries
        # other query parameters and keeps any fragment after the query.
        query = "&".join(filter(None, [parsed.query, *missing]))
        return parsed._replace(query=query).geturl()