class UnstructuredPDFLoader(UnstructuredFileLoader):
    """Load `PDF` files using `Unstructured`.

    You can run the loader in one of two modes: "single" and "elements".
    If you use "single" mode, the document will be returned as a single
    langchain Document object. If you use "elements" mode, the unstructured
    library will split the document into elements such as Title and
    NarrativeText. You can pass in additional unstructured kwargs after mode
    to apply different unstructured settings.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredPDFLoader

    loader = UnstructuredPDFLoader(
        "example.pdf", mode="elements", strategy="fast",
    )
    docs = loader.load()

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf
    """

    def _get_elements(self) -> List:
        """Partition the PDF at ``self.file_path`` with ``partition_pdf``.

        The keyword arguments stored in ``self.unstructured_kwargs`` are
        forwarded unchanged to ``partition_pdf``.
        """
        # Imported lazily so `unstructured` is only needed when actually used.
        from unstructured.partition.pdf import partition_pdf

        elements = partition_pdf(
            filename=self.file_path, **self.unstructured_kwargs
        )
        return elements
class BasePDFLoader(BaseLoader, ABC):
    """Base Loader class for `PDF` files.

    If the file is a web path, it will download it to a temporary file, use it,
    then clean up the temporary file after completion.
    """

    def __init__(
        self, file_path: Union[str, Path], *, headers: Optional[Dict] = None
    ):
        """Initialize with a file path.

        Args:
            file_path: Either a local, S3 or web path to a PDF file.
            headers: Headers to use for GET request to download a file from a
                web path.

        Raises:
            ValueError: If the path is neither an existing file nor a URL, or
                if downloading the URL does not return status code 200.
        """
        self.file_path = str(file_path)
        self.web_path = None
        self.headers = headers
        if "~" in self.file_path:
            self.file_path = os.path.expanduser(self.file_path)

        # An existing local file needs no further preparation.
        if os.path.isfile(self.file_path):
            return

        if not self._is_valid_url(self.file_path):
            raise ValueError(
                "File path %s is not a valid file or url" % self.file_path
            )

        # The path is a web or S3 location: set up a temporary directory whose
        # lifetime is tied to this loader (see ``__del__``).
        remote_location = self.file_path
        self.temp_dir = tempfile.TemporaryDirectory()
        if self._is_s3_presigned_url(remote_location):
            # Presigned URLs carry a query string, so the last path segment is
            # used instead of the text after the final dot.
            suffix = urlparse(remote_location).path.split("/")[-1]
        else:
            suffix = os.path.splitext(remote_location)[1]
        local_copy = os.path.join(self.temp_dir.name, f"tmp{suffix}")
        self.web_path = remote_location

        # ``s3://`` locations are not downloaded here; ``file_path`` keeps the
        # S3 URL and only ``web_path`` is recorded.
        if self._is_s3_url(remote_location):
            return

        response = requests.get(remote_location, headers=self.headers)
        if response.status_code != 200:
            raise ValueError(
                "Check the url of your file; returned status code %s"
                % response.status_code
            )
        with open(local_copy, mode="wb") as destination:
            destination.write(response.content)
        self.file_path = str(local_copy)

    def __del__(self) -> None:
        """Remove the temporary download directory, if one was created."""
        if not hasattr(self, "temp_dir"):
            return
        self.temp_dir.cleanup()

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Check if the url is valid."""
        parts = urlparse(url)
        return bool(parts.netloc and parts.scheme)

    @staticmethod
    def _is_s3_url(url: str) -> bool:
        """check if the url is S3"""
        try:
            parts = urlparse(url)
        except ValueError:
            return False
        return parts.scheme == "s3" and bool(parts.netloc)

    @staticmethod
    def _is_s3_presigned_url(url: str) -> bool:
        """Check if the url is a presigned S3 url."""
        try:
            parts = urlparse(url)
            match = re.search(r"\.s3\.amazonaws\.com$", parts.netloc)
        except ValueError:
            return False
        return bool(match)

    @property
    def source(self) -> str:
        """Return the original web path when there is one, else the file path."""
        if self.web_path is not None:
            return self.web_path
        return self.file_path
def __init__(
    self,
    file_path: str,
    password: Optional[Union[str, bytes]] = None,
    headers: Optional[Dict] = None,
    extract_images: bool = False,
    *,
    extraction_mode: str = "plain",
    extraction_kwargs: Optional[Dict] = None,
) -> None:
    """Initialize with a file path.

    Args:
        file_path: Path handed to the base loader's constructor.
        password: Forwarded to ``PyPDFParser``.
        headers: Forwarded to the base loader's constructor.
        extract_images: Forwarded to ``PyPDFParser``.
        extraction_mode: Forwarded to ``PyPDFParser``.
        extraction_kwargs: Forwarded to ``PyPDFParser``.

    Raises:
        ImportError: If the `pypdf` package is not installed.
    """
    # Fail early with an actionable message when pypdf is unavailable.
    try:
        import pypdf  # noqa:F401
    except ImportError:
        raise ImportError(
            "pypdf package not found, please install it with `pip install pypdf`"
        )

    super().__init__(file_path, headers=headers)

    parser_options = {
        "password": password,
        "extract_images": extract_images,
        "extraction_mode": extraction_mode,
        "extraction_kwargs": extraction_kwargs,
    }
    self.parser = PyPDFParser(**parser_options)
def lazy_load(
    self,
) -> Iterator[Document]:
    """Lazy load given path as pages.

    When the loader was created from a web path, the file's bytes are read
    into an in-memory blob whose ``path`` is the original web path; otherwise
    the blob is created from the local path.

    Yields:
        The documents produced by ``self.parser.parse`` for the blob.
    """
    if self.web_path:
        # Read through a context manager so the file handle is closed
        # deterministically instead of waiting for garbage collection.
        with open(self.file_path, "rb") as pdf_file:
            blob = Blob.from_data(pdf_file.read(), path=self.web_path)  # type: ignore[attr-defined]
    else:
        blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
    yield from self.parser.parse(blob)
class PyPDFium2Loader(BasePDFLoader):
    """Load `PDF` using `pypdfium2` and chunks at character level."""

    def __init__(
        self,
        file_path: str,
        *,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
    ):
        """Initialize with a file path.

        Args:
            file_path: Path handed to the base loader's constructor.
            headers: Forwarded to the base loader's constructor.
            extract_images: Forwarded to ``PyPDFium2Parser``.
        """
        super().__init__(file_path, headers=headers)
        self.parser = PyPDFium2Parser(extract_images=extract_images)

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load given path as pages.

        When the loader was created from a web path, the file's bytes are
        read into an in-memory blob whose ``path`` is the original web path;
        otherwise the blob is created from the local path.

        Yields:
            The documents produced by ``self.parser.parse`` for the blob.
        """
        if self.web_path:
            # Read through a context manager so the file handle is closed
            # deterministically instead of waiting for garbage collection.
            with open(self.file_path, "rb") as pdf_file:
                blob = Blob.from_data(pdf_file.read(), path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.parse(blob)
class PyPDFDirectoryLoader(BaseLoader):
    """Load a directory with `PDF` files using `pypdf` and chunks at character level.

    Loader also stores page numbers in metadata.
    """
class PDFMinerLoader(BasePDFLoader):
    """Load `PDF` files using `PDFMiner`."""

    def __init__(
        self,
        file_path: str,
        *,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
        concatenate_pages: bool = True,
    ) -> None:
        """Initialize with file path.

        Args:
            file_path: Path handed to the base loader's constructor.
            headers: Forwarded to the base loader's constructor.
            extract_images: Whether to extract images from PDF.
            concatenate_pages: If True, concatenate all PDF pages into a
                single document. Otherwise, return one document per page.

        Raises:
            ImportError: If the `pdfminer` package is not installed.
        """
        # Probe for the dependency before doing any other work.
        try:
            from pdfminer.high_level import extract_text  # noqa:F401
        except ImportError:
            raise ImportError(
                "`pdfminer` package not found, please install it with "
                "`pip install pdfminer.six`"
            )

        super().__init__(file_path, headers=headers)

        parser_options = {
            "extract_images": extract_images,
            "concatenate_pages": concatenate_pages,
        }
        self.parser = PDFMinerParser(**parser_options)
class PDFMinerPDFasHTMLLoader(BasePDFLoader):
    """Load `PDF` files as HTML content using `PDFMiner`."""

    def __init__(self, file_path: str, *, headers: Optional[Dict] = None):
        """Initialize with a file path.

        Args:
            file_path: Path handed to the base loader's constructor.
            headers: Forwarded to the base loader's constructor.

        Raises:
            ImportError: If the `pdfminer` package is not installed.
        """
        # Probe for the dependency before delegating to the base constructor.
        try:
            from pdfminer.high_level import extract_text_to_fp  # noqa:F401
        except ImportError:
            raise ImportError(
                "`pdfminer` package not found, please install it with "
                "`pip install pdfminer.six`"
            )

        super().__init__(file_path, headers=headers)
class PyMuPDFLoader(BasePDFLoader):
    """Load `PDF` files using `PyMuPDF`."""

    def __init__(
        self,
        file_path: str,
        *,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
        **kwargs: Any,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: Path handed to the base loader's constructor.
            headers: Forwarded to the base loader's constructor.
            extract_images: Stored and later passed to ``PyMuPDFParser``.
            **kwargs: Stored as ``self.text_kwargs`` and later passed to
                ``PyMuPDFParser`` as ``text_kwargs``.

        Raises:
            ImportError: If the `PyMuPDF` package is not installed.
        """
        try:
            import fitz  # noqa:F401
        except ImportError:
            raise ImportError(
                "`PyMuPDF` package not found, please install it with "
                "`pip install pymupdf`"
            )
        super().__init__(file_path, headers=headers)
        self.extract_images = extract_images
        self.text_kwargs = kwargs

    def _lazy_load(self, **kwargs: Any) -> Iterator[Document]:
        """Parse the PDF lazily, merging runtime kwargs over the stored ones.

        Args:
            **kwargs: Deprecated runtime overrides; they take precedence over
                the keyword arguments given at initialization and trigger a
                warning when present.

        Yields:
            The documents produced by ``PyMuPDFParser.lazy_parse``.
        """
        if kwargs:
            logger.warning(
                f"Received runtime arguments {kwargs}. Passing runtime args to `load`"
                f" is deprecated. Please pass arguments during initialization instead."
            )

        # Runtime kwargs win over the ones captured in ``__init__``.
        text_kwargs = {**self.text_kwargs, **kwargs}
        parser = PyMuPDFParser(
            text_kwargs=text_kwargs, extract_images=self.extract_images
        )
        if self.web_path:
            # Read through a context manager so the file handle is closed
            # deterministically instead of waiting for garbage collection.
            with open(self.file_path, "rb") as pdf_file:
                blob = Blob.from_data(pdf_file.read(), path=self.web_path)  # type: ignore[attr-defined]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from parser.lazy_parse(blob)
# MathpixPDFLoader implementation taken largely from Daniel Gross's:
# https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21
class MathpixPDFLoader(BasePDFLoader):
    """Load `PDF` files using `Mathpix` service."""

    def __init__(
        self,
        file_path: str,
        processed_file_format: str = "md",
        max_wait_time_seconds: int = 500,
        should_clean_pdf: bool = False,
        extra_request_data: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: a file for loading.
            processed_file_format: a format of the processed file. Default is "md".
            max_wait_time_seconds: a maximum time to wait for the response from
                the server. Default is 500.
            should_clean_pdf: a flag to clean the PDF file. Default is False.
            extra_request_data: Additional request data.
            **kwargs: additional keyword arguments.
        """
        self.mathpix_api_key = get_from_dict_or_env(
            kwargs, "mathpix_api_key", "MATHPIX_API_KEY"
        )
        self.mathpix_api_id = get_from_dict_or_env(
            kwargs, "mathpix_api_id", "MATHPIX_API_ID"
        )

        # The base class isn't expecting these and doesn't collect **kwargs
        for credential_key in ("mathpix_api_key", "mathpix_api_id"):
            kwargs.pop(credential_key, None)

        super().__init__(file_path, **kwargs)
        self.processed_file_format = processed_file_format
        self.extra_request_data = (
            {} if extra_request_data is None else extra_request_data
        )
        self.max_wait_time_seconds = max_wait_time_seconds
        self.should_clean_pdf = should_clean_pdf

    def send_pdf(self) -> str:
        """Upload the PDF to Mathpix and return the id it was assigned.

        Returns:
            The ``pdf_id`` field of the JSON response.

        Raises:
            ValueError: If the response contains an ``error`` field or has no
                ``pdf_id`` field.
        """
        with open(self.file_path, "rb") as pdf_file:
            response = requests.post(
                self.url,
                headers=self._mathpix_headers,
                files={"file": pdf_file},
                data=self.data,
            )
        payload = response.json()
        if "error" in payload:
            raise ValueError(f"Mathpix request failed: {payload['error']}")
        if "pdf_id" not in payload:
            raise ValueError("Unable to send PDF to Mathpix.")
        return payload["pdf_id"]

    def wait_for_processing(self, pdf_id: str) -> None:
        """Wait for processing to complete.

        Polls the status endpoint every 5 seconds, for at most
        ``max_wait_time_seconds`` seconds' worth of polls.

        Args:
            pdf_id: a PDF id.

        Returns:
            None

        Raises:
            ValueError: If the response reports an error or an "error" status.
            TimeoutError: If processing has not completed after the last poll.
        """
        status_url = "/".join([self.url, pdf_id])
        for _ in range(0, self.max_wait_time_seconds, 5):
            payload = requests.get(
                status_url, headers=self._mathpix_headers
            ).json()

            # This indicates an error with the request (e.g. auth problems)
            request_error = payload.get("error", None)
            request_error_info = payload.get("error_info", None)
            if request_error is not None:
                message = f"Unable to retrieve PDF from Mathpix: {request_error}"
                if request_error_info is not None:
                    message += f" ({request_error_info['id']})"
                raise ValueError(message)

            status = payload.get("status", None)
            if status == "completed":
                return
            if status == "error":
                # This indicates an error with the PDF processing
                raise ValueError("Unable to retrieve PDF from Mathpix")

            print(f"Status: {status}, waiting for processing to complete")  # noqa: T201
            time.sleep(5)
        raise TimeoutError

    def clean_pdf(self, contents: str) -> str:
        """Clean the PDF file.

        Args:
            contents: a PDF file contents.

        Returns:
            The contents with lines starting with ``![]`` dropped,
            ``\\section{`` turned into ``# ``, every ``}`` removed, and the
            backslash escapes before ``$``, ``%``, ``(`` and ``)`` stripped.
        """
        kept_lines = []
        for line in contents.split("\n"):
            if line.startswith("![]"):
                continue
            kept_lines.append(line)
        contents = "\n".join(kept_lines)

        # Applied strictly in this order: first replace \section{Title} with
        # "# Title", then drop the "\" that Mathpix adds to escape $, %, (, etc.
        substitutions = (
            ("\\section{", "# "),
            ("}", ""),
            (r"\$", "$"),
            (r"\%", "%"),
            (r"\(", "("),
            (r"\)", ")"),
        )
        for needle, replacement in substitutions:
            contents = contents.replace(needle, replacement)
        return contents
class PDFPlumberLoader(BasePDFLoader):
    """Load `PDF` files using `pdfplumber`."""

    def __init__(
        self,
        file_path: str,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        dedupe: bool = False,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
    ) -> None:
        """Initialize with a file path.

        Args:
            file_path: Path handed to the base loader's constructor.
            text_kwargs: Stored on the loader; a falsy value is replaced by an
                empty dict.
            dedupe: Stored on the loader as ``self.dedupe``.
            headers: Forwarded to the base loader's constructor.
            extract_images: Stored on the loader as ``self.extract_images``.

        Raises:
            ImportError: If the `pdfplumber` package is not installed.
        """
        # Probe for the dependency before doing any other work.
        try:
            import pdfplumber  # noqa:F401
        except ImportError:
            raise ImportError(
                "pdfplumber package not found, please install it with "
                "`pip install pdfplumber`"
            )

        super().__init__(file_path, headers=headers)
        self.text_kwargs = text_kwargs if text_kwargs else {}
        self.dedupe = dedupe
        self.extract_images = extract_images
class AmazonTextractPDFLoader(BasePDFLoader):
    """Load `PDF` files from a local file system, HTTP or S3.

    To authenticate, the AWS client uses the following methods to
    automatically load credentials:
    https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html

    If a specific credential profile should be used, you must pass
    the name of the profile from the ~/.aws/credentials file that is to be used.

    Make sure the credentials / roles used have the required policies to
    access the Amazon Textract service.

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import AmazonTextractPDFLoader

            loader = AmazonTextractPDFLoader(
                file_path="s3://pdfs/myfile.pdf"
            )
            document = loader.load()
    """

    def __init__(
        self,
        file_path: str,
        textract_features: Optional[Sequence[str]] = None,
        client: Optional[Any] = None,
        credentials_profile_name: Optional[str] = None,
        region_name: Optional[str] = None,
        endpoint_url: Optional[str] = None,
        headers: Optional[Dict] = None,
        *,
        linearization_config: Optional["TextLinearizationConfig"] = None,
    ) -> None:
        """Initialize the loader.

        Args:
            file_path: A file, url or s3 path for input file
            textract_features: Features to be used for extraction, each feature
                should be passed as a str that conforms to the enum
                `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client (Optional)
            credentials_profile_name: AWS profile name, if not default (Optional)
            region_name: AWS region, eg us-east-1 (Optional)
            endpoint_url: endpoint url for the textract service (Optional)
            headers: Headers forwarded to the base loader's constructor
                (Optional)
            linearization_config: Config to be used for linearization of the
                output should be an instance of TextLinearizationConfig from
                the `textractor` pkg

        Raises:
            ImportError: If `amazon-textract-caller` is missing, or if `boto3`
                is missing when a client has to be created.
            ValueError: If creating the boto3 session or client fails.
        """
        super().__init__(file_path, headers=headers)

        try:
            import textractcaller as tc
        except ImportError:
            raise ImportError(
                "Could not import amazon-textract-caller python package. "
                "Please install it with `pip install amazon-textract-caller`."
            )
        if textract_features:
            features = [tc.Textract_Features[x] for x in textract_features]
        else:
            features = []

        # Any explicit profile/region/endpoint setting means a client is built
        # here, replacing whatever was passed in as ``client``.
        if credentials_profile_name or region_name or endpoint_url:
            try:
                import boto3

                if credentials_profile_name is not None:
                    session = boto3.Session(profile_name=credentials_profile_name)
                else:
                    # use default credentials
                    session = boto3.Session()

                client_params = {}
                if region_name:
                    client_params["region_name"] = region_name
                if endpoint_url:
                    client_params["endpoint_url"] = endpoint_url

                client = session.client("textract", **client_params)

            except ImportError:
                raise ImportError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                )
            except Exception as e:
                raise ValueError(
                    "Could not load credentials to authenticate with AWS client. "
                    "Please check that credentials in the specified "
                    f"profile name are valid. {e}"
                ) from e
        self.parser = AmazonTextractPDFParser(
            textract_features=features,
            client=client,
            linearization_config=linearization_config,
        )

    def load(self) -> List[Document]:
        """Load given path as pages."""
        return list(self.lazy_load())

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load documents.

        Yields:
            The documents produced by ``self.parser.parse`` for the blob.

        Raises:
            ValueError: If the file is a multi-page document that is not
                stored on S3.
        """
        # the self.file_path is local, but the blob has to include
        # the S3 location if the file originated from S3 for multi-page documents
        # raises ValueError when multi-page and not on S3
        if self.web_path and self._is_s3_url(self.web_path):
            blob = Blob(path=self.web_path)  # type: ignore[call-arg] # type: ignore[misc]
        else:
            blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
            if AmazonTextractPDFLoader._get_number_of_pages(blob) > 1:
                # Adjacent literals (not backslash continuations) so the
                # message does not pick up the source indentation.
                raise ValueError(
                    f"the file {blob.path} is a multi-page document, "
                    "but not stored on S3. "
                    "Textract requires multi-page documents to be on S3."
                )

        yield from self.parser.parse(blob)

    @staticmethod
    def _get_number_of_pages(blob: Blob) -> int:  # type: ignore[valid-type]
        """Count the pages (or image frames) contained in ``blob``.

        Args:
            blob: The blob to inspect; its ``mimetype`` selects the strategy.

        Returns:
            The number of PDF pages or TIFF frames, or 1 for PNG/JPEG images.

        Raises:
            ImportError: If `pypdf` or `Pillow` is not installed.
            ValueError: If the blob's mime type is not supported.
        """
        try:
            import pypdf
            from PIL import Image, ImageSequence

        except ImportError:
            raise ImportError(
                "Could not import pypdf or Pillow python package. "
                "Please install it with `pip install pypdf Pillow`."
            )
        if blob.mimetype == "application/pdf":  # type: ignore[attr-defined]
            with blob.as_bytes_io() as input_pdf_file:  # type: ignore[attr-defined]
                pdf_reader = pypdf.PdfReader(input_pdf_file)
                return len(pdf_reader.pages)
        elif blob.mimetype == "image/tiff":  # type: ignore[attr-defined]
            # ``Image.open`` needs a file object here: raw ``bytes`` would be
            # interpreted as a filename, so open the blob's byte stream.
            with blob.as_bytes_io() as input_tiff_file:  # type: ignore[attr-defined]
                with Image.open(input_tiff_file) as img:
                    return sum(1 for _ in ImageSequence.Iterator(img))
        elif blob.mimetype in ["image/png", "image/jpeg"]:  # type: ignore[attr-defined]
            return 1
        else:
            raise ValueError(f"unsupported mime type: {blob.mimetype}")  # type: ignore[attr-defined]
class DedocPDFLoader(DedocBaseLoader):
    """
    DedocPDFLoader document loader integration to load PDF files using `dedoc`.
    The file loader can automatically detect the correctness of a textual layer in the
        PDF document.
    Note that `__init__` method supports parameters that differ from ones of
        DedocBaseLoader.

    Setup:
        Install ``dedoc`` package.

        .. code-block:: bash

            pip install -U dedoc

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocPDFLoader

            loader = DedocPDFLoader(
                file_path="example.pdf",
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Parameters used for document parsing via `dedoc`
        (https://dedoc.readthedocs.io/en/latest/parameters/pdf_handling.html):

        with_attachments: enable attached files extraction
        recursion_deep_attachments: recursion level for attached files extraction,
            works only when with_attachments==True
        pdf_with_text_layer: type of handler for parsing, available options
            ["true", "false", "tabby", "auto", "auto_tabby" (default)]
        language: language of the document for PDF without a textual layer,
            available options ["eng", "rus", "rus+eng" (default)], the list of
            languages can be extended, please see
            https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
        pages: page slice to define the reading range for parsing
        is_one_column_document: detect number of columns for PDF without a textual
            layer, available options ["true", "false", "auto" (default)]
        document_orientation: fix document orientation (90, 180, 270 degrees) for PDF
            without a textual layer, available options ["auto" (default), "no_change"]
        need_header_footer_analysis: remove headers and footers from the output result
        need_binarization: clean pages background (binarize) for PDF without a textual
            layer
        need_pdf_table_analysis: parse tables for PDF without a textual layer
    """

    def _make_config(self) -> dict:
        """Build the dedoc manager configuration for PDF parsing.

        Returns:
            The result of ``make_manager_pdf_config`` for this loader's file
            path, parsing parameters and split setting.
        """
        # Imported lazily so `dedoc` is only needed when actually used.
        from dedoc.utils.langchain import make_manager_pdf_config

        manager_config = make_manager_pdf_config(
            file_path=self.file_path,
            parsing_params=self.parsing_parameters,
            split=self.split,
        )
        return manager_config
class DocumentIntelligenceLoader(BasePDFLoader):
    """Load a PDF with Azure Document Intelligence"""

    def __init__(
        self,
        file_path: str,
        client: Any,
        model: str = "prebuilt-document",
        headers: Optional[Dict] = None,
    ) -> None:
        """
        Initialize the object for file processing with Azure Document Intelligence
        (formerly Form Recognizer).

        This constructor initializes a DocumentIntelligenceParser object to be used
        for parsing files using the Azure Document Intelligence API. The load method
        generates a Document node including metadata (source blob and page number)
        for each page.

        Parameters:
        -----------
        file_path : str
            The path to the file that needs to be parsed.
        client: Any
            A DocumentAnalysisClient to perform the analysis of the blob
        model : str
            The model name or ID to be used for form recognition in Azure.
        headers : Optional[Dict]
            Headers forwarded to the base loader's constructor.

        Examples:
        ---------
        >>> obj = DocumentIntelligenceLoader(
        ...     file_path="path/to/file",
        ...     client=client,
        ...     model="prebuilt-document"
        ... )
        """
        # The parser is created before the base constructor runs.
        parser_options = {"client": client, "model": model}
        self.parser = DocumentIntelligenceParser(**parser_options)
        super().__init__(file_path, headers=headers)

    def load(self) -> List[Document]:
        """Load given path as pages."""
        pages = list(self.lazy_load())
        return pages

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load given path as pages."""
        source_blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
        yield from self.parser.parse(source_blob)
# Legacy: only for backwards compatibility. Use PyPDFLoader insteadPagedPDFSplitter=PyPDFLoader