@deprecated(
    since="0.0.32",
    removal="1.0",
    alternative_import="langchain_google_community.GCSFileLoader",
)
class GCSFileLoader(BaseLoader):
    """Load from GCS file.

    The blob is downloaded into a temporary directory and handed to a
    configurable loader, which turns the local file into documents.
    """

    def __init__(
        self,
        project_name: str,
        bucket: str,
        blob: str,
        loader_func: Optional[Callable[[str], BaseLoader]] = None,
    ):
        """Initialize with bucket and key name.

        Args:
            project_name: The name of the project to load
            bucket: The name of the GCS bucket.
            blob: The name of the GCS blob to load.
            loader_func: A loader function that instantiates a loader based on a
                file_path argument. If nothing is provided, the
                UnstructuredFileLoader is used.

        Examples:
            To use an alternative PDF loader:
            >>> from langchain_community.document_loaders import PyPDFLoader
            >>> loader = GCSFileLoader(..., loader_func=PyPDFLoader)

            To use UnstructuredFileLoader with additional arguments:
            >>> loader = GCSFileLoader(...,
            ...     loader_func=lambda x: UnstructuredFileLoader(x, mode="elements"))
        """
        self.bucket = bucket
        self.blob = blob
        self.project_name = project_name

        def default_loader_func(file_path: str) -> BaseLoader:
            # Fallback used when the caller does not supply a loader factory.
            return UnstructuredFileLoader(file_path)

        self._loader_func = loader_func if loader_func else default_loader_func

    def load(self) -> List[Document]:
        """Load documents.

        Downloads the blob to a temporary file, runs the configured loader on
        it, and post-processes the metadata of every resulting document.

        Returns:
            The documents produced by the loader. For each document that
            already has a ``"source"`` metadata key, that value is replaced by
            the ``gs://<bucket>/<blob>`` URI. If the blob carries custom
            metadata, it is merged into every document's metadata.

        Raises:
            ImportError: If ``google-cloud-storage`` is not installed.
            FileNotFoundError: If the blob does not exist in the bucket.
        """
        try:
            from google.cloud import storage
        except ImportError as e:
            # Chain the cause so the underlying import failure stays visible.
            raise ImportError(
                "Could not import google-cloud-storage python package. "
                "Please install it with `pip install google-cloud-storage`."
            ) from e

        # Initialize a client
        storage_client = storage.Client(
            self.project_name, client_info=get_client_info("google-cloud-storage")
        )
        # Create a bucket object for our bucket
        bucket = storage_client.get_bucket(self.bucket)
        # Create a blob object from the filepath
        blob = bucket.blob(self.blob)
        # Retrieve custom metadata associated with the blob. `get_blob` yields
        # None for a missing object, so guard before touching `.metadata`
        # instead of failing with an opaque AttributeError.
        blob_info = bucket.get_blob(self.blob)
        if blob_info is None:
            raise FileNotFoundError(
                f"Blob gs://{self.bucket}/{self.blob} does not exist."
            )
        metadata = blob_info.metadata

        with tempfile.TemporaryDirectory() as temp_dir:
            # Plain string formatting (not os.path.join) keeps the file inside
            # temp_dir even when the blob name starts with "/".
            file_path = f"{temp_dir}/{self.blob}"
            # Blob names may contain "/" segments; create the matching folders.
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            # Download the file to a destination
            blob.download_to_filename(file_path)
            # Load while the temporary file still exists.
            loader = self._loader_func(file_path)
            docs = loader.load()
            for doc in docs:
                # Only an existing "source" entry is rewritten; the local temp
                # path it would otherwise hold is meaningless to callers.
                if "source" in doc.metadata:
                    doc.metadata["source"] = f"gs://{self.bucket}/{self.blob}"
                if metadata:
                    doc.metadata.update(metadata)
            return docs