# Source code for langchain_community.document_loaders.blob_loaders.file_system
"""Use to load blobs from the local file system."""

from pathlib import Path
from typing import (
    Callable,
    Iterable,
    Iterator,
    Optional,
    Sequence,
    TypeVar,
    Union,
)

from langchain_community.document_loaders.blob_loaders.schema import (
    Blob,
    BlobLoader,
)

T = TypeVar("T")


def _make_iterator(
    length_func: Callable[[], int], show_progress: bool = False
) -> Callable[[Iterable[T]], Iterator[T]]:
    """Create a function that optionally wraps an iterable in tqdm.

    Args:
        length_func: Zero-argument callable returning the total number of
            items. It is only invoked when ``show_progress`` is true, at the
            moment the returned wrapper is applied to an iterable.
        show_progress: If true, the returned function wraps its argument in a
            tqdm progress bar; otherwise the builtin ``iter`` is returned.

    Returns:
        A callable that turns an iterable into an iterator.

    Raises:
        ImportError: If ``show_progress`` is true and tqdm cannot be imported.
    """
    if not show_progress:
        return iter

    # Import lazily so tqdm is only required when a progress bar is requested.
    try:
        from tqdm.auto import tqdm
    except ImportError as exc:
        # Note the trailing space: the two literals are concatenated into a
        # single message.
        raise ImportError(
            "You must install tqdm to use show_progress=True. "
            "You can install tqdm with `pip install tqdm`."
        ) from exc

    def _with_tqdm(iterable: Iterable[T]) -> Iterator[T]:
        """Wrap an iterable in a tqdm progress bar."""
        # Make sure to provide `total` here so that tqdm can show a progress
        # bar that takes into account the total number of files.
        return tqdm(iterable, total=length_func())

    return _with_tqdm


# PUBLIC API
class FileSystemBlobLoader(BlobLoader):
    """Load blobs in the local file system.

    Example:

    .. code-block:: python

        from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader
        loader = FileSystemBlobLoader("/path/to/directory")
        for blob in loader.yield_blobs():
            print(blob)  # noqa: T201
    """  # noqa: E501

    def __init__(
        self,
        path: Union[str, Path],
        *,
        glob: str = "**/[!.]*",
        exclude: Sequence[str] = (),
        suffixes: Optional[Sequence[str]] = None,
        show_progress: bool = False,
    ) -> None:
        """Initialize with a path to directory and how to glob over it.

        Args:
            path: Path to directory to load from or path to file to load.
                If a path to a file is provided, glob/exclude/suffixes are
                ignored.
            glob: Glob pattern relative to the specified path;
                by default set to pick up all non-hidden files.
            exclude: Patterns to exclude from results, use glob syntax.
                A single pattern given as a plain string is treated as a
                one-element sequence.
            suffixes: Provide to keep only files with these suffixes.
                Useful when wanting to keep files with different suffixes.
                Suffixes must include the dot, e.g. ".txt".
                A single suffix given as a plain string is treated as a
                one-element sequence.
            show_progress: If true, will show a progress bar as the files are
                loaded. This forces an iteration through all matching files
                to count them prior to loading them.

        Raises:
            TypeError: If ``path`` is neither a ``str`` nor a ``Path``.

        Examples:

        .. code-block:: python

            from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader

            # Load a single file.
            loader = FileSystemBlobLoader("/path/to/file.txt")

            # Recursively load all text files in a directory.
            loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")

            # Recursively load all non-hidden files in a directory.
            loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*")

            # Load all files in a directory without recursion.
            loader = FileSystemBlobLoader("/path/to/directory", glob="*")

            # Recursively load all files in a directory, except for py or pyc files.
            loader = FileSystemBlobLoader(
                "/path/to/directory",
                glob="**/*",
                exclude=["**/*.py", "**/*.pyc"]
            )
        """  # noqa: E501
        if isinstance(path, Path):
            _path = path
        elif isinstance(path, str):
            _path = Path(path)
        else:
            raise TypeError(f"Expected str or Path, got {type(path)}")

        # A bare string is itself a sequence of one-character strings:
        # iterating it would split a pattern/suffix into single characters
        # (e.g. ".txt" -> {".", "t", "x"}), so wrap it in a tuple instead.
        if isinstance(exclude, str):
            exclude = (exclude,)
        if isinstance(suffixes, str):
            suffixes = (suffixes,)

        self.path = _path.expanduser()  # Expand user to handle ~
        self.glob = glob
        self.suffixes = set(suffixes or [])
        self.show_progress = show_progress
        self.exclude = exclude

    def yield_blobs(
        self,
    ) -> Iterable[Blob]:
        """Yield blobs that match the requested pattern.

        Each matching path is converted with ``Blob.from_path``. When
        ``show_progress`` is set, the paths are wrapped in a tqdm progress
        bar whose total comes from ``count_matching_files``.
        """
        iterator = _make_iterator(
            length_func=self.count_matching_files,
            show_progress=self.show_progress,
        )

        for path in iterator(self._yield_paths()):
            yield Blob.from_path(path)

    def _yield_paths(self) -> Iterable[Path]:
        """Yield paths that match the requested pattern.

        If ``self.path`` is a file it is yielded as-is and the glob, exclude
        and suffix filters are not applied.
        """
        if self.path.is_file():
            yield self.path
            return

        for candidate in self.path.glob(self.glob):
            # Drop anything matching one of the exclusion patterns.
            if self.exclude and any(
                candidate.match(pattern) for pattern in self.exclude
            ):
                continue
            # The glob may also return directories; keep files only.
            if not candidate.is_file():
                continue
            # An empty suffix set means "no suffix filtering".
            if self.suffixes and candidate.suffix not in self.suffixes:
                continue
            yield candidate

    def count_matching_files(self) -> int:
        """Count files that match the pattern without loading them."""
        # Carry out a full iteration to count the files without
        # materializing anything expensive in memory.
        return sum(1 for _ in self._yield_paths())