Source code for langchain_community.document_loaders.blob_loaders.file_system

"""Use to load blobs from the local file system."""

from pathlib import Path
from typing import Callable, Iterable, Iterator, Optional, Sequence, TypeVar, Union

from langchain_community.document_loaders.blob_loaders.schema import Blob, BlobLoader

T = TypeVar("T")


def _make_iterator(
    length_func: Callable[[], int], show_progress: bool = False
) -> Callable[[Iterable[T]], Iterator[T]]:
    """Create a function that optionally wraps an iterable in tqdm."""
    iterator: Callable[[Iterable[T]], Iterator[T]]
    if show_progress:
        try:
            from tqdm.auto import tqdm
        except ImportError:
            raise ImportError(
                "You must install tqdm to use show_progress=True."
                "You can install tqdm with `pip install tqdm`."
            )

        # Make sure to provide `total` here so that tqdm can show
        # a progress bar that takes into account the total number of files.
        def _with_tqdm(iterable: Iterable[T]) -> Iterator[T]:
            """Wrap an iterable in a tqdm progress bar."""
            return tqdm(iterable, total=length_func())

        iterator = _with_tqdm
    else:
        iterator = iter

    return iterator


# PUBLIC API


[docs] class FileSystemBlobLoader(BlobLoader): """Load blobs in the local file system. Example: .. code-block:: python from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader loader = FileSystemBlobLoader("/path/to/directory") for blob in loader.yield_blobs(): print(blob) # noqa: T201 """ # noqa: E501
[docs] def __init__( self, path: Union[str, Path], *, glob: str = "**/[!.]*", exclude: Sequence[str] = (), suffixes: Optional[Sequence[str]] = None, show_progress: bool = False, ) -> None: """Initialize with a path to directory and how to glob over it. Args: path: Path to directory to load from or path to file to load. If a path to a file is provided, glob/exclude/suffixes are ignored. glob: Glob pattern relative to the specified path by default set to pick up all non-hidden files exclude: patterns to exclude from results, use glob syntax suffixes: Provide to keep only files with these suffixes Useful when wanting to keep files with different suffixes Suffixes must include the dot, e.g. ".txt" show_progress: If true, will show a progress bar as the files are loaded. This forces an iteration through all matching files to count them prior to loading them. Examples: .. code-block:: python from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader # Load a single file. loader = FileSystemBlobLoader("/path/to/file.txt") # Recursively load all text files in a directory. loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt") # Recursively load all non-hidden files in a directory. loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*") # Load all files in a directory without recursion. loader = FileSystemBlobLoader("/path/to/directory", glob="*") # Recursively load all files in a directory, except for py or pyc files. loader = FileSystemBlobLoader( "/path/to/directory", glob="**/*.txt", exclude=["**/*.py", "**/*.pyc"] ) """ # noqa: E501 if isinstance(path, Path): _path = path elif isinstance(path, str): _path = Path(path) else: raise TypeError(f"Expected str or Path, got {type(path)}") self.path = _path.expanduser() # Expand user to handle ~ self.glob = glob self.suffixes = set(suffixes or []) self.show_progress = show_progress self.exclude = exclude
[docs] def yield_blobs( self, ) -> Iterable[Blob]: """Yield blobs that match the requested pattern.""" iterator = _make_iterator( length_func=self.count_matching_files, show_progress=self.show_progress ) for path in iterator(self._yield_paths()): yield Blob.from_path(path)
def _yield_paths(self) -> Iterable[Path]: """Yield paths that match the requested pattern.""" if self.path.is_file(): yield self.path return paths = self.path.glob(self.glob) for path in paths: if self.exclude: if any(path.match(glob) for glob in self.exclude): continue if path.is_file(): if self.suffixes and path.suffix not in self.suffixes: continue yield path
[docs] def count_matching_files(self) -> int: """Count files that match the pattern without loading them.""" # Carry out a full iteration to count the files without # materializing anything expensive in memory. num = 0 for _ in self._yield_paths(): num += 1 return num