class DirectoryLoader(BaseLoader):
    """Load from a directory.

    Files under ``path`` are discovered with one or more glob patterns,
    optionally filtered and sampled, and each one is handed to
    ``loader_cls`` to produce documents.
    """

    def __init__(
        self,
        path: str,
        glob: Union[List[str], Tuple[str], str] = "**/[!.]*",
        silent_errors: bool = False,
        load_hidden: bool = False,
        loader_cls: FILE_LOADER_TYPE = UnstructuredFileLoader,
        loader_kwargs: Union[dict, None] = None,
        recursive: bool = False,
        show_progress: bool = False,
        use_multithreading: bool = False,
        max_concurrency: int = 4,
        *,
        exclude: Union[Sequence[str], str] = (),
        sample_size: int = 0,
        randomize_sample: bool = False,
        sample_seed: Union[int, None] = None,
    ):
        """Initialize with a path to directory and how to glob over it.

        Args:
            path: Path to directory.
            glob: A glob pattern or list of glob patterns to use to find files.
                Defaults to "**/[!.]*" (all files except hidden).
            exclude: A pattern or list of patterns to exclude from results.
                Use glob syntax.
            silent_errors: Whether to silently ignore errors. Defaults to False.
            load_hidden: Whether to load hidden files. Defaults to False.
            loader_cls: Loader class to use for loading files.
                Defaults to UnstructuredFileLoader.
            loader_kwargs: Keyword arguments to pass to loader_cls.
                Defaults to None.
            recursive: Whether to recursively search for files.
                Defaults to False.
            show_progress: Whether to show a progress bar. Defaults to False.
            use_multithreading: Whether to use multithreading.
                Defaults to False.
            max_concurrency: The maximum number of threads to use.
                Defaults to 4.
            sample_size: The maximum number of files you would like to load
                from the directory. Values <= 0 disable sampling.
            randomize_sample: Shuffle the files to get a random sample.
            sample_seed: Seed of the random shuffle, for reproducibility.
                Any integer (including 0) is honored; None leaves the
                shuffle unseeded.

        Examples:

            .. code-block:: python

                from langchain_community.document_loaders import DirectoryLoader

                # Load all non-hidden files in a directory.
                loader = DirectoryLoader("/path/to/directory")

                # Load all text files in a directory without recursion.
                loader = DirectoryLoader("/path/to/directory", glob="*.txt")

                # Recursively load all text files in a directory.
                loader = DirectoryLoader(
                    "/path/to/directory", glob="*.txt", recursive=True
                )

                # Load all files in a directory, except for py files.
                loader = DirectoryLoader("/path/to/directory", exclude="*.py")

                # Load all files in a directory, except for py or pyc files.
                loader = DirectoryLoader(
                    "/path/to/directory", exclude=["*.py", "*.pyc"]
                )
        """
        if loader_kwargs is None:
            loader_kwargs = {}
        # Normalize a single pattern to a tuple so later code can always iterate.
        if isinstance(exclude, str):
            exclude = (exclude,)
        self.path = path
        self.glob = glob
        self.exclude = exclude
        self.load_hidden = load_hidden
        self.loader_cls = loader_cls
        self.loader_kwargs = loader_kwargs
        self.silent_errors = silent_errors
        self.recursive = recursive
        self.show_progress = show_progress
        self.use_multithreading = use_multithreading
        self.max_concurrency = max_concurrency
        self.sample_size = sample_size
        self.randomize_sample = randomize_sample
        self.sample_seed = sample_seed

    def lazy_load(self) -> Iterator[Document]:
        """Load documents lazily.

        Yields:
            Documents produced by ``loader_cls`` for every selected file.

        Raises:
            FileNotFoundError: If ``self.path`` does not exist.
            ValueError: If ``self.path`` is not a directory.
            TypeError: If ``self.glob`` is neither a str nor a list/tuple.
            ImportError: If ``show_progress`` is set, tqdm is not installed
                and ``silent_errors`` is False.
        """
        p = Path(self.path)
        if not p.exists():
            raise FileNotFoundError(f"Directory not found: '{self.path}'")
        if not p.is_dir():
            raise ValueError(f"Expected directory, got file: '{self.path}'")

        # glob multiple patterns if a list is provided, e.g., multiple file
        # extensions
        if isinstance(self.glob, (list, tuple)):
            paths = []
            for pattern in self.glob:
                paths.extend(
                    p.rglob(pattern) if self.recursive else p.glob(pattern)
                )
        elif isinstance(self.glob, str):
            paths = list(p.rglob(self.glob) if self.recursive else p.glob(self.glob))
        else:
            raise TypeError(
                f"Expected glob to be str or sequence of str, but got {type(self.glob)}"
            )

        items = [
            path
            for path in paths
            if not (self.exclude and any(path.match(glob) for glob in self.exclude))
            and path.is_file()
        ]

        if self.sample_size > 0:
            if self.randomize_sample:
                # Pass the seed through unchanged: a truthiness test here would
                # turn an explicit seed of 0 into None and lose reproducibility.
                randomizer = random.Random(self.sample_seed)
                randomizer.shuffle(items)
            # Slicing already clamps to the list length.
            items = items[: self.sample_size]

        pbar = None
        if self.show_progress:
            try:
                from tqdm import tqdm
            except ImportError as e:
                logger.warning(
                    "To log the progress of DirectoryLoader you need to install tqdm, "
                    "`pip install tqdm`"
                )
                if self.silent_errors:
                    logger.warning(e)
                else:
                    raise ImportError(
                        "To log the progress of DirectoryLoader "
                        "you need to install tqdm, "
                        "`pip install tqdm`"
                    ) from e
            else:
                pbar = tqdm(total=len(items))

        try:
            if self.use_multithreading:
                # Worker threads cannot stream a generator back, so each file
                # is materialized into a list inside the worker.
                load_file = self._lazy_load_file_to_non_generator(
                    self._lazy_load_file
                )
                with concurrent.futures.ThreadPoolExecutor(
                    max_workers=self.max_concurrency
                ) as executor:
                    futures = [
                        executor.submit(load_file, i, p, pbar) for i in items
                    ]
                    for future in concurrent.futures.as_completed(futures):
                        yield from future.result()
            else:
                for i in items:
                    yield from self._lazy_load_file(i, p, pbar)
        finally:
            # Close the bar on errors and early generator shutdown as well.
            # Compare against None because a tqdm bar with total == 0 is falsy.
            if pbar is not None:
                pbar.close()

    def _lazy_load_file_to_non_generator(self, func: Callable) -> Callable:
        """Wrap a generator-returning ``func`` so that it returns a list.

        Args:
            func: Callable taking ``(item, path, pbar)`` and returning an
                iterable of documents.

        Returns:
            A callable with the same parameters that returns the fully
            consumed results as a list.
        """

        def non_generator(item: Path, path: Path, pbar: Optional[Any]) -> List:
            return list(func(item, path, pbar))

        return non_generator

    def _lazy_load_file(
        self, item: Path, path: Path, pbar: Optional[Any]
    ) -> Iterator[Document]:
        """Load a file.

        Args:
            item: File path.
            path: Directory path.
            pbar: Progress bar. Defaults to None.

        Yields:
            Documents produced by ``loader_cls`` for ``item``.

        Raises:
            Exception: Any error raised while loading is re-raised unless
                ``silent_errors`` is set, in which case it is logged.
        """
        if not item.is_file():
            return
        if not (_is_visible(item.relative_to(path)) or self.load_hidden):
            return

        try:
            logger.debug(f"Processing file: {str(item)}")
            loader = self.loader_cls(str(item), **self.loader_kwargs)
            try:
                yield from loader.lazy_load()
            except NotImplementedError:
                # Loader has no lazy implementation; fall back to eager load.
                yield from loader.load()
        except Exception as e:
            if self.silent_errors:
                logger.warning(f"Error loading file {str(item)}: {e}")
            else:
                logger.error(f"Error loading file {str(item)}")
                raise
        finally:
            if pbar is not None:
                pbar.update(1)