class GitLoader(BaseLoader):
    """Load `Git` repository files.

    The repository can be local on disk, available at `repo_path`, or remote
    at `clone_url`, in which case it is cloned to `repo_path`.
    Currently, only text (UTF-8 decodable) files are supported.

    Each document represents one file in the repository. `repo_path` points
    to the local Git repository, and `branch` specifies the branch to load
    files from. By default, files are loaded from the `main` branch.
    """

    def __init__(
        self,
        repo_path: str,
        clone_url: Optional[str] = None,
        branch: Optional[str] = "main",
        file_filter: Optional[Callable[[str], bool]] = None,
    ):
        """Store the loader configuration; no Git work happens here.

        Args:
            repo_path: The path to the Git repository.
            clone_url: Optional. The URL to clone the repository from.
            branch: Optional. The branch to load files from. Defaults to `main`.
            file_filter: Optional. A function that takes a file path and returns
              a boolean indicating whether to load the file. Defaults to None.
        """
        self.repo_path = repo_path
        self.clone_url = clone_url
        self.branch = branch
        self.file_filter = file_filter

    def lazy_load(self) -> Iterator[Document]:
        """Check out `branch` and lazily yield one Document per text file.

        Yields:
            A Document whose `page_content` is the UTF-8 decoded file content
            and whose metadata holds `source`, `file_path` (both the path
            relative to `repo_path`), `file_name` and `file_type` (extension).

        Raises:
            ImportError: If GitPython is not installed.
            ValueError: If `repo_path` does not exist and no `clone_url` was
                given, or if `repo_path` already holds a repository whose
                `origin` remote is missing or differs from `clone_url`.
        """
        try:
            from git import Blob, Repo
        except ImportError as ex:
            raise ImportError(
                "Could not import git python package. "
                "Please install it with `pip install GitPython`."
            ) from ex

        if not os.path.exists(self.repo_path) and self.clone_url is None:
            raise ValueError(f"Path {self.repo_path} does not exist")
        elif self.clone_url:
            # If the repo_path already contains a git repository, verify that
            # it's the same repository as the one we're trying to clone.
            if os.path.isdir(os.path.join(self.repo_path, ".git")):
                repo = Repo(self.repo_path)
                # Look the remote up by name instead of `repo.remotes.origin`,
                # so a checkout without an `origin` remote is reported as a
                # mismatch rather than failing with an AttributeError.
                origin_url = next(
                    (
                        remote.url
                        for remote in repo.remotes
                        if remote.name == "origin"
                    ),
                    None,
                )
                if origin_url != self.clone_url:
                    raise ValueError(
                        "A different repository is already cloned at this path."
                    )
            else:
                repo = Repo.clone_from(self.clone_url, self.repo_path)
            repo.git.checkout(self.branch)
        else:
            repo = Repo(self.repo_path)
            repo.git.checkout(self.branch)

        for item in repo.tree().traverse():
            # Only blobs are files; skip trees and any other object types.
            if not isinstance(item, Blob):
                continue

            file_path = os.path.join(self.repo_path, item.path)

            # Skip files that git reports as ignored.
            if repo.ignored([file_path]):  # type: ignore[arg-type]
                continue

            # Use the caller-supplied filter to skip files.
            if self.file_filter and not self.file_filter(file_path):
                continue

            rel_file_path = os.path.relpath(file_path, self.repo_path)

            # Best-effort read: report the failure and move on. The `try`
            # covers only the read so that errors raised elsewhere (e.g.
            # thrown into this generator at the `yield`) are not mislabelled
            # as read errors and swallowed.
            try:
                with open(file_path, "rb") as f:
                    content = f.read()
            except Exception as e:
                print(f"Error reading file {file_path}: {e}")  # noqa: T201
                continue

            # Load only text files: anything that is not valid UTF-8 is skipped.
            try:
                text_content = content.decode("utf-8")
            except UnicodeDecodeError:
                continue

            metadata = {
                "source": rel_file_path,
                "file_path": rel_file_path,
                "file_name": item.name,
                "file_type": os.path.splitext(item.name)[1],
            }
            yield Document(page_content=text_content, metadata=metadata)