[docs]classSlackDirectoryLoader(BaseLoader):"""Load from a `Slack` directory dump."""
[docs]def__init__(self,zip_path:Union[str,Path],workspace_url:Optional[str]=None):"""Initialize the SlackDirectoryLoader. Args: zip_path (str): The path to the Slack directory dump zip file. workspace_url (Optional[str]): The Slack workspace URL. Including the URL will turn sources into links. Defaults to None. """self.zip_path=Path(zip_path)self.workspace_url=workspace_urlself.channel_id_map=self._get_channel_id_map(self.zip_path)
@staticmethoddef_get_channel_id_map(zip_path:Path)->Dict[str,str]:"""Get a dictionary mapping channel names to their respective IDs."""withzipfile.ZipFile(zip_path,"r")aszip_file:try:withzip_file.open("channels.json","r")asf:channels=json.load(f)return{channel["name"]:channel["id"]forchannelinchannels}exceptKeyError:return{}
[docs]deflazy_load(self)->Iterator[Document]:"""Load and return documents from the Slack directory dump."""withzipfile.ZipFile(self.zip_path,"r")aszip_file:forchannel_pathinzip_file.namelist():channel_name=Path(channel_path).parent.nameifnotchannel_name:continueifchannel_path.endswith(".json"):messages=self._read_json(zip_file,channel_path)formessageinmessages:yieldself._convert_message_to_document(message,channel_name)
def_read_json(self,zip_file:zipfile.ZipFile,file_path:str)->List[dict]:"""Read JSON data from a zip subfile."""withzip_file.open(file_path,"r")asf:data=json.load(f)returndatadef_convert_message_to_document(self,message:dict,channel_name:str)->Document:""" Convert a message to a Document object. Args: message (dict): A message in the form of a dictionary. channel_name (str): The name of the channel the message belongs to. Returns: Document: A Document object representing the message. """text=message.get("text","")metadata=self._get_message_metadata(message,channel_name)returnDocument(page_content=text,metadata=metadata,)def_get_message_metadata(self,message:dict,channel_name:str)->dict:"""Create and return metadata for a given message and channel."""timestamp=message.get("ts","")user=message.get("user","")source=self._get_message_source(channel_name,user,timestamp)return{"source":source,"channel":channel_name,"timestamp":timestamp,"user":user,}def_get_message_source(self,channel_name:str,user:str,timestamp:str)->str:""" Get the message source as a string. Args: channel_name (str): The name of the channel the message belongs to. user (str): The user ID who sent the message. timestamp (str): The timestamp of the message. Returns: str: The message source. """ifself.workspace_url:channel_id=self.channel_id_map.get(channel_name,"")return(f"{self.workspace_url}/archives/{channel_id}"+f"/p{timestamp.replace('.','')}")else:returnf"{channel_name} - {user} - {timestamp}"