class BlackboardLoader(WebBaseLoader):
    """Load a `Blackboard` course.

    This loader is not compatible with all Blackboard courses. It is only
    compatible with courses that use the new Blackboard interface.
    To use this loader, you must have the BbRouter cookie. You can get this
    cookie by logging into the course and then copying the value of the
    BbRouter cookie from the browser's developer tools.

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import BlackboardLoader

            loader = BlackboardLoader(
                blackboard_course_url="https://blackboard.example.com/webapps/blackboard/execute/announcement?method=search&context=course_entry&course_id=_123456_1",
                bbrouter="expires:12345...",
            )
            documents = loader.load()
    """
def __init__(
    self,
    blackboard_course_url: str,
    bbrouter: str,
    load_all_recursively: bool = True,
    basic_auth: Optional[Tuple[str, str]] = None,
    cookies: Optional[dict] = None,
    continue_on_failure: bool = False,
    show_progress: bool = True,
):
    """Initialize with blackboard course url.

    The BbRouter cookie is required for most blackboard courses.

    Args:
        blackboard_course_url: Blackboard course url.
        bbrouter: BbRouter cookie.
        load_all_recursively: If True, load all documents recursively.
        basic_auth: Basic auth credentials.
        cookies: Cookies.
        continue_on_failure: whether to continue loading the sitemap if an
            error occurs loading a url, emitting a warning instead of raising
            an exception. Setting this to True makes the loader more robust,
            but also may result in missing data. Default: False
        show_progress: whether to show a progress bar while loading.
            Default: True

    Raises:
        ValueError: If blackboard course url is invalid.
    """
    # Fix: `(blackboard_course_url)` was a parenthesized string, not a
    # tuple, so the base class received a bare str where a sequence of
    # urls is expected. The trailing comma makes a one-element tuple.
    super().__init__(
        web_paths=(blackboard_course_url,),
        continue_on_failure=continue_on_failure,
        show_progress=show_progress,
    )
    # Fix: str.split never raises IndexError for index 0, so the previous
    # try/except was dead code and invalid urls slipped through unnoticed.
    # Validate explicitly and raise ValueError as the docstring promises.
    if "/webapps/blackboard" not in blackboard_course_url:
        raise ValueError(
            "Invalid blackboard course url. "
            "Please provide a url that starts with "
            "https://<blackboard_url>/webapps/blackboard"
        )
    self.base_url = blackboard_course_url.split("/webapps/blackboard")[0]
    if basic_auth is not None:
        self.session.auth = basic_auth
    # Merge any caller-supplied cookies with the required BbRouter cookie.
    if cookies is None:
        cookies = {}
    cookies.update({"BbRouter": bbrouter})
    self.session.cookies.update(cookies)
    self.load_all_recursively = load_all_recursively
    self.check_bs4()
def check_bs4(self) -> None:
    """Verify that the BeautifulSoup4 package is importable.

    Raises:
        ImportError: If BeautifulSoup4 is not installed.
    """
    try:
        import bs4  # noqa: F401
    except ImportError:
        msg = (
            "BeautifulSoup4 is required for BlackboardLoader. "
            "Please install it with `pip install beautifulsoup4`."
        )
        raise ImportError(msg)
def load(self) -> List[Document]:
    """Load data into Document objects.

    Returns:
        List of Documents.
    """
    if not self.load_all_recursively:
        # Single-page mode: scrape only the configured course url.
        print(f"Fetching documents from {self.web_path}")  # noqa: T201
        soup_info = self.scrape()
        self.folder_path = self._get_folder_path(soup_info)
        return self._get_documents(soup_info)

    # Recursive mode: walk every page linked from the course navbar.
    landing_soup = self.scrape()
    self.folder_path = self._get_folder_path(landing_soup)
    relative_paths = self._get_paths(landing_soup)
    documents: List[Document] = []
    for path in relative_paths:
        url = self.base_url + path
        print(f"Fetching documents from {url}")  # noqa: T201
        page_soup = self._scrape(url)
        # Pages without a content list raise ValueError; skip them.
        with contextlib.suppress(ValueError):
            documents.extend(self._get_documents(page_soup))
    return documents
def _get_folder_path(self, soup: Any) -> str:
    """Build the local folder path the Documents will be saved in.

    Args:
        soup: BeautifulSoup4 soup object.

    Returns:
        Folder path.

    Raises:
        ValueError: If no course name is found on the page.
    """
    crumb = soup.find("span", {"id": "crumb_1"})
    if crumb is None:
        raise ValueError("No course name found.")
    course_name = crumb.text.strip()
    # Map every filesystem-unfriendly character to an underscore in one pass.
    unsafe = " /:,?'!\""
    course_name_clean = unquote(course_name).translate(
        str.maketrans(unsafe, "_" * len(unsafe))
    )
    return str(Path(".") / course_name_clean)

def _get_documents(self, soup: Any) -> List[Document]:
    """Fetch the page's attachments and load them as Documents.

    Args:
        soup: BeautifulSoup4 soup object.

    Returns:
        List of documents.
    """
    attachments = self._get_attachments(soup)
    self._download_attachments(attachments)
    return self._load_documents()

def _get_attachments(self, soup: Any) -> List[str]:
    """Collect all attachment hrefs on a page.

    Args:
        soup: BeautifulSoup4 soup object.

    Returns:
        List of attachment hrefs.

    Raises:
        ValueError: If the page has no content list.
    """
    from bs4 import BeautifulSoup, Tag  # noqa: F401

    content_list: BeautifulSoup = soup.find("ul", {"class": "contentList"})
    if content_list is None:
        raise ValueError("No content list found.")
    # Keep only real links: skip missing hrefs and in-page '#' anchors.
    return [
        href
        for attachment in content_list.find_all("ul", {"class": "attachments"})
        for link in attachment.find_all("a")
        if (href := link.get("href")) is not None and not href.startswith("#")
    ]

def _download_attachments(self, attachments: List[str]) -> None:
    """Download every attachment into the course folder.

    Args:
        attachments: List of attachment hrefs.
    """
    # Ensure the destination folder exists before the first write.
    Path(self.folder_path).mkdir(parents=True, exist_ok=True)
    for attachment in attachments:
        self.download(attachment)

def _load_documents(self) -> List[Document]:
    """Load every downloaded PDF in the course folder.

    Returns:
        List of documents.
    """
    pdf_loader = DirectoryLoader(
        path=self.folder_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,  # type: ignore
    )
    return pdf_loader.load()

def _get_paths(self, soup: Any) -> List[str]:
    """Get all relative paths in the navbar."""
    course_menu = soup.find("ul", {"class": "courseMenu"})
    if course_menu is None:
        raise ValueError("No course menu found.")
    return [
        href
        for link in course_menu.find_all("a")
        if (href := link.get("href")) is not None and href.startswith("/")
    ]
def download(self, path: str) -> None:
    """Download a single file and write it to the course folder.

    Args:
        path: Relative path of the file on the Blackboard server.
    """
    response = self.session.get(self.base_url + path, allow_redirects=True)
    # The final (post-redirect) url carries the real filename.
    target = Path(self.folder_path) / self.parse_filename(response.url)
    with open(target, "wb") as file_handle:
        file_handle.write(response.content)
def parse_filename(self, url: str) -> str:
    """Parse the filename from an url.

    Args:
        url: Url to parse the filename from.

    Returns:
        The filename.
    """
    url_path = Path(url)
    # Direct links already end in .pdf; otherwise the name is buried in a
    # content-disposition style query parameter.
    if url_path.suffix == ".pdf":
        return url_path.name
    return self._parse_filename_from_url(url)
def _parse_filename_from_url(self, url: str) -> str:
    """Parse the filename from a content-disposition style url.

    Args:
        url: Url to parse the filename from.

    Returns:
        The filename.

    Raises:
        ValueError: If the filename could not be parsed, or is not a PDF.
    """
    # The url embeds a double-encoded RFC 5987 marker: filename*=UTF-8''...
    filename_matches = re.search(r"filename%2A%3DUTF-8%27%27(.+)", url)
    if filename_matches:
        filename = filename_matches.group(1)
    else:
        raise ValueError(f"Could not parse filename from {url}")
    if ".pdf" not in filename:
        # Fix: the message was an f-string with no placeholder, so it printed
        # the literal text "(unknown)" instead of the offending filename.
        raise ValueError(f"Incorrect file type: {filename}")
    # Trim anything after the extension, then decode the url escapes.
    filename = filename.split(".pdf")[0] + ".pdf"
    filename = unquote(filename)
    filename = filename.replace("%20", " ")
    return filename
if __name__ == "__main__":
    # Demo run: fill in your own Blackboard url and BbRouter cookie value.
    loader = BlackboardLoader(
        "https://<YOUR BLACKBOARD URL"
        " HERE>/webapps/blackboard/content/listContent.jsp?course_id=_<YOUR COURSE ID"
        " HERE>_1&content_id=_<YOUR CONTENT ID HERE>_1&mode=reset",
        "<YOUR BBROUTER COOKIE HERE>",
        load_all_recursively=True,
    )
    docs = loader.load()
    print(f"Loaded {len(docs)} pages of PDFs from {loader.web_path}")  # noqa: T201