Source code for langchain_community.document_loaders.bilibili
import json
import re
import warnings
from typing import List, Tuple

import requests
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader

# Pre-compile regular expressions for video ID extraction
BV_PATTERN = re.compile(r"BV\w+")
AV_PATTERN = re.compile(r"av[0-9]+")
# Anchor on the query-string delimiter so that only a parameter literally named
# `p` is matched. An unanchored `p=` also matches the tail of any other
# parameter name ending in "p" (e.g. `timestamp=123` or `top=3`), which would
# be misread as a page index. Group 1 is still the page number digits.
PAGE_INDEX_PATTERN = re.compile(r"[?&]p=(\d+)")
class BiliBiliLoader(BaseLoader):
    """Load video info and transcripts from BiliBili videos.

    Transcripts are only fetched when authentication cookies are supplied;
    without them each document carries an empty transcript and the video info
    as metadata.
    """

    def __init__(
        self,
        video_urls: List[str],
        sessdata: str = "",
        bili_jct: str = "",
        buvid3: str = "",
    ):
        """Initialize the loader with BiliBili video URLs and authentication cookies.

        If no authentication cookies are provided, the loader can't get
        transcripts and will only fetch video info.

        Args:
            video_urls (List[str]): List of BiliBili video URLs.
            sessdata (str): SESSDATA cookie value for authentication.
            bili_jct (str): BILI_JCT cookie value for authentication.
            buvid3 (str): BUVID3 cookie value for authentication.

        Raises:
            ImportError: If the `bilibili-api-python` package is not installed.
        """
        self.video_urls = video_urls
        self.credential = None
        try:
            from bilibili_api import video
        except ImportError as exc:
            raise ImportError(
                "bilibili-api-python package not found, please install it with "
                "`pip install bilibili-api-python`"
            ) from exc
        # A credential is only built when all three cookies are present.
        if sessdata and bili_jct and buvid3:
            self.credential = video.Credential(
                sessdata=sessdata, bili_jct=bili_jct, buvid3=buvid3
            )

    def load(self) -> List[Document]:
        """Load and return a list of documents containing video transcripts.

        Returns:
            List[Document]: One Document per URL, with the transcript as page
                content and the video info as metadata.
        """
        results = []
        for url in self.video_urls:
            transcript, video_info = self._get_bilibili_subs_and_info(url)
            results.append(Document(page_content=transcript, metadata=video_info))
        return results

    def _get_bilibili_subs_and_info(self, url: str) -> Tuple[str, dict]:
        """Retrieve video information and transcript for a given BiliBili URL.

        Args:
            url (str): A BiliBili video URL containing a BV or av video ID and,
                optionally, a 1-based `p=<n>` page selector.

        Returns:
            Tuple[str, dict]: The transcript text (empty when no credential is
                set or no subtitles could be fetched) and the video info dict,
                extended with the original `url`.

        Raises:
            ImportError: If the `bilibili-api-python` package is not installed.
            ValueError: If no BV or av video ID can be found in the URL.
            IndexError: If the page selector in the URL is outside the range of
                pages reported for the video.
        """
        bvid = BV_PATTERN.search(url)
        try:
            from bilibili_api import sync, video
        except ImportError as exc:
            raise ImportError(
                "bilibili-api-python package not found, please install it with "
                "`pip install bilibili-api-python`"
            ) from exc

        if bvid:
            v = video.Video(bvid=bvid.group(), credential=self.credential)
        else:
            aid = AV_PATTERN.search(url)
            if not aid:
                raise ValueError(f"Unable to find a valid video ID in URL: {url}")
            # Strip the leading "av" to obtain the numeric ID.
            v = video.Video(aid=int(aid.group()[2:]), credential=self.credential)

        video_info = sync(v.get_info())
        video_info.update({"url": url})

        # Return if no credential is provided
        if not self.credential:
            return "", video_info

        page_match = PAGE_INDEX_PATTERN.search(url)
        if page_match:
            # Bilibili page index starts from 1
            page_number = int(page_match.group(1))
            pages = video_info["pages"]
            # Reject 0 explicitly: `pages[0 - 1]` would otherwise silently
            # select the last page instead of failing.
            if not 1 <= page_number <= len(pages):
                raise IndexError(
                    f"Page index {page_number} is out of range for {url}; "
                    f"the video has {len(pages)} page(s)."
                )
            cid = pages[page_number - 1]["cid"]
        else:
            cid = video_info["cid"]

        # Fetching and processing subtitles
        sub = sync(v.get_subtitle(cid))
        sub_list = sub.get("subtitles", [])
        if not sub_list:
            warnings.warn(
                f"No subtitles found for video: {url}. "
                "Returning empty transcript."
            )
            return "", video_info

        # Only the first listed subtitle track is used.
        sub_url = sub_list[0].get("subtitle_url", "")
        if not sub_url:
            # Without this guard an empty value would become the invalid URL
            # "https:" and fail inside `requests.get`.
            warnings.warn(
                f"No subtitle URL found for video: {url}. "
                "Returning empty transcript."
            )
            return "", video_info
        if not sub_url.startswith("http"):
            # The URL lacks a scheme here, so prefix one before requesting it.
            sub_url = "https:" + sub_url

        response = requests.get(sub_url)
        if response.status_code != 200:
            warnings.warn(
                f"Failed to fetch subtitles for {url}. "
                f"HTTP Status Code: {response.status_code}"
            )
            return "", video_info

        raw_sub_titles = json.loads(response.content).get("body", [])
        raw_transcript = " ".join(c["content"] for c in raw_sub_titles)
        raw_transcript_with_meta_info = (
            f"Video Title: {video_info['title']}, "
            f"description: {video_info['desc']}\n\n"
            f"Transcript: {raw_transcript}"
        )
        return raw_transcript_with_meta_info, video_info