@dataclass
class GoogleApiClient:
    """Generic Google API client that resolves credentials on construction.

    To use, you should have the ``google_auth_oauthlib``,
    ``youtube_transcript_api`` and ``google`` python packages installed.
    As the Google API expects credentials, you need to set up a Google
    account and register your service; see
    https://developers.google.com/docs/api/quickstart/python

    *Security Note*: Note that parsing of the transcripts relies on the
        standard xml library but the input is viewed as trusted in this case.

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import GoogleApiClient
            google_api_client = GoogleApiClient(
                service_account_path=Path("path_to_your_sec_file.json")
            )
    """

    # OAuth client-secrets file used to start the interactive installed-app flow.
    credentials_path: Path = Path.home() / ".credentials" / "credentials.json"
    # Service-account key file; when it exists it takes precedence over OAuth.
    service_account_path: Path = Path.home() / ".credentials" / "credentials.json"
    # Where the authorized-user token is read from and written back to.
    token_path: Path = Path.home() / ".credentials" / "token.json"

    def __post_init__(self) -> None:
        """Load the credentials once and keep them on ``self.creds``."""
        self.creds = self._load_credentials()

    @root_validator(pre=True)
    def validate_channel_or_videoIds_is_set(
        cls, values: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Validate that a credentials source was supplied.

        The method name is kept for backward compatibility; what is actually
        checked is that at least one of ``credentials_path`` or
        ``service_account_path`` is present (and truthy) in ``values``.

        Args:
            values: Raw field values handed to the validator.

        Returns:
            The unchanged ``values`` mapping.

        Raises:
            ValueError: If neither path is provided.
        """
        if not values.get("credentials_path") and not values.get(
            "service_account_path"
        ):
            raise ValueError(
                "Must specify either credentials_path or service_account_path"
            )
        return values

    def _load_credentials(self) -> Any:
        """Load credentials.

        Resolution order:

        1. If ``service_account_path`` exists, return service-account
           credentials built from that file.
        2. Otherwise, reuse the authorized-user token at ``token_path`` when
           it exists; refresh it if it is expired and has a refresh token.
        3. Otherwise, run the local-server OAuth flow using the client
           secrets at ``credentials_path``.

        In cases 2 and 3, whenever the credentials had to be refreshed or
        newly obtained they are written back to ``token_path``.

        Returns:
            The credentials object to pass to the Google API client.

        Raises:
            ImportError: If the required Google / transcript packages are
                not installed.
        """
        # Adapted from https://developers.google.com/drive/api/v3/quickstart/python
        try:
            from google.auth.transport.requests import Request
            from google.oauth2 import service_account
            from google.oauth2.credentials import Credentials
            from google_auth_oauthlib.flow import InstalledAppFlow
            from youtube_transcript_api import YouTubeTranscriptApi  # noqa: F401
        except ImportError as e:
            raise ImportError(
                "You must run "
                "`pip install --upgrade "
                "google-api-python-client google-auth-httplib2 "
                "google-auth-oauthlib "
                "youtube-transcript-api` "
                "to use the YouTube loader"
            ) from e

        if self.service_account_path.exists():
            return service_account.Credentials.from_service_account_file(
                str(self.service_account_path)
            )

        creds = None
        if self.token_path.exists():
            creds = Credentials.from_authorized_user_file(str(self.token_path), SCOPES)

        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    str(self.credentials_path), SCOPES
                )
                creds = flow.run_local_server(port=0)
            # Make sure the target directory exists so a freshly obtained
            # token is not lost to a FileNotFoundError after the OAuth flow.
            self.token_path.parent.mkdir(parents=True, exist_ok=True)
            with open(self.token_path, "w") as token:
                token.write(creds.to_json())

        return creds
ALLOWED_SCHEMES={"http","https"}ALLOWED_NETLOCS={"youtu.be","m.youtube.com","youtube.com","www.youtube.com","www.youtube-nocookie.com","vid.plus",}def_parse_video_id(url:str)->Optional[str]:"""Parse a YouTube URL and return the video ID if valid, otherwise None."""parsed_url=urlparse(url)ifparsed_url.schemenotinALLOWED_SCHEMES:returnNoneifparsed_url.netlocnotinALLOWED_NETLOCS:returnNonepath=parsed_url.pathifpath.endswith("/watch"):query=parsed_url.queryparsed_query=parse_qs(query)if"v"inparsed_query:ids=parsed_query["v"]video_id=idsifisinstance(ids,str)elseids[0]else:returnNoneelse:path=parsed_url.path.lstrip("/")video_id=path.split("/")[-1]iflen(video_id)!=11:# Video IDs are 11 characters longreturnNonereturnvideo_id
class TranscriptFormat(Enum):
    """Output formats of transcripts from `YoutubeLoader`."""

    # The whole transcript joined into a single document.
    TEXT = "text"
    # One document per transcript piece.
    LINES = "lines"
    # Transcript pieces grouped into time-window chunks.
    CHUNKS = "chunks"
class YoutubeLoader(BaseLoader):
    """Load `YouTube` video transcripts."""

    def __init__(
        self,
        video_id: str,
        add_video_info: bool = False,
        language: Union[str, Sequence[str]] = "en",
        translation: Optional[str] = None,
        transcript_format: TranscriptFormat = TranscriptFormat.TEXT,
        continue_on_failure: bool = False,
        chunk_size_seconds: int = 120,
    ):
        """Initialize with YouTube video ID.

        Args:
            video_id: ID of the video whose transcript is loaded.
            add_video_info: When true, ``load`` merges the result of
                ``_get_video_info`` into the document metadata.
            language: Preferred transcript language code, or a sequence of
                codes; a single string is wrapped in a list.
            translation: If not ``None``, the transcript is translated to
                this language before fetching.
            transcript_format: Shape of the returned documents.
            continue_on_failure: Stored on the instance; the methods of this
                class do not read it.
            chunk_size_seconds: Window length used for
                ``TranscriptFormat.CHUNKS``.
        """
        self.video_id = video_id
        self._metadata = {"source": video_id}
        self.add_video_info = add_video_info
        self.language = [language] if isinstance(language, str) else language
        self.translation = translation
        self.transcript_format = transcript_format
        self.continue_on_failure = continue_on_failure
        self.chunk_size_seconds = chunk_size_seconds

    @staticmethod
    def extract_video_id(youtube_url: str) -> str:
        """Extract video ID from common YouTube URLs.

        Raises:
            ValueError: If no video ID can be determined from the URL.
        """
        parsed_id = _parse_video_id(youtube_url)
        if parsed_id:
            return parsed_id
        raise ValueError(
            f'Could not determine the video ID for the URL "{youtube_url}".'
        )

    @classmethod
    def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
        """Given a YouTube URL, construct a loader.

        See `YoutubeLoader()` constructor for a list of keyword arguments.
        """
        return cls(cls.extract_video_id(youtube_url), **kwargs)

    def _make_chunk_document(
        self, chunk_pieces: List[Dict], chunk_start_seconds: int
    ) -> Document:
        """Create Document from chunk of transcript pieces.

        The page content is the pieces' texts (stripped of surrounding
        spaces) joined by single spaces. The metadata copies the loader
        metadata, adds the chunk start in seconds and as ``HH:MM:SS``, and
        replaces ``source`` with a watch URL that jumps to the chunk start.
        """
        minutes, seconds = divmod(chunk_start_seconds, 60)
        hours, minutes = divmod(minutes, 60)
        joined_text = " ".join(piece["text"].strip(" ") for piece in chunk_pieces)
        return Document(
            page_content=joined_text,
            metadata={
                **self._metadata,
                "start_seconds": chunk_start_seconds,
                "start_timestamp": f"{hours:02d}:{minutes:02d}:{seconds:02d}",
                # replace video ID with URL to start time
                "source": f"https://www.youtube.com/watch?v={self.video_id}"
                f"&t={chunk_start_seconds}s",
            },
        )

    def _get_transcript_chunks(
        self, transcript_pieces: List[Dict]
    ) -> Generator[Document, None, None]:
        """Yield one Document per time window of ``chunk_size_seconds``.

        A piece whose end time (``start + duration``) passes the current
        window limit closes the current chunk and opens the next window.
        """
        # NOTE(review): the window advances by at most one step per piece,
        # so after a gap longer than ``chunk_size_seconds`` the recorded
        # chunk start can lag behind the pieces' real start time.
        pending: List[Dict[str, Any]] = []
        window_start = 0
        window_end = self.chunk_size_seconds
        for piece in transcript_pieces:
            if piece["start"] + piece["duration"] > window_end:
                if pending:
                    yield self._make_chunk_document(pending, window_start)
                pending = []
                window_start = window_end
                window_end += self.chunk_size_seconds
            pending.append(piece)

        if pending:
            yield self._make_chunk_document(pending, window_start)

    def load(self) -> List[Document]:
        """Load YouTube transcripts into `Document` objects.

        Returns an empty list when transcripts are disabled for the video.
        If no transcript exists for the configured languages, the English
        transcript is looked up instead.

        Raises:
            ImportError: If ``youtube_transcript_api`` is not installed.
            ValueError: If ``transcript_format`` is not a known format.
        """
        try:
            from youtube_transcript_api import (
                NoTranscriptFound,
                TranscriptsDisabled,
                YouTubeTranscriptApi,
            )
        except ImportError:
            raise ImportError(
                'Could not import "youtube_transcript_api" Python package. '
                "Please install it with `pip install youtube-transcript-api`."
            )

        if self.add_video_info:
            # Get more video meta info
            # Such as title, description, thumbnail url, publish_date
            self._metadata.update(self._get_video_info())

        try:
            available = YouTubeTranscriptApi.list_transcripts(self.video_id)
        except TranscriptsDisabled:
            return []

        try:
            selected = available.find_transcript(self.language)
        except NoTranscriptFound:
            selected = available.find_transcript(["en"])

        if self.translation is not None:
            selected = selected.translate(self.translation)

        pieces: List[Dict[str, Any]] = selected.fetch()
        output_format = self.transcript_format

        if output_format == TranscriptFormat.TEXT:
            full_text = " ".join([piece["text"].strip(" ") for piece in pieces])
            return [Document(page_content=full_text, metadata=self._metadata)]

        if output_format == TranscriptFormat.LINES:
            # Every key of a piece except its text becomes document metadata.
            return [
                Document(
                    page_content=piece["text"].strip(" "),
                    metadata={
                        key: value for key, value in piece.items() if key != "text"
                    },
                )
                for piece in pieces
            ]

        if output_format == TranscriptFormat.CHUNKS:
            return list(self._get_transcript_chunks(pieces))

        raise ValueError("Unknown transcript format.")

    def _get_video_info(self) -> Dict:
        """Get important video information.

        Components include:
            - title
            - description
            - view count
            - thumbnail URL
            - publish date
            - length
            - author

        Missing text values are reported as ``"Unknown"`` and missing
        numeric values as ``0``.

        Raises:
            ImportError: If ``pytube`` is not installed.
        """
        try:
            from pytube import YouTube
        except ImportError:
            raise ImportError(
                'Could not import "pytube" Python package. '
                "Please install it with `pip install pytube`."
            )

        yt = YouTube(f"https://www.youtube.com/watch?v={self.video_id}")
        video_info: Dict[str, Any] = {}
        video_info["title"] = yt.title or "Unknown"
        video_info["description"] = yt.description or "Unknown"
        video_info["view_count"] = yt.views or 0
        video_info["thumbnail_url"] = yt.thumbnail_url or "Unknown"
        if yt.publish_date:
            video_info["publish_date"] = yt.publish_date.strftime("%Y-%m-%d %H:%M:%S")
        else:
            video_info["publish_date"] = "Unknown"
        video_info["length"] = yt.length or 0
        video_info["author"] = yt.author or "Unknown"
        return video_info
@dataclass
class GoogleApiYoutubeLoader(BaseLoader):
    """Load all Videos from a `YouTube` Channel.

    To use, you should have the ``googleapiclient`` and
    ``youtube_transcript_api`` python packages installed.
    As the service needs a google_api_client, you first have to initialize
    the GoogleApiClient.

    Additionally you have to either provide a channel name or a list of
    video ids; see
    https://developers.google.com/docs/api/quickstart/python

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import GoogleApiClient
            from langchain_community.document_loaders import GoogleApiYoutubeLoader
            google_api_client = GoogleApiClient(
                service_account_path=Path("path_to_your_sec_file.json")
            )
            loader = GoogleApiYoutubeLoader(
                google_api_client=google_api_client,
                channel_name="CodeAesthetic"
            )
            loader.load()
    """

    # Client whose ``creds`` are used to build the YouTube API service.
    google_api_client: GoogleApiClient
    # Channel to search for; takes precedence over ``video_ids`` in ``load``.
    channel_name: Optional[str] = None
    # Explicit video IDs to load when no channel name is given.
    video_ids: Optional[List[str]] = None
    # When loading a channel, merge each item's snippet into the metadata.
    add_video_info: bool = True
    # Language code requested for (or translated into for) the captions.
    captions_language: str = "en"
    # When loading a channel, log and skip videos whose transcript fails.
    continue_on_failure: bool = False

    def __post_init__(self) -> None:
        """Build the YouTube API client from the supplied credentials."""
        self.youtube_client = self._build_youtube_client(self.google_api_client.creds)

    def _build_youtube_client(self, creds: Any) -> Any:
        """Return a YouTube Data API v3 service built with ``creds``.

        Raises:
            ImportError: If the Google API client or the transcript package
                is not installed.
        """
        try:
            from googleapiclient.discovery import build
            from youtube_transcript_api import YouTubeTranscriptApi  # noqa: F401
        except ImportError as e:
            raise ImportError(
                "You must run "
                "`pip install --upgrade "
                "google-api-python-client google-auth-httplib2 "
                "google-auth-oauthlib "
                "youtube-transcript-api` "
                "to use the YouTube loader"
            ) from e

        return build("youtube", "v3", credentials=creds)

    @root_validator(pre=True)
    def validate_channel_or_videoIds_is_set(
        cls, values: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Validate that at least one of channel_name or video_ids is set.

        Raises:
            ValueError: If neither value is provided.
        """
        if not values.get("channel_name") and not values.get("video_ids"):
            raise ValueError("Must specify either channel_name or video_ids")
        return values

    def _get_transcripe_for_video_id(self, video_id: str) -> str:
        """Return the transcript text of ``video_id`` as one string.

        A transcript in ``captions_language`` is preferred. If none exists,
        every listed transcript is translated to ``captions_language`` in
        listing order and the last translation is used.

        Raises:
            NoTranscriptFound: If there is neither a transcript in the
                requested language nor any transcript to translate.
        """
        from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi

        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        try:
            transcript = transcript_list.find_transcript([self.captions_language])
        except NoTranscriptFound:
            transcript = None
            for available_transcript in transcript_list:
                transcript = available_transcript.translate(self.captions_language)
            if transcript is None:
                # Nothing to translate: surface the original lookup failure
                # instead of failing later on an unbound variable.
                raise

        transcript_pieces = transcript.fetch()
        return " ".join(t["text"].strip(" ") for t in transcript_pieces)

    def _get_document_for_video_id(self, video_id: str, **kwargs: Any) -> Document:
        """Build a Document from the captions and API metadata of a video.

        The metadata is the first item of the ``videos().list`` response
        requested with ``part="id,snippet"``.
        """
        captions = self._get_transcripe_for_video_id(video_id)
        video_response = (
            self.youtube_client.videos()
            .list(
                part="id,snippet",
                id=video_id,
            )
            .execute()
        )
        return Document(
            page_content=captions,
            metadata=video_response.get("items")[0],
        )

    def _get_channel_id(self, channel_name: str) -> str:
        """Return the channel ID of the first search hit for ``channel_name``."""
        request = self.youtube_client.search().list(
            part="id",
            q=channel_name,
            type="channel",
            maxResults=1,  # we only need one result since channel names are unique
        )
        response = request.execute()
        channel_id = response["items"][0]["id"]["channelId"]
        return channel_id

    def _get_uploads_playlist_id(self, channel_id: str) -> str:
        """Return the ID of the channel's "uploads" playlist."""
        request = self.youtube_client.channels().list(
            part="contentDetails",
            id=channel_id,
        )
        response = request.execute()
        return response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]

    def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Document]:
        """Load one Document per video in the channel's uploads playlist.

        The playlist is paged through 50 items at a time. Each document's
        metadata holds ``videoId`` and, when ``add_video_info`` is set, the
        item's snippet without its ``thumbnails`` entry.

        Raises:
            ImportError: If ``youtube_transcript_api`` is not installed.
            TranscriptsDisabled, NoTranscriptFound, ParseError: When a
                transcript cannot be fetched and ``continue_on_failure`` is
                false; otherwise the failure is logged and the video skipped.
        """
        try:
            from youtube_transcript_api import (
                NoTranscriptFound,
                TranscriptsDisabled,
            )
        except ImportError as e:
            raise ImportError(
                "You must run "
                "`pip install --upgrade "
                "youtube-transcript-api` "
                "to use the youtube loader"
            ) from e

        channel_id = self._get_channel_id(channel)
        uploads_playlist_id = self._get_uploads_playlist_id(channel_id)
        request = self.youtube_client.playlistItems().list(
            part="id,snippet",
            playlistId=uploads_playlist_id,
            maxResults=50,
        )
        documents: List[Document] = []
        while request is not None:
            response = request.execute()

            # Add a document for each video on this page
            for item in response["items"]:
                video_id = item["snippet"]["resourceId"]["videoId"]
                meta_data = {"videoId": video_id}
                if self.add_video_info:
                    item["snippet"].pop("thumbnails", None)
                    meta_data.update(item["snippet"])
                try:
                    page_content = self._get_transcripe_for_video_id(video_id)
                except (TranscriptsDisabled, NoTranscriptFound, ParseError) as e:
                    if not self.continue_on_failure:
                        raise
                    # A playlist item's ``id`` is a plain string, so the
                    # video ID must come from the snippet's resourceId.
                    logger.error(
                        "Error fetching transscript  %s, exception: %s",
                        video_id,
                        e,
                    )
                else:
                    documents.append(
                        Document(
                            page_content=page_content,
                            metadata=meta_data,
                        )
                    )
            # Page with the same collection the request was created from.
            request = self.youtube_client.playlistItems().list_next(request, response)

        return documents

    def load(self) -> List[Document]:
        """Load documents.

        Uses ``channel_name`` when set, otherwise ``video_ids``.

        Raises:
            ValueError: If neither ``channel_name`` nor ``video_ids`` is set.
        """
        if self.channel_name:
            return list(self._get_document_for_channel(self.channel_name))
        if self.video_ids:
            return [
                self._get_document_for_video_id(video_id)
                for video_id in self.video_ids
            ]
        raise ValueError("Must specify either channel_name or video_ids")