Source code for langchain_community.document_loaders.blob_loaders.youtube_audio

from typing import Iterable, List

from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader
from langchain_community.document_loaders.blob_loaders.schema import Blob, BlobLoader


[docs] class YoutubeAudioLoader(BlobLoader): """Load YouTube urls as audio file(s)."""
[docs] def __init__(self, urls: List[str], save_dir: str): if not isinstance(urls, list): raise TypeError("urls must be a list") self.urls = urls self.save_dir = save_dir
[docs] def yield_blobs(self) -> Iterable[Blob]: """Yield audio blobs for each url.""" try: import yt_dlp except ImportError: raise ImportError( "yt_dlp package not found, please install it with " "`pip install yt_dlp`" ) # Use yt_dlp to download audio given a YouTube url ydl_opts = { "format": "m4a/bestaudio/best", "noplaylist": True, "outtmpl": self.save_dir + "/%(title)s.%(ext)s", "postprocessors": [ { "key": "FFmpegExtractAudio", "preferredcodec": "m4a", } ], } for url in self.urls: # Download file with yt_dlp.YoutubeDL(ydl_opts) as ydl: ydl.download(url) # Yield the written blobs loader = FileSystemBlobLoader(self.save_dir, glob="*.m4a") for blob in loader.yield_blobs(): yield blob