class OpenAIWhisperParser(BaseBlobParser):
    """Transcribe and parse audio files.

    Audio transcription is performed with the OpenAI Whisper model.

    Args:
        api_key: OpenAI API key.
        chunk_duration_threshold: Minimum duration of a chunk in seconds.
            NOTE: According to the OpenAI API, the chunk duration should be at
            least 0.1 seconds. If a chunk's duration is less than or equal to
            the threshold, it will be skipped.
    """
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        import io

        try:
            import openai
        except ImportError:
            raise ImportError(
                "openai package not found, please install it with "
                "`pip install openai`"
            )
        try:
            from pydub import AudioSegment
        except ImportError:
            raise ImportError(
                "pydub package not found, please install it with "
                "`pip install pydub`"
            )

        if is_openai_v1():
            # api_key optional, defaults to `os.environ['OPENAI_API_KEY']`
            client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
        else:
            # Set the API key if provided
            if self.api_key:
                openai.api_key = self.api_key
            if self.base_url:
                openai.base_url = self.base_url

        # Audio file from disk
        audio = AudioSegment.from_file(blob.path)

        # Define the duration of each chunk in minutes
        # Need to meet 25MB size limit for Whisper API
        chunk_duration = 20
        chunk_duration_ms = chunk_duration * 60 * 1000

        # Split the audio into chunk_duration_ms chunks
        for split_number, i in enumerate(range(0, len(audio), chunk_duration_ms)):
            # Audio chunk
            chunk = audio[i : i + chunk_duration_ms]
            # Skip chunks that are too short to transcribe
            if chunk.duration_seconds <= self.chunk_duration_threshold:
                continue
            file_obj = io.BytesIO(chunk.export(format="mp3").read())
            if blob.source is not None:
                file_obj.name = blob.source + f"_part_{split_number}.mp3"
            else:
                file_obj.name = f"part_{split_number}.mp3"

            # Transcribe
            print(f"Transcribing part {split_number + 1}!")  # noqa: T201
            attempts = 0
            while attempts < 3:
                try:
                    if is_openai_v1():
                        transcript = client.audio.transcriptions.create(
                            model="whisper-1", file=file_obj, **self._create_params
                        )
                    else:
                        transcript = openai.Audio.transcribe("whisper-1", file_obj)
                    break
                except Exception as e:
                    attempts += 1
                    print(  # noqa: T201
                        f"Attempt {attempts} failed. Exception: {str(e)}"
                    )
                    time.sleep(5)
            else:
                print("Failed to transcribe after 3 attempts.")  # noqa: T201
                continue

            yield Document(
                page_content=transcript.text
                if not isinstance(transcript, str)
                else transcript,
                metadata={"source": blob.source, "chunk": split_number},
            )
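# Usage sketch (illustrative, not part of the class above): OpenAIWhisperParser is
# typically combined with GenericLoader and a blob loader. The directory and glob
# below are hypothetical placeholders; an OpenAI API key is assumed to be available
# via the OPENAI_API_KEY environment variable. Import paths may differ between
# langchain versions.
#
#     from langchain_community.document_loaders.generic import GenericLoader
#     from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader
#     from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser
#
#     loader = GenericLoader(
#         FileSystemBlobLoader("audio/", glob="*.mp3"),  # hypothetical directory
#         OpenAIWhisperParser(),
#     )
#     docs = loader.load()  # one Document per ~20-minute audio chunk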
class OpenAIWhisperParserLocal(BaseBlobParser):
    """Transcribe and parse audio files with the OpenAI Whisper model.

    Audio transcription is performed locally with the OpenAI Whisper model
    from `transformers`.

    Parameters:
        device - device to use.
            NOTE: By default the GPU is used if available; to use the CPU,
            set device = "cpu".
        lang_model - whisper model to use, for example "openai/whisper-medium".
        forced_decoder_ids - id states for the decoder in a multilanguage model,
            usage example:

            from transformers import WhisperProcessor
            processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
            forced_decoder_ids = WhisperProcessor.get_decoder_prompt_ids(
                language="french", task="transcribe"
            )
            forced_decoder_ids = WhisperProcessor.get_decoder_prompt_ids(
                language="french", task="translate"
            )
    """
[docs]def__init__(self,device:str="0",lang_model:Optional[str]=None,batch_size:int=8,chunk_length:int=30,forced_decoder_ids:Optional[Tuple[Dict]]=None,):"""Initialize the parser. Args: device: device to use. lang_model: whisper model to use, for example "openai/whisper-medium". Defaults to None. forced_decoder_ids: id states for decoder in a multilanguage model. Defaults to None. batch_size: batch size used for decoding Defaults to 8. chunk_length: chunk length used during inference. Defaults to 30s. """try:fromtransformersimportpipelineexceptImportError:raiseImportError("transformers package not found, please install it with ""`pip install transformers`")try:importtorchexceptImportError:raiseImportError("torch package not found, please install it with ""`pip install torch`")# Determine the device to useifdevice=="cpu":self.device="cpu"else:self.device="cuda:0"iftorch.cuda.is_available()else"cpu"ifself.device=="cpu":default_model="openai/whisper-base"self.lang_model=lang_modeliflang_modelelsedefault_modelelse:# Set the language model based on the device and available memorymem=torch.cuda.get_device_properties(self.device).total_memory/(1024**2)ifmem<5000:rec_model="openai/whisper-base"elifmem<7000:rec_model="openai/whisper-small"elifmem<12000:rec_model="openai/whisper-medium"else:rec_model="openai/whisper-large"self.lang_model=lang_modeliflang_modelelserec_modelprint("Using the following model: ",self.lang_model)# noqa: T201self.batch_size=batch_size# load model for inferenceself.pipe=pipeline("automatic-speech-recognition",model=self.lang_model,chunk_length_s=chunk_length,device=self.device,)ifforced_decoder_idsisnotNone:try:self.pipe.model.config.forced_decoder_ids=forced_decoder_idsexceptExceptionasexception_text:logger.info("Unable to set forced_decoder_ids parameter for whisper model"f"Text of exception: {exception_text}""Therefore whisper model will use default mode for decoder")
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        import io

        try:
            from pydub import AudioSegment
        except ImportError:
            raise ImportError(
                "pydub package not found, please install it with `pip install pydub`"
            )
        try:
            import librosa
        except ImportError:
            raise ImportError(
                "librosa package not found, please install it with "
                "`pip install librosa`"
            )

        # Audio file from disk
        audio = AudioSegment.from_file(blob.path)

        file_obj = io.BytesIO(audio.export(format="mp3").read())

        # Transcribe
        print(f"Transcribing part {blob.path}!")  # noqa: T201

        y, sr = librosa.load(file_obj, sr=16000)

        prediction = self.pipe(y.copy(), batch_size=self.batch_size)["text"]

        yield Document(
            page_content=prediction,
            metadata={"source": blob.source},
        )
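# Usage sketch (illustrative, not part of the class above): the local parser is
# wired up the same way as OpenAIWhisperParser, but runs the Hugging Face pipeline
# on the local machine, so no API key is needed. The directory and model name below
# are placeholder assumptions; import paths may differ between langchain versions.
#
#     from langchain_community.document_loaders.generic import GenericLoader
#     from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader
#     from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParserLocal
#
#     loader = GenericLoader(
#         FileSystemBlobLoader("audio/", glob="*.mp3"),  # hypothetical directory
#         OpenAIWhisperParserLocal(device="cpu", lang_model="openai/whisper-base"),
#     )
#     docs = loader.load()  # one Document per audio file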
class YandexSTTParser(BaseBlobParser):
    """Transcribe and parse audio files.

    Audio transcription is performed with the Yandex SpeechKit API.
    """
    def __init__(
        self,
        *,
        api_key: Optional[str] = None,
        iam_token: Optional[str] = None,
        model: str = "general",
        language: str = "auto",
    ):
        """Initialize the parser.

        Args:
            api_key: API key for a service account
                with the `ai.speechkit-stt.user` role.
            iam_token: IAM token for a service account
                with the `ai.speechkit-stt.user` role.
            model: Recognition model name.
                Defaults to general.
            language: The language in ISO 639-1 format.
                Defaults to automatic language recognition.

        Either `api_key` or `iam_token` must be provided, but not both.
        """
        if (api_key is None) == (iam_token is None):
            raise ValueError(
                "Either 'api_key' or 'iam_token' must be provided, but not both."
            )
        self.api_key = api_key
        self.iam_token = iam_token
        self.model = model
        self.language = language
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        try:
            from speechkit import configure_credentials, creds, model_repository
            from speechkit.stt import AudioProcessingType
        except ImportError:
            raise ImportError(
                "yandex-speechkit package not found, please install it with "
                "`pip install yandex-speechkit`"
            )
        try:
            from pydub import AudioSegment
        except ImportError:
            raise ImportError(
                "pydub package not found, please install it with "
                "`pip install pydub`"
            )

        if self.api_key:
            configure_credentials(
                yandex_credentials=creds.YandexCredentials(api_key=self.api_key)
            )
        else:
            configure_credentials(
                yandex_credentials=creds.YandexCredentials(iam_token=self.iam_token)
            )

        audio = AudioSegment.from_file(blob.path)

        model = model_repository.recognition_model()

        model.model = self.model
        model.language = self.language
        model.audio_processing_type = AudioProcessingType.Full

        result = model.transcribe(audio)

        for res in result:
            yield Document(
                page_content=res.normalized_text,
                metadata={"source": blob.source},
            )
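# Usage sketch (illustrative, not part of the class above): credentials are passed
# directly to the parser; either api_key or iam_token, but not both. The key value,
# directory, and glob below are placeholders, and import paths may differ between
# langchain versions.
#
#     from langchain_community.document_loaders.generic import GenericLoader
#     from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader
#     from langchain_community.document_loaders.parsers.audio import YandexSTTParser
#
#     loader = GenericLoader(
#         FileSystemBlobLoader("calls/", glob="*.ogg"),  # hypothetical directory
#         YandexSTTParser(api_key="<your-api-key>", language="ru"),
#     )
#     docs = loader.load()  # one Document per recognized utterance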
class FasterWhisperParser(BaseBlobParser):
    """Transcribe and parse audio files with faster-whisper.

    faster-whisper is a reimplementation of OpenAI's Whisper model using CTranslate2,
    which is up to 4 times faster than openai/whisper for the same accuracy while
    using less memory. The efficiency can be further improved with 8-bit quantization
    on both CPU and GPU.

    It can automatically detect the following 14 languages and transcribe the text
    into their respective languages: en, zh, fr, de, ja, ko, ru, es, th, it, pt, vi,
    ar, tr.

    The GitHub repository for faster-whisper is:
    https://github.com/SYSTRAN/faster-whisper

    Example: Load a YouTube video and transcribe the video speech into a document.

    .. code-block:: python

        from langchain.document_loaders.generic import GenericLoader
        from langchain_community.document_loaders.parsers.audio import FasterWhisperParser
        from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

        url = "https://www.youtube.com/watch?v=your_video"
        save_dir = "your_dir/"
        loader = GenericLoader(
            YoutubeAudioLoader([url], save_dir),
            FasterWhisperParser()
        )
        docs = loader.load()
    """
[docs]def__init__(self,*,device:Optional[str]="cuda",model_size:Optional[str]=None,):"""Initialize the parser. Args: device: It can be "cuda" or "cpu" based on the available device. model_size: There are four model sizes to choose from: "base", "small", "medium", and "large-v3", based on the available GPU memory. """try:importtorchexceptImportError:raiseImportError("torch package not found, please install it with `pip install torch`")# Determine the device to useifdevice=="cpu":self.device="cpu"else:self.device="cuda"iftorch.cuda.is_available()else"cpu"# Determine the model_sizeifself.device=="cpu":self.model_size="base"else:# Set the model_size based on the available memorymem=torch.cuda.get_device_properties(self.device).total_memory/(1024**2)ifmem<1000:self.model_size="base"elifmem<3000:self.model_size="small"elifmem<5000:self.model_size="medium"else:self.model_size="large-v3"# If the user has assigned a model size, then use the assigned sizeifmodel_sizeisnotNone:ifmodel_sizein["base","small","medium","large-v3"]:self.model_size=model_size
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        import io

        try:
            from pydub import AudioSegment
        except ImportError:
            raise ImportError(
                "pydub package not found, please install it with `pip install pydub`"
            )
        try:
            from faster_whisper import WhisperModel
        except ImportError:
            raise ImportError(
                "faster_whisper package not found, please install it with "
                "`pip install faster-whisper`"
            )

        # get the audio
        if isinstance(blob.data, bytes):
            # blob contains the audio
            audio = AudioSegment.from_file(io.BytesIO(blob.data))
        elif blob.data is None and blob.path:
            # Audio file from disk
            audio = AudioSegment.from_file(blob.path)
        else:
            raise ValueError("Unable to get audio from blob")

        file_obj = io.BytesIO(audio.export(format="mp3").read())

        # Transcribe
        model = WhisperModel(
            self.model_size, device=self.device, compute_type="float16"
        )

        segments, info = model.transcribe(file_obj, beam_size=5)

        for segment in segments:
            yield Document(
                page_content=segment.text,
                metadata={
                    "source": blob.source,
                    "timestamps": "[%.2fs -> %.2fs]" % (segment.start, segment.end),
                    "language": info.language,
                    "probability": "%d%%" % round(info.language_probability * 100),
                    **blob.metadata,
                },
            )
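# Usage sketch (illustrative, not part of the class above): parsing local audio
# files instead of the YouTube example in the class docstring. The directory and
# glob are placeholders. Each yielded Document carries one transcribed segment,
# with its timestamps and detected language in the metadata.
#
#     from langchain_community.document_loaders.generic import GenericLoader
#     from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader
#     from langchain_community.document_loaders.parsers.audio import FasterWhisperParser
#
#     loader = GenericLoader(
#         FileSystemBlobLoader("recordings/", glob="*.wav"),  # hypothetical directory
#         FasterWhisperParser(),
#     )
#     for doc in loader.lazy_load():
#         print(doc.metadata["timestamps"], doc.page_content)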