class AzureOpenAIWhisperParser(BaseBlobParser):
    """Transcribe and parse audio files using Azure OpenAI Whisper.

    This parser integrates with the Azure OpenAI Whisper model to transcribe
    audio files. It differs from the standard OpenAI Whisper parser in that it
    requires an Azure endpoint and credentials. The parser is limited to files
    under 25 MB.

    **Note**:
        This parser uses the Azure OpenAI API, providing integration with the
        Azure ecosystem and making it suitable for workflows involving other
        Azure services.

        For files larger than 25 MB, consider using Azure AI Speech batch
        transcription:
        https://learn.microsoft.com/azure/ai-services/speech-service/batch-transcription-create?pivots=rest-api#use-a-whisper-model

    Setup:
        1. Follow the instructions here to deploy Azure Whisper:
           https://learn.microsoft.com/azure/ai-services/openai/whisper-quickstart?tabs=command-line%2Cpython-new&pivots=programming-language-python

        2. Install ``langchain`` and set the following environment variables:

        .. code-block:: bash

            pip install -U langchain langchain-community

            export AZURE_OPENAI_API_KEY="your-api-key"
            export AZURE_OPENAI_ENDPOINT="https://your-endpoint.openai.azure.com/"
            export OPENAI_API_VERSION="your-api-version"

    Example Usage:
        .. code-block:: python

            from langchain_community.document_loaders.parsers.audio import (
                AzureOpenAIWhisperParser,
            )

            whisper_parser = AzureOpenAIWhisperParser(
                deployment_name="your-whisper-deployment",
                api_version="2024-06-01",
                api_key="your-api-key",
                # other params...
            )

            audio_blob = Blob(path="your-audio-file-path")
            response = whisper_parser.lazy_parse(audio_blob)

            for document in response:
                print(document.page_content)

    Integration with Other Loaders:
        The AzureOpenAIWhisperParser can be used with video/audio loaders and
        `GenericLoader` to automate retrieval and parsing.

    YoutubeAudioLoader Example:
        .. code-block:: python

            from langchain_community.document_loaders.blob_loaders import (
                YoutubeAudioLoader,
            )
            from langchain_community.document_loaders.generic import GenericLoader

            # Must be a list
            youtube_url = ["https://your-youtube-url"]
            save_dir = "directory-to-download-videos"

            loader = GenericLoader(
                YoutubeAudioLoader(youtube_url, save_dir),
                AzureOpenAIWhisperParser(deployment_name="your-deployment-name"),
            )

            docs = loader.load()
    """
    def __init__(
        self,
        *,
        api_key: Optional[str] = None,
        azure_endpoint: Optional[str] = None,
        api_version: Optional[str] = None,
        azure_ad_token_provider: Union[Callable[[], str], None] = None,
        language: Optional[str] = None,
        prompt: Optional[str] = None,
        response_format: Union[
            Literal["json", "text", "srt", "verbose_json", "vtt"], None
        ] = None,
        temperature: Optional[float] = None,
        deployment_name: str,
        max_retries: int = 3,
    ):
        """
        Initialize the AzureOpenAIWhisperParser.

        Args:
            api_key (Optional[str]):
                Azure OpenAI API key. If not provided, defaults to the
                `AZURE_OPENAI_API_KEY` environment variable.
            azure_endpoint (Optional[str]):
                Azure OpenAI service endpoint. Defaults to the
                `AZURE_OPENAI_ENDPOINT` environment variable if not set.
            api_version (Optional[str]):
                API version to use. Defaults to the `OPENAI_API_VERSION`
                environment variable.
            azure_ad_token_provider (Union[Callable[[], str], None]):
                Azure Active Directory token provider for authentication
                (if applicable).
            language (Optional[str]):
                Language in which the request should be processed.
            prompt (Optional[str]):
                Custom instructions or prompt for the Whisper model.
            response_format (Union[str, None]):
                The desired output format. Options: "json", "text", "srt",
                "verbose_json", "vtt".
            temperature (Optional[float]):
                Controls the randomness of the model's output.
            deployment_name (str):
                The deployment name of the Whisper model.
            max_retries (int):
                Maximum number of retries for failed API requests.

        Raises:
            ImportError: If the required package `openai` is not installed.
        """
        self.api_key = api_key or os.environ.get("AZURE_OPENAI_API_KEY")
        self.azure_endpoint = azure_endpoint or os.environ.get("AZURE_OPENAI_ENDPOINT")
        self.api_version = api_version or os.environ.get("OPENAI_API_VERSION")
        self.azure_ad_token_provider = azure_ad_token_provider
        self.language = language
        self.prompt = prompt
        self.response_format = response_format
        self.temperature = temperature
        self.deployment_name = deployment_name
        self.max_retries = max_retries

        try:
            import openai
        except ImportError:
            raise ImportError(
                "openai package not found, please install it with "
                "`pip install openai`"
            )

        if is_openai_v1():
            self._client = openai.AzureOpenAI(
                api_key=self.api_key,
                azure_endpoint=self.azure_endpoint,
                api_version=self.api_version,
                max_retries=self.max_retries,
                azure_ad_token_provider=self.azure_ad_token_provider,
            )
        else:
            if self.api_key:
                openai.api_key = self.api_key
            if self.azure_endpoint:
                openai.api_base = self.azure_endpoint
            if self.api_version:
                openai.api_version = self.api_version
            openai.api_type = "azure"
            self._client = openai
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """
        Lazily parse the provided audio blob for transcription.

        Args:
            blob (Blob): The audio file in Blob format to be transcribed.

        Yields:
            Document: Parsed transcription from the audio file.

        Raises:
            Exception: If an error occurs during transcription.
        """
        file_obj = open(str(blob.path), "rb")

        # Transcribe
        try:
            if is_openai_v1():
                transcript = self._client.audio.transcriptions.create(
                    model=self.deployment_name,
                    file=file_obj,
                    **self._create_params,
                )
            else:
                transcript = self._client.Audio.transcribe(
                    model=self.deployment_name,
                    deployment_id=self.deployment_name,
                    file=file_obj,
                    **self._create_params,
                )
        except Exception:
            raise

        yield Document(
            page_content=transcript.text
            if not isinstance(transcript, str)
            else transcript,
            metadata={"source": blob.source},
        )

class OpenAIWhisperParser(BaseBlobParser):
    """Transcribe and parse audio files.

    Audio transcription is done with the OpenAI Whisper model.

    Args:
        api_key: OpenAI API key.
        chunk_duration_threshold: Minimum duration of a chunk in seconds.
            NOTE: According to the OpenAI API, the chunk duration should be at
            least 0.1 seconds. If the chunk duration is less than or equal to
            the threshold, it is skipped.
    """
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        try:
            import openai
        except ImportError:
            raise ImportError(
                "openai package not found, please install it with "
                "`pip install openai`"
            )
        try:
            from pydub import AudioSegment
        except ImportError:
            raise ImportError(
                "pydub package not found, please install it with "
                "`pip install pydub`"
            )

        if is_openai_v1():
            # api_key optional, defaults to `os.environ['OPENAI_API_KEY']`
            client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
        else:
            # Set the API key if provided
            if self.api_key:
                openai.api_key = self.api_key
            if self.base_url:
                openai.api_base = self.base_url

        # Audio file from disk
        audio = AudioSegment.from_file(blob.path)

        # Define the duration of each chunk in minutes
        # Need to meet 25MB size limit for Whisper API
        chunk_duration = 20
        chunk_duration_ms = chunk_duration * 60 * 1000

        # Split the audio into chunk_duration_ms chunks
        for split_number, i in enumerate(range(0, len(audio), chunk_duration_ms)):
            # Audio chunk
            chunk = audio[i : i + chunk_duration_ms]
            # Skip chunks that are too short to transcribe
            if chunk.duration_seconds <= self.chunk_duration_threshold:
                continue
            file_obj = io.BytesIO(chunk.export(format="mp3").read())
            if blob.source is not None:
                file_obj.name = blob.source + f"_part_{split_number}.mp3"
            else:
                file_obj.name = f"part_{split_number}.mp3"

            # Transcribe
            print(f"Transcribing part {split_number + 1}!")  # noqa: T201
            attempts = 0
            while attempts < 3:
                try:
                    if is_openai_v1():
                        transcript = client.audio.transcriptions.create(
                            model=self.model, file=file_obj, **self._create_params
                        )
                    else:
                        transcript = openai.Audio.transcribe(self.model, file_obj)  # type: ignore[attr-defined]
                    break
                except Exception as e:
                    attempts += 1
                    print(f"Attempt {attempts} failed. Exception: {str(e)}")  # noqa: T201
                    time.sleep(5)
            else:
                print("Failed to transcribe after 3 attempts.")  # noqa: T201
                continue

            yield Document(
                page_content=transcript.text
                if not isinstance(transcript, str)
                else transcript,
                metadata={"source": blob.source, "chunk": split_number},
            )
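
# A minimal usage sketch for OpenAIWhisperParser (not part of the library source;
# the audio path and API key are placeholders, and the key may instead be supplied
# via the `OPENAI_API_KEY` environment variable):
#
#     from langchain_community.document_loaders.blob_loaders import Blob
#
#     parser = OpenAIWhisperParser(api_key="your-api-key")
#     audio_blob = Blob(path="your-audio-file.mp3")
#     for doc in parser.lazy_parse(audio_blob):
#         # Each yielded Document corresponds to one ~20-minute chunk
#         print(doc.metadata["chunk"], doc.page_content)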

class OpenAIWhisperParserLocal(BaseBlobParser):
    """Transcribe and parse audio files with the OpenAI Whisper model.

    Audio transcription is run locally with an OpenAI Whisper model from
    ``transformers``.

    Parameters:
        device - device to use.
            NOTE: By default uses the GPU if available; if you want to use the
            CPU, please set device = "cpu".
        lang_model - whisper model to use, for example "openai/whisper-medium".
        forced_decoder_ids - id states for the decoder in a multilanguage model,
            usage example:

            .. code-block:: python

                from transformers import WhisperProcessor

                processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
                forced_decoder_ids = processor.get_decoder_prompt_ids(
                    language="french", task="transcribe"
                )
                forced_decoder_ids = processor.get_decoder_prompt_ids(
                    language="french", task="translate"
                )
    """
    def __init__(
        self,
        device: str = "0",
        lang_model: Optional[str] = None,
        batch_size: int = 8,
        chunk_length: int = 30,
        forced_decoder_ids: Optional[Tuple[Dict]] = None,
    ):
        """Initialize the parser.

        Args:
            device: device to use.
            lang_model: whisper model to use, for example "openai/whisper-medium".
                Defaults to None.
            forced_decoder_ids: id states for the decoder in a multilanguage model.
                Defaults to None.
            batch_size: batch size used for decoding.
                Defaults to 8.
            chunk_length: chunk length used during inference, in seconds.
                Defaults to 30.
        """
        try:
            from transformers import pipeline
        except ImportError:
            raise ImportError(
                "transformers package not found, please install it with "
                "`pip install transformers`"
            )
        try:
            import torch
        except ImportError:
            raise ImportError(
                "torch package not found, please install it with `pip install torch`"
            )

        # Determine the device to use
        if device == "cpu":
            self.device = "cpu"
        else:
            self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

        if self.device == "cpu":
            default_model = "openai/whisper-base"
            self.lang_model = lang_model if lang_model else default_model
        else:
            # Set the language model based on the device and available memory
            mem = torch.cuda.get_device_properties(self.device).total_memory / (
                1024**2
            )
            if mem < 5000:
                rec_model = "openai/whisper-base"
            elif mem < 7000:
                rec_model = "openai/whisper-small"
            elif mem < 12000:
                rec_model = "openai/whisper-medium"
            else:
                rec_model = "openai/whisper-large"
            self.lang_model = lang_model if lang_model else rec_model

        print("Using the following model: ", self.lang_model)  # noqa: T201

        self.batch_size = batch_size

        # Load model for inference
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=self.lang_model,
            chunk_length_s=chunk_length,
            device=self.device,
        )
        if forced_decoder_ids is not None:
            try:
                self.pipe.model.config.forced_decoder_ids = forced_decoder_ids
            except Exception as exception_text:
                logger.info(
                    "Unable to set forced_decoder_ids parameter for whisper model. "
                    f"Text of exception: {exception_text}. "
                    "Therefore whisper model will use default mode for decoder."
                )
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        try:
            from pydub import AudioSegment
        except ImportError:
            raise ImportError(
                "pydub package not found, please install it with "
                "`pip install pydub`"
            )
        try:
            import librosa
        except ImportError:
            raise ImportError(
                "librosa package not found, please install it with "
                "`pip install librosa`"
            )

        # Audio file from disk
        audio = AudioSegment.from_file(blob.path)

        file_obj = io.BytesIO(audio.export(format="mp3").read())

        # Transcribe
        print(f"Transcribing part {blob.path}!")  # noqa: T201

        y, sr = librosa.load(file_obj, sr=16000)

        prediction = self.pipe(y.copy(), batch_size=self.batch_size)["text"]

        yield Document(
            page_content=prediction,
            metadata={"source": blob.source},
        )
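
# A minimal usage sketch for OpenAIWhisperParserLocal (not part of the library
# source; the audio path is a placeholder). Transcription runs locally via the
# `transformers` pipeline, so no API key is required:
#
#     from langchain_community.document_loaders.blob_loaders import Blob
#
#     parser = OpenAIWhisperParserLocal(lang_model="openai/whisper-base")
#     for doc in parser.lazy_parse(Blob(path="your-audio-file.mp3")):
#         print(doc.page_content)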

class YandexSTTParser(BaseBlobParser):
    """Transcribe and parse audio files.

    Audio transcription is done with Yandex SpeechKit recognition models.
    """
    def __init__(
        self,
        *,
        api_key: Optional[str] = None,
        iam_token: Optional[str] = None,
        model: str = "general",
        language: str = "auto",
    ):
        """Initialize the parser.

        Args:
            api_key: API key for a service account
                with the `ai.speechkit-stt.user` role.
            iam_token: IAM token for a service account
                with the `ai.speechkit-stt.user` role.
            model: Recognition model name.
                Defaults to "general".
            language: The language in ISO 639-1 format.
                Defaults to automatic language recognition.

        Either `api_key` or `iam_token` must be provided, but not both.
        """
        if (api_key is None) == (iam_token is None):
            raise ValueError(
                "Either 'api_key' or 'iam_token' must be provided, but not both."
            )
        self.api_key = api_key
        self.iam_token = iam_token
        self.model = model
        self.language = language
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        try:
            from speechkit import configure_credentials, creds, model_repository
            from speechkit.stt import AudioProcessingType
        except ImportError:
            raise ImportError(
                "yandex-speechkit package not found, please install it with "
                "`pip install yandex-speechkit`"
            )
        try:
            from pydub import AudioSegment
        except ImportError:
            raise ImportError(
                "pydub package not found, please install it with "
                "`pip install pydub`"
            )

        if self.api_key:
            configure_credentials(
                yandex_credentials=creds.YandexCredentials(api_key=self.api_key)
            )
        else:
            configure_credentials(
                yandex_credentials=creds.YandexCredentials(iam_token=self.iam_token)
            )

        audio = AudioSegment.from_file(blob.path)

        model = model_repository.recognition_model()

        model.model = self.model
        model.language = self.language
        model.audio_processing_type = AudioProcessingType.Full

        result = model.transcribe(audio)

        for res in result:
            yield Document(
                page_content=res.normalized_text,
                metadata={"source": blob.source},
            )
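
# A minimal usage sketch for YandexSTTParser (not part of the library source; the
# SpeechKit API key and the audio path are placeholders). Exactly one of `api_key`
# or `iam_token` may be passed; language detection is automatic by default:
#
#     from langchain_community.document_loaders.blob_loaders import Blob
#
#     parser = YandexSTTParser(api_key="your-speechkit-api-key")
#     for doc in parser.lazy_parse(Blob(path="your-audio-file.mp3")):
#         print(doc.page_content)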

class FasterWhisperParser(BaseBlobParser):
    """Transcribe and parse audio files with faster-whisper.

    faster-whisper is a reimplementation of OpenAI's Whisper model using
    CTranslate2, which is up to 4 times faster than openai/whisper for the
    same accuracy while using less memory. The efficiency can be further
    improved with 8-bit quantization on both CPU and GPU.

    It can automatically detect the following 14 languages and transcribe the
    text into their respective languages: en, zh, fr, de, ja, ko, ru, es, th,
    it, pt, vi, ar, tr.

    The GitHub repository for faster-whisper is:
    https://github.com/SYSTRAN/faster-whisper

    Example: Load a YouTube video and transcribe the video speech into a document.

        .. code-block:: python

            from langchain.document_loaders.generic import GenericLoader
            from langchain_community.document_loaders.parsers.audio import (
                FasterWhisperParser,
            )
            from langchain.document_loaders.blob_loaders.youtube_audio import (
                YoutubeAudioLoader,
            )

            url = "https://www.youtube.com/watch?v=your_video"
            save_dir = "your_dir/"
            loader = GenericLoader(
                YoutubeAudioLoader([url], save_dir),
                FasterWhisperParser(),
            )
            docs = loader.load()
    """
    def __init__(
        self,
        *,
        device: Optional[str] = "cuda",
        model_size: Optional[str] = None,
    ):
        """Initialize the parser.

        Args:
            device: It can be "cuda" or "cpu" based on the available device.
            model_size: There are four model sizes to choose from:
                "base", "small", "medium", and "large-v3",
                based on the available GPU memory.
        """
        try:
            import torch
        except ImportError:
            raise ImportError(
                "torch package not found, please install it with `pip install torch`"
            )

        # Determine the device to use
        if device == "cpu":
            self.device = "cpu"
        else:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Determine the model_size
        if self.device == "cpu":
            self.model_size = "base"
        else:
            # Set the model_size based on the available memory
            mem = torch.cuda.get_device_properties(self.device).total_memory / (
                1024**2
            )
            if mem < 1000:
                self.model_size = "base"
            elif mem < 3000:
                self.model_size = "small"
            elif mem < 5000:
                self.model_size = "medium"
            else:
                self.model_size = "large-v3"

        # If the user has assigned a model size, then use the assigned size
        if model_size is not None:
            if model_size in ["base", "small", "medium", "large-v3"]:
                self.model_size = model_size
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        try:
            from pydub import AudioSegment
        except ImportError:
            raise ImportError(
                "pydub package not found, please install it with "
                "`pip install pydub`"
            )
        try:
            from faster_whisper import WhisperModel
        except ImportError:
            raise ImportError(
                "faster_whisper package not found, please install it with "
                "`pip install faster-whisper`"
            )

        # Get the audio
        if isinstance(blob.data, bytes):
            # Blob contains the audio
            audio = AudioSegment.from_file(io.BytesIO(blob.data))
        elif blob.data is None and blob.path:
            # Audio file from disk
            audio = AudioSegment.from_file(blob.path)
        else:
            raise ValueError("Unable to get audio from blob")

        file_obj = io.BytesIO(audio.export(format="mp3").read())

        # Transcribe
        model = WhisperModel(
            self.model_size, device=self.device, compute_type="float16"
        )

        segments, info = model.transcribe(file_obj, beam_size=5)

        for segment in segments:
            yield Document(
                page_content=segment.text,
                metadata={
                    "source": blob.source,
                    "timestamps": "[%.2fs -> %.2fs]" % (segment.start, segment.end),
                    "language": info.language,
                    "probability": "%d%%" % round(info.language_probability * 100),
                    **blob.metadata,
                },
            )