Source code for langchain_google_community.google_speech_to_text

from __future__ import annotations

from typing import TYPE_CHECKING, List, Optional

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document

from langchain_google_community._utils import get_client_info

if TYPE_CHECKING:
    from google.cloud.speech_v2 import RecognitionConfig  # type: ignore[import]
    from google.protobuf.field_mask_pb2 import FieldMask


class SpeechToTextLoader(BaseLoader):
    """Loader for Google Cloud Speech-to-Text audio transcripts.

    It uses the Google Cloud Speech-to-Text API to transcribe audio files
    and loads the transcribed text into one or more Documents,
    depending on the specified format.

    To use, you should have the ``google-cloud-speech`` python package
    installed.

    Audio files can be specified via a Google Cloud Storage uri or a local
    file path.

    For a detailed explanation of Google Cloud Speech-to-Text, refer to the
    product documentation.
    https://cloud.google.com/speech-to-text
    """

    def __init__(
        self,
        project_id: str,
        file_path: str,
        location: str = "us-central1",
        recognizer_id: str = "_",
        config: Optional[RecognitionConfig] = None,
        config_mask: Optional[FieldMask] = None,
        is_long: bool = False,
        batch_timeout: float = 120,
    ):
        """Initializes the SpeechToTextLoader.

        Args:
            project_id: Google Cloud Project ID.
            file_path: A Google Cloud Storage URI or a local file path.
            location: Speech-to-Text recognizer location.
            recognizer_id: Speech-to-Text recognizer id.
            config: Recognition options and features.
                For more information:
                https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v2.types.RecognitionConfig
            config_mask: The list of fields in config that override the values
                in the ``default_recognition_config`` of the recognizer during
                this recognition request.
                For more information:
                https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v2.types.RecognizeRequest
            is_long: use async Cloud Speech recognition, mainly for long
                documents. In this mode ``file_path`` is handed to the API as
                the file URI; the file is not read locally.
                For more information:
                https://cloud.google.com/speech-to-text/v2/docs/batch-recognize
            batch_timeout: Seconds to wait for the batch recognition operation
                to finish when ``is_long`` is set. Defaults to 120, the value
                that was previously hard-coded.

        Raises:
            ImportError: If the ``google-cloud-speech`` package is missing.
        """
        try:
            from google.api_core.client_options import ClientOptions
            from google.cloud.speech_v2 import (
                AutoDetectDecodingConfig,
                RecognitionConfig,
                RecognitionFeatures,
                SpeechClient,
            )
        except ImportError as exc:
            raise ImportError(
                "Could not import google-cloud-speech python package. "
                "Please, install speech dependency group: "
                "`pip install langchain-google-community[speech]`"
            ) from exc

        self.project_id = project_id
        self.file_path = file_path
        self.location = location
        self.recognizer_id = recognizer_id
        # Config must be set in speech recognition request.
        self.config = config or RecognitionConfig(
            auto_decoding_config=AutoDetectDecodingConfig(),
            language_codes=["en-US"],
            model="chirp",
            features=RecognitionFeatures(
                # Automatic punctuation could be useful for language applications
                enable_automatic_punctuation=True,
            ),
        )
        self.config_mask = config_mask

        # Any location other than "global" gets a location-specific endpoint;
        # "global" keeps the client's default endpoint.
        self._client = SpeechClient(
            client_info=get_client_info(module="speech-to-text"),
            client_options=(
                ClientOptions(api_endpoint=f"{location}-speech.googleapis.com")
                if location != "global"
                else None
            ),
        )
        self._recognizer_path = self._client.recognizer_path(
            project_id, location, recognizer_id
        )
        self._is_long = is_long
        self._batch_timeout = batch_timeout

    def load(self) -> List[Document]:
        """Transcribes the audio file and loads the transcript into documents.

        It uses the Google Cloud Speech-to-Text API to transcribe the audio
        file and blocks until the transcription is finished.

        Returns:
            When ``is_long`` was set, a single Document holding the whole
            batch transcript. Otherwise one Document per recognition result
            that carries at least one alternative, with ``language_code`` and
            ``result_end_offset`` in its metadata.

        Raises:
            ImportError: If the ``google-cloud-speech`` package is missing.
        """
        if self._is_long:
            return [Document(page_content=self._load_long())]

        try:
            from google.cloud.speech_v2 import RecognizeRequest
        except ImportError as exc:
            raise ImportError(
                "Could not import google-cloud-speech python package. "
                "Please, install speech dependency group: "
                "`pip install langchain-google-community[speech]`"
            ) from exc

        request = RecognizeRequest(
            recognizer=self._recognizer_path,
            config=self.config,
            config_mask=self.config_mask,
        )

        # A Cloud Storage URI is identified by its scheme prefix; a substring
        # test would also match unrelated paths that merely contain "gs://".
        if self.file_path.startswith("gs://"):
            request.uri = self.file_path
        else:
            with open(self.file_path, "rb") as f:
                request.content = f.read()

        response = self._client.recognize(request=request)

        # Skip results without alternatives (same guard as _load_long) so that
        # indexing alternatives[0] cannot raise IndexError.
        return [
            Document(
                page_content=result.alternatives[0].transcript,
                metadata={
                    "language_code": result.language_code,
                    "result_end_offset": result.result_end_offset,
                },
            )
            for result in response.results
            if result.alternatives
        ]

    def _load_long(self) -> str:
        """Run batch recognition and return the concatenated transcript.

        ``file_path`` is sent as the ``uri`` of the single file in the batch
        request and the inline response for that same key is read back. The
        call blocks for up to ``batch_timeout`` seconds waiting for the
        operation to complete.

        Returns:
            The top-alternative transcripts of all results that have
            alternatives, joined without a separator.
        """
        from google.cloud.speech_v2 import (
            BatchRecognizeFileMetadata,
            BatchRecognizeRequest,
            InlineOutputConfig,
            RecognitionOutputConfig,
        )

        request = BatchRecognizeRequest(
            recognizer=self._recognizer_path,
            config=self.config,
            config_mask=self.config_mask,
            files=[BatchRecognizeFileMetadata(uri=self.file_path)],
            recognition_output_config=RecognitionOutputConfig(
                inline_response_config=InlineOutputConfig(),
            ),
        )
        operation = self._client.batch_recognize(request=request)
        response = operation.result(timeout=self._batch_timeout)

        return "".join(
            r.alternatives[0].transcript
            for r in response.results[self.file_path].transcript.results
            if r.alternatives
        )