def process(
    self,
    video_file_path: str,
    run_manager: Optional[CallbackManagerForChainRun] = None,
) -> list:
    """Extract the audio of a video, transcribe it, and clean up the audio file.

    The audio is first written to ``self.output_audio_path`` by
    ``self._extract_audio`` and then handed to ``self._transcribe_audio``.
    The intermediate audio file is removed afterwards whether or not either
    step succeeded.

    Args:
        video_file_path: Path of the video file to process.
        run_manager: Accepted for interface compatibility; not used by this
            method's body.

    Returns:
        Whatever ``self._transcribe_audio()`` returns (a list of transcript
        models).
    """
    try:
        self._extract_audio(video_file_path)
        transcript = self._transcribe_audio()
    finally:
        # Always remove the temporary MP3, even when extraction or
        # transcription raised.
        try:
            self.output_audio_path.unlink()
        except FileNotFoundError:
            # The file was never created (or is already gone): nothing to do.
            pass
    return transcript
def _extract_audio(self, video_file_path: str) -> None:
    """Extract the audio track of a video into ``self.output_audio_path`` as MP3.

    Creates the parent directory of the output path if needed, then runs
    ``ffmpeg`` as a subprocess with its stdout and stderr captured.

    Args:
        video_file_path: Path of the input video, passed to ``ffmpeg -i``.

    Raises:
        subprocess.CalledProcessError: If ``ffmpeg`` exits with a non-zero
            status (``check=True``).
    """
    # Ensure the directory exists where the output file will be saved
    self.output_audio_path.parent.mkdir(parents=True, exist_ok=True)

    command = [
        "ffmpeg",
        "-i",
        video_file_path,
        "-vn",
        "-acodec",
        "mp3",
        self.output_audio_path.as_posix(),
        "-y",  # The '-y' flag overwrites the output file if it exists
    ]

    subprocess.run(
        command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True
    )


def _transcribe_audio(self) -> List[BaseModel]:
    """Transcribe the extracted audio file and return the parsed models.

    Loads ``self.output_audio_path`` through
    ``AssemblyAIAudioTranscriptLoader`` requesting the SRT subtitle format,
    then converts the loaded documents with ``_create_transcript_models``.

    Returns:
        The transcript models built from the loaded documents.

    Raises:
        ValueError: If ``self.api_key`` is empty or unset.
    """
    if not self.api_key:
        raise ValueError("API key for AssemblyAI is not configured")

    audio_file_path_str = str(self.output_audio_path)
    loader = AssemblyAIAudioTranscriptLoader(
        file_path=audio_file_path_str,
        api_key=self.api_key,
        transcript_format=TranscriptFormat.SUBTITLES_SRT,
    )
    docs = loader.load()
    return self._create_transcript_models(docs)


@staticmethod
def _create_transcript_models(docs: List[Document]) -> List[BaseModel]:
    """Parse the SRT text of every document into one flat list of models.

    Args:
        docs: Documents whose ``page_content`` holds SRT transcript data.

    Returns:
        The models parsed from all documents, in document order.
    """
    # Assuming docs is a list of Documents with .page_content as the transcript data
    models = []
    for doc in docs:
        models.extend(AudioProcessor._parse_transcript(doc.page_content))
    return models


@staticmethod
def _parse_transcript(srt_content: str) -> List[BaseModel]:
    """Convert SRT subtitle text into a list of ``AudioModel`` instances.

    Each entry is expected to consist of an index line, a
    ``"<start> --> <end>"`` timespan line and one or more subtitle lines,
    with entries separated by a blank line. Entries that do not match this
    shape are skipped instead of raising.

    Args:
        srt_content: Raw SRT text. May be empty.

    Returns:
        One model per well-formed entry, in order of appearance; an empty
        list when there is nothing to parse.
    """
    models = []

    # Normalise Windows line endings so the blank-line split below works.
    normalized = srt_content.replace("\r\n", "\n").strip()
    if not normalized:
        return models

    for entry in normalized.split("\n\n"):  # Split based on double newline
        # Extra blank lines between entries leave stray newlines on the
        # neighbouring entry; drop them before splitting into lines.
        lines = entry.strip("\n").split("\n")

        # Validate BEFORE unpacking: index + timespan + at least one
        # subtitle line. Anything shorter is not a valid entry, so skip it.
        if len(lines) < 3:
            continue
        _index, timespan, *subtitle_lines = lines

        # The timespan must contain exactly one " --> " separator.
        bounds = timespan.split(" --> ")
        if len(bounds) != 2:
            continue
        start_time, end_time = bounds

        subtitle_text = " ".join(subtitle_lines).strip()
        models.append(AudioModel.from_srt(start_time, end_time, subtitle_text))

    return models