def __init__(
    self,
    llm: BaseLanguageModel,
    verbose: bool = True,
    similarity_threshold: int = 80,
    use_unclustered_models: bool = False,
) -> None:
    self.llm = llm
    self.verbose = verbose

    # Set the percentage value for how similar two video model image
    # descriptions should be in order for us to cluster them into a group
    self._SIMILARITY_THRESHOLD = similarity_threshold

    # Set to True if you want to include video models which were not clustered.
    # Will likely result in closed-caption artifacts
    self._USE_NON_CLUSTERED_VIDEO_MODELS = use_unclustered_models
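
# Construction sketch only (not part of the listing). The enclosing class and
# module are not shown in this section, so the CombineProcessor name and the
# import path below are assumptions; FakeListLLM merely stands in for a real model.

from langchain_core.language_models.fake import FakeListLLM
from langchain_experimental.video_captioning.services.combine_service import (
    CombineProcessor,  # assumed name of the class these methods belong to
)

llm = FakeListLLM(responses=["Result: A person walks along the beach."])
processor = CombineProcessor(
    llm,
    verbose=False,
    similarity_threshold=80,       # percentage used when clustering descriptions
    use_unclustered_models=False,  # True keeps models that never joined a cluster
)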
def process(
    self,
    video_models: List[VideoModel],
    run_manager: Optional[CallbackManagerForChainRun] = None,
) -> List[VideoModel]:
    # Remove any consecutive duplicates
    video_models = self._remove_consecutive_duplicates(video_models)

    # Holds the video models after clustering has been applied
    video_models_post_clustering = []
    # In this case, index represents a divider between clusters
    index = 0
    for start, end in self._get_model_clusters(video_models):
        start_vm, end_vm = video_models[start], video_models[end]

        if self._USE_NON_CLUSTERED_VIDEO_MODELS:
            # Append all the non-clustered models in between model clusters
            # staged for OpenAI combination
            video_models_post_clustering += video_models[index:start]
        index = end + 1

        # Send to llm for description combination
        models_to_combine = video_models[start:index]
        combined_description = self._join_similar_video_models(
            models_to_combine, run_manager
        )

        # Strip any prefixes that are redundant in the context of closed-captions
        stripped_description = self._remove_video_model_description_prefix(
            combined_description, run_manager
        )

        # Create a new video model which is the combination of all the models in
        # the cluster
        combined_and_stripped_model = VideoModel(
            start_vm.start_time, end_vm.end_time, stripped_description
        )

        video_models_post_clustering.append(combined_and_stripped_model)

    if self._USE_NON_CLUSTERED_VIDEO_MODELS:
        # Append any non-clustered models present after every clustered model
        video_models_post_clustering += video_models[index:]

    return video_models_post_clustering
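
# Illustration only (not part of the class): the loop above walks cluster
# boundaries as (start, end) index pairs and uses `index = end + 1` as the
# divider, so video_models[index:start] are the models between clusters and
# video_models[start:end + 1] are the models combined into one caption.
# A standalone sketch of that slice arithmetic, using hypothetical clusters:

def partition_by_clusters(n_models, clusters):
    """Return (pass_through_runs, combined_groups) of indexes for n_models."""
    pass_through, combined = [], []
    index = 0
    for start, end in clusters:
        pass_through.append(list(range(index, start)))  # kept only when unclustered models are used
        index = end + 1
        combined.append(list(range(start, index)))      # combined into a single caption
    pass_through.append(list(range(index, n_models)))   # tail after the last cluster
    return pass_through, combined

# e.g. 8 models with clusters (1, 3) and (5, 6):
# pass-through runs -> [[0], [4], [7]], combined groups -> [[1, 2, 3], [5, 6]]
print(partition_by_clusters(8, [(1, 3), (5, 6)]))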
def _remove_consecutive_duplicates(
    self,
    video_models: List[VideoModel],
) -> List[VideoModel]:
    buffer: List[VideoModel] = []

    for video_model in video_models:
        # Join this model and the previous model if they have the same image
        # description
        if (
            len(buffer) > 0
            and buffer[-1].image_description == video_model.image_description
        ):
            buffer[-1].end_time = video_model.end_time
        else:
            buffer.append(video_model)

    return buffer

def _remove_video_model_description_prefix(
    self,
    description: str,
    run_manager: Optional[CallbackManagerForChainRun] = None,
) -> str:
    conversation = LLMChain(
        llm=self.llm,
        prompt=REMOVE_VIDEO_MODEL_DESCRIPTION_PROMPT,
        verbose=self.verbose,
        callbacks=run_manager.get_child() if run_manager else None,
    )
    # Get response from the LLM using LLMChain
    response = conversation({"description": description})

    # Take out the Result: part of the response
    return response["text"].replace("Result:", "").strip()

def _join_similar_video_models(
    self,
    video_models: List[VideoModel],
    run_manager: Optional[CallbackManagerForChainRun] = None,
) -> str:
    descriptions = ""
    count = 1

    for video_model in video_models:
        descriptions += (
            f"Description {count}: " + video_model.image_description + ", "
        )
        count += 1

    # Strip trailing ", "
    descriptions = descriptions[:-2]

    conversation = LLMChain(
        llm=self.llm,
        prompt=JOIN_SIMILAR_VIDEO_MODELS_PROMPT,
        verbose=self.verbose,
        callbacks=run_manager.get_child() if run_manager else None,
    )
    # Get response from the LLM using LLMChain
    response = conversation({"descriptions": descriptions})

    # Take out the Result: part of the response
    return response["text"].replace("Result:", "").strip()
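
# Illustration only (not part of the class): _get_model_clusters below builds a
# word bank keyed by normalized words (lowercased, trailing "s" stripped) that
# maps each word to the indexes of the descriptions containing it. A standalone
# sketch of that normalization and indexing, using made-up descriptions:

def build_word_bank(descriptions):
    def format_word(w: str) -> str:
        return w.lower().rstrip("s")

    word_bank = {}
    for index, description in enumerate(descriptions):
        for word in description.split():
            word_bank.setdefault(format_word(word), []).append(index)
    return word_bank

bank = build_word_bank(
    ["A dog runs on the beach", "Two dogs run on a beach", "A city street at night"]
)
# "dog"/"dogs" and "beach" normalize to keys shared by descriptions 0 and 1,
# which is what later drives their similarity comparison.
print(bank["dog"], bank["beach"])  # [0, 1] [0, 1]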
def _get_model_clusters(
    self, video_models: List[VideoModel]
) -> List[Tuple[int, int]]:
    # Word bank which maps lowercase words (case-insensitive) with trailing s's
    # removed (singular/plural-insensitive) to video model indexes in video_models
    word_bank: Dict[str, List[int]] = {}

    # Function which formats words to be inserted into word bank, as specified
    # above
    def format_word(w: str) -> str:
        return w.lower().rstrip("s")

    # Keeps track of the current video model index
    index = 0

    for vm in video_models:
        for word in vm.image_description.split():
            formatted_word = format_word(word)
            word_bank[formatted_word] = (
                word_bank[formatted_word] if formatted_word in word_bank else []
            ) + [index]
        index += 1

    # Keeps track of the current video model index
    index = 0

    # Maps video model index to list of other video model indexes that have a
    # similarity score above the threshold
    sims: Dict[int, List[int]] = {}

    for vm in video_models:
        # Maps other video model index to number of words it shares in common
        # with this video model
        matches: Dict[int, int] = {}

        for word in vm.image_description.split():
            formatted_word = format_word(word)
            for match in word_bank[formatted_word]:
                if match != index:
                    matches[match] = matches[match] + 1 if match in matches else 1

        if matches:
            # Get the highest number of words another video model shares with
            # this video model
            max_words_in_common = max(matches.values())

            # Get all video model indexes that share the maximum number of words
            # with this video model
            vms_with_max_words = [
                key
                for key, value in matches.items()
                if value == max_words_in_common
            ]

            # Maps other video model index to its similarity score with this
            # video model
            sim_scores: Dict[int, float] = {}

            # Compute similarity score for all video models that share the
            # highest number of word occurrences with this video model
            for vm_index in vms_with_max_words:
                sim_scores[vm_index] = video_models[vm_index].similarity_score(vm)

            # Get the highest similarity score another video model shares with
            # this video model
            max_score = max(sim_scores.values())

            # Get a list of all video models that have the maximum similarity
            # score to this video model
            vms_with_max_score = [
                key for key, value in sim_scores.items() if value == max_score
            ]

            # Finally, transfer all video models with a high enough similarity
            # to this video model into the sims dictionary
            if max_score >= self._SIMILARITY_THRESHOLD:
                sims[index] = []
                for vm_index in vms_with_max_score:
                    sims[index].append(vm_index)

        index += 1

    # Maps video model index to boolean, indicates if we have already checked
    # this video model's similarity array so that we don't have infinite recursion
    already_accessed: Dict[int, bool] = {}

    # Recursively search video_model[vm_index]'s similarity matches to find the
    # earliest and latest video model in the cluster (start and end)
    def _find_start_and_end(vm_index: int) -> Tuple[int, int]:
        close_matches = sims[vm_index]
        first_vm, last_vm = min(close_matches), max(close_matches)
        first_vm, last_vm = min(vm_index, first_vm), max(vm_index, last_vm)

        if not already_accessed.get(vm_index, None):
            already_accessed[vm_index] = True
            for close_match in close_matches:
                if close_match in sims:
                    if vm_index in sims[close_match]:
                        s, e = _find_start_and_end(close_match)
                        first_vm = min(s, first_vm)
                        last_vm = max(e, last_vm)

        return first_vm, last_vm

    # Add the video model cluster results into a set
    clusters = set()
    for vm_index in sims:
        clusters.add(_find_start_and_end(vm_index))

    # Filter the set to include only non-subset intervals
    filtered_clusters = set()
    for interval in clusters:
        start, end = interval[0], interval[1]
        is_subset = any(
            start >= other_start and end <= other_end
            for other_start, other_end in clusters
            if interval != (other_start, other_end)
        )
        if not is_subset:
            filtered_clusters.add(interval)

    # Sort these clusters into a list, sorted using the first element of the
    # tuple (index of video model in the cluster with earliest start time)
    sorted_clusters = sorted(filtered_clusters, key=lambda x: x[0])

    # Merge any overlapping clusters into one big cluster
    def _merge_overlapping_clusters(
        array: List[Tuple[int, int]],
    ) -> List[Tuple[int, int]]:
        if len(array) <= 1:
            return array

        def _merge(
            curr: Tuple[int, int], rest: List[Tuple[int, int]]
        ) -> List[Tuple[int, int]]:
            if curr[1] >= rest[0][0]:
                return [(curr[0], rest[0][1])] + rest[1:]
            return [curr] + rest

        return _merge(array[0], _merge_overlapping_clusters(array[1:]))

    merged_clusters = _merge_overlapping_clusters(sorted_clusters)

    return merged_clusters
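
# Illustration only (not part of the class): after clustering, subset intervals
# are dropped and overlapping intervals are merged, as in the tail of
# _get_model_clusters above. A compact iterative sketch of that post-processing
# (the method above merges recursively), using hypothetical index pairs:

def drop_subsets_and_merge(clusters):
    # Keep only intervals that are not contained in another interval
    filtered = [
        (s, e)
        for (s, e) in clusters
        if not any(
            s >= os and e <= oe for (os, oe) in clusters if (s, e) != (os, oe)
        )
    ]
    # Merge intervals that overlap once sorted by their start index
    merged = []
    for start, end in sorted(filtered):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

# (2, 4) is a subset of (1, 5) and is dropped; (1, 5) and (4, 8) overlap and merge.
print(drop_subsets_and_merge([(1, 5), (2, 4), (4, 8), (10, 12)]))
# -> [(1, 8), (10, 12)]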