    def _toxicity_init_validate(self, max_size: int) -> Any:
        """
        Validate and initialize toxicity processing configuration.

        Args:
            max_size (int): Maximum sentence size defined in the
                configuration object.

        Raises:
            Exception: If the maximum sentence size exceeds the 5KB limit.

        Note:
            This function ensures that the NLTK punkt tokenizer is downloaded
            if not already present.

        Returns:
            Any: The imported ``nltk`` module, ready for sentence tokenization.
        """
        if max_size > 1024 * 5:
            raise Exception("The sentence length should not exceed 5KB.")
        try:
            nltk = importlib.import_module("nltk")
            nltk.data.find("tokenizers/punkt")
            return nltk
        except ImportError:
            raise ModuleNotFoundError(
                "Could not import nltk python package. "
                "Please install it with `pip install nltk`."
            )
        except LookupError:
            # Punkt is missing locally: download it, then return the module
            # so callers can tokenize immediately.
            nltk.download("punkt")
            return nltk

    def _split_paragraph(
        self, prompt_value: str, max_size: int = 1024 * 4
    ) -> List[List[str]]:
        """
        Split a paragraph into chunks of sentences, respecting the maximum
        size limit.

        Args:
            prompt_value (str): The input paragraph to be split into chunks.
            max_size (int, optional): The maximum size limit in bytes for each
                chunk. Defaults to 1024 * 4 (4KB).

        Returns:
            List[List[str]]: A list of chunks, where each chunk is a list
            of sentences.

        Note:
            This function validates the maximum sentence size based on service
            limits using the '_toxicity_init_validate' function. It uses the
            NLTK sentence tokenizer to split the paragraph into sentences.

        Example:
            paragraph = "This is a sample paragraph. It contains multiple sentences. ..."
            chunks = self._split_paragraph(paragraph, max_size=2048)
        """
        # Validate max. sentence size based on service limits and get back
        # the nltk module for sentence tokenization.
        nltk = self._toxicity_init_validate(max_size)
        sentences = nltk.sent_tokenize(prompt_value)

        chunks: List[List[str]] = []
        current_chunk: List[str] = []
        current_size = 0

        for sentence in sentences:
            sentence_size = len(sentence.encode("utf-8"))
            # If adding a new sentence exceeds max_size
            # or current_chunk has 10 sentences, start a new chunk
            if (current_size + sentence_size > max_size) or (
                len(current_chunk) >= 10
            ):
                if current_chunk:  # Avoid appending empty chunks
                    chunks.append(current_chunk)
                current_chunk = []
                current_size = 0
            current_chunk.append(sentence)
            current_size += sentence_size

        # Add any remaining sentences
        if current_chunk:
            chunks.append(current_chunk)
        return chunks
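
    # Illustrative sketch (not part of the original module): expected chunking
    # behaviour of ``_split_paragraph`` for a small ``max_size``. The sentences
    # and byte counts below are assumptions for illustration only.
    #
    #   chunks = self._split_paragraph(
    #       "One short sentence. Another short sentence. A third one.",
    #       max_size=45,
    #   )
    #   # -> [["One short sentence.", "Another short sentence."], ["A third one."]]
    #   # The first two sentences total 42 bytes (<= 45), so the third sentence
    #   # starts a new chunk; each chunk also holds at most 10 sentences.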
    def validate(self, prompt_value: str, config: Any = None) -> str:
        """
        Check the toxicity of a given text prompt using AWS Comprehend service
        and apply actions based on configuration.

        Args:
            prompt_value (str): The text content to be checked for toxicity.
            config (Dict[str, Any]): Configuration for toxicity checks and
                actions.

        Returns:
            str: The original prompt_value if allowed or no toxicity found.

        Raises:
            ModerationToxicityError: If the prompt contains toxic labels and
                cannot be processed based on the configuration.
        """
        chunks = self._split_paragraph(prompt_value=prompt_value)
        for sentence_list in chunks:
            segments = [{"Text": sentence} for sentence in sentence_list]
            response = self.client.detect_toxic_content(
                TextSegments=segments, LanguageCode="en"
            )
            if self.callback and self.callback.toxicity_callback:
                self.moderation_beacon["moderation_input"] = segments  # type: ignore
                self.moderation_beacon["moderation_output"] = response

            toxicity_found = False
            threshold = config.get("threshold")
            toxicity_labels = config.get("labels")

            if not toxicity_labels:
                # No explicit label filter: any label scoring at or above the
                # threshold counts as toxic.
                for item in response["ResultList"]:
                    for label in item["Labels"]:
                        if label["Score"] >= threshold:
                            toxicity_found = True
                            break
            else:
                # Only the configured labels count, and only when they score
                # at or above the threshold.
                for item in response["ResultList"]:
                    for label in item["Labels"]:
                        if (
                            label["Name"] in toxicity_labels
                            and label["Score"] >= threshold
                        ):
                            toxicity_found = True
                            break

            if self.callback and self.callback.toxicity_callback:
                if toxicity_found:
                    self.moderation_beacon["moderation_status"] = "LABELS_FOUND"
                asyncio.create_task(
                    self.callback.on_after_toxicity(
                        self.moderation_beacon, self.unique_id
                    )
                )
            if toxicity_found:
                raise ModerationToxicityError
        return prompt_value
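
# --- Illustrative sketch (not part of the original module) -------------------
# A minimal, standalone re-creation of the threshold/label check that
# ``validate`` applies to each ``detect_toxic_content`` response. The payload
# below is a hand-written stand-in whose shape mirrors only the fields the code
# above reads (``ResultList`` -> ``Labels`` -> ``Name``/``Score``); the label
# names and scores are assumptions for illustration.
def _example_label_check() -> bool:
    response = {
        "ResultList": [
            {
                "Labels": [
                    {"Name": "PROFANITY", "Score": 0.91},
                    {"Name": "INSULT", "Score": 0.12},
                ]
            }
        ]
    }
    threshold = 0.5                  # plays the role of config["threshold"]
    toxicity_labels = ["PROFANITY"]  # plays the role of config["labels"]

    for item in response["ResultList"]:
        for label in item["Labels"]:
            if label["Name"] in toxicity_labels and label["Score"] >= threshold:
                # validate() would raise ModerationToxicityError at this point.
                return True
    return False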