def exact_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str:
    """Deanonymize ``text`` by exact, case-sensitive substring replacement.

    Every anonymized value found in the mapping is replaced, wherever it
    occurs verbatim in ``text``, by the original value it stands for.
    Replacements are applied one mapping entry at a time, in the mapping's
    iteration order, each one operating on the result of the previous one.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and
            original ones, grouped by entity type
            (PERSON, EMAIL_ADDRESS, etc.)

    Returns:
        The text with every exact occurrence of an anonymized value replaced.
    """
    # The entity type itself is irrelevant here; only the
    # anonymized -> original pairs stored under it are used.
    for pairs in deanonymizer_mapping.values():
        for placeholder, real_value in pairs.items():
            text = text.replace(placeholder, real_value)
    return text
def case_insensitive_matching_strategy(
    text: str, deanonymizer_mapping: MappingDataType
) -> str:
    """Case insensitive matching strategy for deanonymization.

    It replaces all the anonymized entities with the original ones
    irrespective of their letter case. Anonymized values are matched as
    literal text (not as regular expressions) and original values are
    inserted verbatim.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and
            original ones

    Returns:
        The text with every case-insensitive occurrence of an anonymized
        value replaced by its original value.

    Examples of matching:
        keanu reeves -> Keanu Reeves
        JOHN F. KENNEDY -> John F. Kennedy
    """
    # Iterate over all the entities (PERSON, EMAIL_ADDRESS, etc.)
    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            if not anonymized:
                # An empty pattern matches at every position and would
                # insert the original value between all characters.
                continue
            # re.escape: the anonymized value is data, not a pattern, so
            # characters such as "." "+" "(" must match only themselves.
            # Callable replacement: a plain string would be treated as a
            # template in which backslashes and group references are expanded.
            text = re.sub(
                re.escape(anonymized),
                lambda _match, replacement=original: replacement,
                text,
                flags=re.IGNORECASE,
            )
    return text
def fuzzy_matching_strategy(
    text: str, deanonymizer_mapping: MappingDataType, max_l_dist: int = 3
) -> str:
    """Deanonymize ``text`` using approximate (fuzzy) substring search.

    For each anonymized value in the mapping, ``fuzzysearch.find_near_matches``
    locates the spans of ``text`` that are close to it, and each such span is
    replaced with the corresponding original value. Mapping entries are
    processed one after another, each on the output of the previous one.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and
            original ones
        max_l_dist: maximum Levenshtein distance between the anonymized
            entity and the text segment to consider it a match

    Returns:
        The text with every near match of an anonymized value replaced.

    Raises:
        ImportError: if the ``fuzzysearch`` package is not installed.

    Examples of matching:
        Kaenu Reves -> Keanu Reeves
        John F. Kennedy -> John Kennedy
    """
    try:
        from fuzzysearch import find_near_matches
    except ImportError as e:
        raise ImportError(
            "Could not import fuzzysearch, please install with "
            "`pip install fuzzysearch`."
        ) from e

    for pairs in deanonymizer_mapping.values():
        for anonymized, original in pairs.items():
            hits = find_near_matches(anonymized, text, max_l_dist=max_l_dist)

            # Rebuild the text as alternating "gap" and "replacement" pieces,
            # walking the hits in the order the search returned them.
            pieces = []
            cursor = 0
            for hit in hits:
                pieces.append(text[cursor : hit.start])  # text before the hit
                pieces.append(original)  # replacement for the hit
                cursor = hit.end
            pieces.append(text[cursor:])  # tail after the last hit

            text = "".join(pieces)
    return text
def combined_exact_fuzzy_matching_strategy(
    text: str, deanonymizer_mapping: MappingDataType, max_l_dist: int = 3
) -> str:
    """Deanonymize ``text`` with an exact pass followed by a fuzzy pass.

    It is a RECOMMENDED STRATEGY. ``exact_matching_strategy`` runs first and
    ``fuzzy_matching_strategy`` is then applied to its output, using the same
    mapping for both passes.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and
            original ones
        max_l_dist: maximum Levenshtein distance between the anonymized
            entity and the text segment to consider it a match
            (used by the fuzzy pass only)

    Returns:
        The text after both replacement passes.

    Examples of matching:
        Kaenu Reves -> Keanu Reeves
        John F. Kennedy -> John Kennedy
    """
    after_exact_pass = exact_matching_strategy(text, deanonymizer_mapping)
    return fuzzy_matching_strategy(after_exact_pass, deanonymizer_mapping, max_l_dist)
def ngram_fuzzy_matching_strategy(
    text: str,
    deanonymizer_mapping: MappingDataType,
    fuzzy_threshold: int = 85,
    use_variable_length: bool = True,
) -> str:
    """N-gram fuzzy matching strategy for deanonymization.

    It replaces all the anonymized entities with the original ones.
    The text is split on whitespace into words, word n-grams with (about) as
    many words as the anonymized entity are generated, and each n-gram is
    compared case-insensitively to the anonymized entity with
    ``fuzzywuzzy.fuzz.ratio``. N-grams scoring strictly above
    ``fuzzy_threshold`` are replaced by the original value.

    A word can belong to at most one accepted match: a candidate n-gram is
    rejected if any of its words was already claimed by an earlier match, so
    replacements never overlap.

    Note that the result is re-joined with single spaces, so the original
    whitespace layout of ``text`` is not preserved.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and
            original ones
        fuzzy_threshold: fuzzy matching threshold; a segment matches when its
            ratio is strictly greater than this value
        use_variable_length: whether to use (n-1, n, n+1)-grams or just
            n-grams

    Returns:
        The deanonymized text, with words separated by single spaces.

    Raises:
        ImportError: if the ``fuzzywuzzy`` package is not installed.
    """

    def generate_ngrams(words_list: List[str], n: int) -> list:
        """Generate n-grams from a list of words"""
        return [
            " ".join(words_list[i : i + n])
            for i in range(len(words_list) - (n - 1))
        ]

    try:
        from fuzzywuzzy import fuzz
    except ImportError as e:
        raise ImportError(
            "Could not import fuzzywuzzy, please install with "
            "`pip install fuzzywuzzy`."
        ) from e

    text_words = text.split()
    replacements = []
    # claimed[k] is True once word k is part of an accepted match.
    claimed: List[bool] = [False] * len(text_words)

    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            anonymized_lower = anonymized.lower()
            word_count = len(anonymized.split())

            if use_variable_length:
                gram_lengths = [word_count - 1, word_count, word_count + 1]
            else:
                gram_lengths = [word_count]

            for n in gram_lengths:
                if n <= 0:  # Take only positive values
                    continue
                segments = generate_ngrams(text_words, n)
                for i, segment in enumerate(segments):
                    # Reject the candidate if ANY of its words is already
                    # taken; checking only the first word would let a longer
                    # n-gram swallow a previously accepted match and corrupt
                    # the slice replacements below.
                    if any(claimed[i : i + n]):
                        continue
                    if fuzz.ratio(anonymized_lower, segment.lower()) > fuzzy_threshold:
                        replacements.append((i, n, original))
                        claimed[i : i + n] = [True] * n

    # Apply from the rightmost match to the leftmost so that a replacement
    # changing the word count does not shift the indices still to be applied.
    replacements.sort(key=lambda x: x[0], reverse=True)
    for start, length, replacement in replacements:
        text_words[start : start + length] = replacement.split()

    return " ".join(text_words)