def format_duplicated_operator(operator_name: str, count: int) -> str:
    """Format the operator name with the count.

    Every ``<`` and ``>`` character is removed from ``operator_name`` and a
    trailing ``_<digits>`` suffix (left by a previous call) is dropped, then
    ``_{count}`` is appended. The result is wrapped in angle brackets again
    only when the original name both started with ``<`` and ended with ``>``.

    Args:
        operator_name: Operator placeholder or value to disambiguate,
            e.g. ``"<PERSON>"`` or ``"<PERSON_2>"``.
        count: Number to append as the new suffix.

    Returns:
        The name carrying the new suffix, e.g. ``"<PERSON_3>"``.
    """
    # Strip the brackets first so the suffix pattern can anchor at the end.
    clean_operator_name = re.sub(r"[<>]", "", operator_name)
    # Remove an existing numeric suffix so counts replace rather than stack.
    clean_operator_name = re.sub(r"_\d+$", "", clean_operator_name)

    if operator_name.startswith("<") and operator_name.endswith(">"):
        return f"<{clean_operator_name}_{count}>"
    return f"{clean_operator_name}_{count}"
@dataclass
class DeanonymizerMapping:
    """Deanonymizer mapping.

    Holds a nested mapping of the form ``{entity_type: {key: value}}``.

    Attributes:
        mapping: The nested mapping. By default it is a ``defaultdict`` whose
            missing entity types are created on first access.
    """

    mapping: MappingDataType = field(
        default_factory=lambda: defaultdict(lambda: defaultdict(str))
    )

    @property
    def data(self) -> MappingDataType:
        """Return the deanonymizer mapping as plain nested dictionaries."""
        return {k: dict(v) for k, v in self.mapping.items()}

    def update(self, new_mapping: MappingDataType) -> None:
        """Update the deanonymizer mapping with new values.

        Duplicated values are not added: a value is skipped when it is already
        stored under the same entity type, or when it has already been added
        earlier in this same call (the set of values added during a call is
        shared across all entity types of ``new_mapping``).

        If an incoming key already exists for its entity type, the key is
        rewritten with ``format_duplicated_operator`` using a running count
        that starts at the number of stored entries plus one. If the rewritten
        key is itself already taken, the count is advanced until a free key is
        found, so an existing entry is never overwritten.

        Args:
            new_mapping: Mapping ``{entity_type: {key: value}}`` to merge in.
        """
        seen_values = set()

        for entity_type, values in new_mapping.items():
            entity_mapping = self.mapping[entity_type]
            count = len(entity_mapping) + 1

            for key, value in values.items():
                if value in seen_values or value in entity_mapping.values():
                    continue

                new_key = key
                if key in entity_mapping:
                    new_key = format_duplicated_operator(key, count)
                    # The stored keys may have gaps in their numbering, so
                    # the generated key can collide with an existing one;
                    # keep counting until it is unique instead of clobbering.
                    while new_key in entity_mapping:
                        count += 1
                        new_key = format_duplicated_operator(key, count)

                entity_mapping[new_key] = value
                seen_values.add(value)
                count += 1
def create_anonymizer_mapping(
    original_text: str,
    analyzer_results: List["RecognizerResult"],
    anonymizer_results: "EngineResult",
    is_reversed: bool = False,
) -> MappingDataType:
    """Create the mapping used to anonymize and/or deanonymize a text.

    This function exploits the results returned by the analysis and
    anonymization processes.

    If is_reversed is False, it constructs a mapping from each original text
    value to its anonymized value. If is_reversed is True, it constructs a
    mapping from each anonymized value back to its original text value.

    Each original value is recorded once per entity type. If an anonymized
    value is already present for an entity type (as a key or as a value), it
    is disambiguated with ``format_duplicated_operator``: the first occurrence
    is kept unchanged and later ones receive a numeric suffix starting at 2.

    Example of mapping (is_reversed=False):
        {
            "PERSON": {
                "<original>": "<anonymized>",
                "John Doe": "Slim Shady"
            },
            "PHONE_NUMBER": {
                "111-111-1111": "555-555-5555"
            }
            ...
        }

    Args:
        original_text: The text that was analyzed and anonymized.
        analyzer_results: Analysis results; each item provides ``start`` and
            ``end`` offsets into ``original_text``.
        anonymizer_results: Anonymization result whose ``items`` provide
            ``start``, ``entity_type`` and ``text``.
        is_reversed: Selects the direction of the mapping (see above).

    Returns:
        A ``defaultdict`` of ``{entity_type: {key: value}}``.
    """
    # We are able to zip and loop through both lists because we expect
    # them to return corresponding entities for each identified piece
    # of analyzable data from our input.

    # We sort them by their 'start' attribute because it allows us to
    # match corresponding entities by their position in the input text.
    # Sorted copies are used so the caller's lists are left untouched.
    sorted_analyzed = sorted(analyzer_results, key=lambda d: d.start)
    sorted_anonymized = sorted(anonymizer_results.items, key=lambda d: d.start)

    mapping: MappingDataType = defaultdict(dict)
    count: dict = defaultdict(int)

    for analyzed, anonymized in zip(sorted_analyzed, sorted_anonymized):
        original_value = original_text[analyzed.start : analyzed.end]
        entity_type = anonymized.entity_type
        entity_mapping = mapping[entity_type]

        # Originals are stored as values when reversed, as keys otherwise.
        if is_reversed:
            already_mapped = original_value in entity_mapping.values()
        else:
            already_mapped = original_value in entity_mapping

        if already_mapped:
            continue

        if (
            anonymized.text in entity_mapping.values()
            or anonymized.text in entity_mapping
        ):
            # The first occurrence carries no suffix, so duplicates start at 2.
            anonymized_value = format_duplicated_operator(
                anonymized.text, count[entity_type] + 2
            )
            count[entity_type] += 1
        else:
            anonymized_value = anonymized.text

        if is_reversed:
            entity_mapping[anonymized_value] = original_value
        else:
            entity_mapping[original_value] = anonymized_value

    return mapping