Source code for langchain_experimental.data_anonymizer.presidio
from __future__ import annotations

import json
from pathlib import Path
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union

import yaml

from langchain_experimental.data_anonymizer.base import (
    DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
    AnonymizerBase,
    ReversibleAnonymizerBase,
)
from langchain_experimental.data_anonymizer.deanonymizer_mapping import (
    DeanonymizerMapping,
    MappingDataType,
    create_anonymizer_mapping,
)
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
    exact_matching_strategy,
)
from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
    get_pseudoanonymizer_mapping,
)

if TYPE_CHECKING:
    from presidio_analyzer import AnalyzerEngine, EntityRecognizer
    from presidio_analyzer.nlp_engine import NlpEngineProvider
    from presidio_anonymizer import AnonymizerEngine
    from presidio_anonymizer.entities import ConflictResolutionStrategy, OperatorConfig


def _import_analyzer_engine() -> "AnalyzerEngine":
    try:
        from presidio_analyzer import AnalyzerEngine
    except ImportError as e:
        raise ImportError(
            "Could not import presidio_analyzer, please install with "
            "`pip install presidio-analyzer`. You will also need to download a "
            "spaCy model to use the analyzer, e.g. "
            "`python -m spacy download en_core_web_lg`."
        ) from e
    return AnalyzerEngine


def _import_nlp_engine_provider() -> "NlpEngineProvider":
    try:
        from presidio_analyzer.nlp_engine import NlpEngineProvider
    except ImportError as e:
        raise ImportError(
            "Could not import presidio_analyzer, please install with "
            "`pip install presidio-analyzer`. You will also need to download a "
            "spaCy model to use the analyzer, e.g. "
            "`python -m spacy download en_core_web_lg`."
        ) from e
    return NlpEngineProvider


def _import_anonymizer_engine() -> "AnonymizerEngine":
    try:
        from presidio_anonymizer import AnonymizerEngine
    except ImportError as e:
        raise ImportError(
            "Could not import presidio_anonymizer, please install with "
            "`pip install presidio-anonymizer`."
        ) from e
    return AnonymizerEngine


def _import_operator_config() -> "OperatorConfig":
    try:
        from presidio_anonymizer.entities import OperatorConfig
    except ImportError as e:
        raise ImportError(
            "Could not import presidio_anonymizer, please install with "
            "`pip install presidio-anonymizer`."
        ) from e
    return OperatorConfig


# Configuring Anonymizer for multiple languages
# Detailed description and examples can be found here:
# langchain/docs/extras/guides/privacy/multi_language_anonymization.ipynb
DEFAULT_LANGUAGES_CONFIG = {
    # You can also use Stanza or transformers library.
    # See https://microsoft.github.io/presidio/analyzer/customizing_nlp_models/
    "nlp_engine_name": "spacy",
    "models": [
        {"lang_code": "en", "model_name": "en_core_web_lg"},
        # {"lang_code": "de", "model_name": "de_core_news_md"},
        # {"lang_code": "es", "model_name": "es_core_news_md"},
        # ...
        # List of available models: https://spacy.io/usage/models
    ],
}
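
# --- Illustrative sketch (not part of the original module) -------------------
# Example of a custom ``languages_config`` that could be passed to the
# anonymizers defined below to enable multi-language analysis. The Spanish
# model is an assumption for illustration and must be downloaded separately,
# e.g. `python -m spacy download es_core_news_md`.
_EXAMPLE_MULTILINGUAL_CONFIG = {
    "nlp_engine_name": "spacy",
    "models": [
        {"lang_code": "en", "model_name": "en_core_web_lg"},
        {"lang_code": "es", "model_name": "es_core_news_md"},
    ],
}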


class PresidioAnonymizerBase(AnonymizerBase):
    """Base Anonymizer using Microsoft Presidio.

    See more: https://microsoft.github.io/presidio/
    """

    def __init__(
        self,
        analyzed_fields: Optional[List[str]] = None,
        operators: Optional[Dict[str, OperatorConfig]] = None,
        languages_config: Optional[Dict] = None,
        add_default_faker_operators: bool = True,
        faker_seed: Optional[int] = None,
    ):
        """
        Args:
            analyzed_fields: List of fields to detect and then anonymize.
                Defaults to all entities supported by Microsoft Presidio.
            operators: Operators to use for anonymization.
                Operators allow for custom anonymization of detected PII.
                Learn more:
                https://microsoft.github.io/presidio/tutorial/10_simple_anonymization/
            languages_config: Configuration for the NLP engine.
                First language in the list will be used as the main language
                in self.anonymize(...) when no language is specified.
                Learn more:
                https://microsoft.github.io/presidio/analyzer/customizing_nlp_models/
            add_default_faker_operators: Whether to register the default
                faker-based operators for all supported entities. Defaults to True.
            faker_seed: Seed used to initialize faker.
                Defaults to None, in which case faker will be seeded randomly
                and provide random values.
        """
        if languages_config is None:
            languages_config = DEFAULT_LANGUAGES_CONFIG
        OperatorConfig = _import_operator_config()
        AnalyzerEngine = _import_analyzer_engine()
        NlpEngineProvider = _import_nlp_engine_provider()
        AnonymizerEngine = _import_anonymizer_engine()

        self.analyzed_fields = (
            analyzed_fields
            if analyzed_fields is not None
            else list(get_pseudoanonymizer_mapping().keys())
        )

        if add_default_faker_operators:
            self.operators = {
                field: OperatorConfig(
                    operator_name="custom", params={"lambda": faker_function}
                )
                for field, faker_function in get_pseudoanonymizer_mapping(
                    faker_seed
                ).items()
            }
        else:
            self.operators = {}

        if operators:
            self.add_operators(operators)

        provider = NlpEngineProvider(nlp_configuration=languages_config)
        nlp_engine = provider.create_engine()

        self.supported_languages = list(nlp_engine.nlp.keys())

        self._analyzer = AnalyzerEngine(
            supported_languages=self.supported_languages, nlp_engine=nlp_engine
        )
        self._anonymizer = AnonymizerEngine()

    def add_recognizer(self, recognizer: EntityRecognizer) -> None:
        """Add a recognizer to the analyzer

        Args:
            recognizer: Recognizer to add to the analyzer.
        """
        self._analyzer.registry.add_recognizer(recognizer)
        self.analyzed_fields.extend(recognizer.supported_entities)

    def add_operators(self, operators: Dict[str, OperatorConfig]) -> None:
        """Add operators to the anonymizer

        Args:
            operators: Operators to add to the anonymizer.
        """
        self.operators.update(operators)
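

# --- Illustrative sketch (not part of the original module) -------------------
# How the extension hooks above are typically used: a deny-list based
# ``PatternRecognizer`` teaches the analyzer a new entity, and an
# ``OperatorConfig`` controls how matches are replaced. Both classes come from
# the presidio packages; the "TITLE" entity name and deny list are assumptions
# chosen only for illustration.
def _example_add_title_recognizer(anonymizer: "PresidioAnonymizerBase") -> None:
    from presidio_analyzer import PatternRecognizer
    from presidio_anonymizer.entities import OperatorConfig

    # Recognize honorific titles via a simple deny list.
    titles_recognizer = PatternRecognizer(
        supported_entity="TITLE", deny_list=["Mr.", "Mrs.", "Ms.", "Dr."]
    )
    anonymizer.add_recognizer(titles_recognizer)

    # Replace every detected title with a fixed placeholder string.
    anonymizer.add_operators(
        {"TITLE": OperatorConfig("replace", {"new_value": "<TITLE>"})}
    )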


class PresidioAnonymizer(PresidioAnonymizerBase):
    """Anonymizer using Microsoft Presidio."""

    def _anonymize(
        self,
        text: str,
        language: Optional[str] = None,
        allow_list: Optional[List[str]] = None,
        conflict_resolution: Optional[ConflictResolutionStrategy] = None,
    ) -> str:
        """Anonymize text.
        Each PII entity is replaced with a fake value.
        Each time fake values will be different, as they are generated randomly.

        PresidioAnonymizer has no built-in memory -
        so it will not remember the effects of anonymizing previous texts.

        >>> anonymizer = PresidioAnonymizer()
        >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!")
        'My name is Noah Rhodes. Hi Noah Rhodes!'
        >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!")
        'My name is Brett Russell. Hi Brett Russell!'

        Args:
            text: text to anonymize
            language: language to use for analysis of PII
                If None, the first (main) language in the list of languages
                specified in the configuration will be used.
        """
        if language is None:
            language = self.supported_languages[0]
        elif language not in self.supported_languages:
            raise ValueError(
                f"Language '{language}' is not supported. "
                f"Supported languages are: {self.supported_languages}. "
                "Change your language configuration file to add more languages."
            )

        # Check supported entities for given language
        # e.g. IT_FISCAL_CODE is not supported for English in Presidio by default
        # If you want to use it, you need to add a recognizer manually
        supported_entities = []
        for recognizer in self._analyzer.get_recognizers(language):
            recognizer_dict = recognizer.to_dict()
            supported_entities.extend(
                [recognizer_dict["supported_entity"]]
                if "supported_entity" in recognizer_dict
                else recognizer_dict["supported_entities"]
            )

        entities_to_analyze = list(
            set(supported_entities).intersection(set(self.analyzed_fields))
        )

        analyzer_results = self._analyzer.analyze(
            text,
            entities=entities_to_analyze,
            language=language,
            allow_list=allow_list,
        )

        filtered_analyzer_results = (
            self._anonymizer._remove_conflicts_and_get_text_manipulation_data(
                analyzer_results, conflict_resolution
            )
        )

        anonymizer_results = self._anonymizer.anonymize(
            text,
            analyzer_results=analyzer_results,
            operators=self.operators,
        )

        anonymizer_mapping = create_anonymizer_mapping(
            text,
            filtered_analyzer_results,
            anonymizer_results,
        )
        return exact_matching_strategy(text, anonymizer_mapping)
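

# --- Illustrative sketch (not part of the original module) -------------------
# Minimal usage of ``PresidioAnonymizer``: restrict detection to person names
# and anonymize a sentence. Because the class keeps no memory, repeated calls
# produce different fake values; the exact names shown in the docstring above
# will vary from run to run.
def _example_stateless_anonymization() -> str:
    anonymizer = PresidioAnonymizer(analyzed_fields=["PERSON"])
    # The public ``anonymize`` method (inherited from AnonymizerBase) delegates
    # to ``_anonymize`` above; ``language`` defaults to the first configured one.
    return anonymizer.anonymize("My name is John Doe. Hi John Doe!")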


class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase):
    """Reversible Anonymizer using Microsoft Presidio."""

    @property
    def deanonymizer_mapping(self) -> MappingDataType:
        """Return the deanonymizer mapping"""
        return self._deanonymizer_mapping.data

    @property
    def anonymizer_mapping(self) -> MappingDataType:
        """Return the anonymizer mapping
        This is just the reverse version of the deanonymizer mapping."""
        return {
            key: {v: k for k, v in inner_dict.items()}
            for key, inner_dict in self.deanonymizer_mapping.items()
        }

    def _anonymize(
        self,
        text: str,
        language: Optional[str] = None,
        allow_list: Optional[List[str]] = None,
        conflict_resolution: Optional[ConflictResolutionStrategy] = None,
    ) -> str:
        """Anonymize text.
        Each PII entity is replaced with a fake value.
        Each time fake values will be different, as they are generated randomly.
        At the same time, we will create a mapping from each anonymized entity
        back to its original text value.

        Thanks to the built-in memory, all previously anonymised entities
        will be remembered and replaced by the same fake values:
        >>> anonymizer = PresidioReversibleAnonymizer()
        >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!")
        'My name is Noah Rhodes. Hi Noah Rhodes!'
        >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!")
        'My name is Noah Rhodes. Hi Noah Rhodes!'

        Args:
            text: text to anonymize
            language: language to use for analysis of PII
                If None, the first (main) language in the list of languages
                specified in the configuration will be used.
        """
        if language is None:
            language = self.supported_languages[0]

        if language not in self.supported_languages:
            raise ValueError(
                f"Language '{language}' is not supported. "
                f"Supported languages are: {self.supported_languages}. "
                "Change your language configuration file to add more languages."
            )

        # Check supported entities for given language
        # e.g. IT_FISCAL_CODE is not supported for English in Presidio by default
        # If you want to use it, you need to add a recognizer manually
        supported_entities = []
        for recognizer in self._analyzer.get_recognizers(language):
            recognizer_dict = recognizer.to_dict()
            supported_entities.extend(
                [recognizer_dict["supported_entity"]]
                if "supported_entity" in recognizer_dict
                else recognizer_dict["supported_entities"]
            )

        entities_to_analyze = list(
            set(supported_entities).intersection(set(self.analyzed_fields))
        )

        analyzer_results = self._analyzer.analyze(
            text,
            entities=entities_to_analyze,
            language=language,
            allow_list=allow_list,
        )

        filtered_analyzer_results = (
            self._anonymizer._remove_conflicts_and_get_text_manipulation_data(
                analyzer_results, conflict_resolution
            )
        )

        anonymizer_results = self._anonymizer.anonymize(
            text,
            analyzer_results=analyzer_results,
            operators=self.operators,
        )

        new_deanonymizer_mapping = create_anonymizer_mapping(
            text,
            filtered_analyzer_results,
            anonymizer_results,
            is_reversed=True,
        )
        self._deanonymizer_mapping.update(new_deanonymizer_mapping)

        return exact_matching_strategy(text, self.anonymizer_mapping)

    def _deanonymize(
        self,
        text_to_deanonymize: str,
        deanonymizer_matching_strategy: Callable[
            [str, MappingDataType], str
        ] = DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
    ) -> str:
        """Deanonymize text.
        Each anonymized entity is replaced with its original value.
        This method exploits the mapping created during the anonymization process.

        Args:
            text_to_deanonymize: text to deanonymize
            deanonymizer_matching_strategy: function to use to match
                anonymized entities with their original values and replace them.
        """
        if not self._deanonymizer_mapping:
            raise ValueError(
                "Deanonymizer mapping is empty.",
                "Please call anonymize() and anonymize some text first.",
            )

        text_to_deanonymize = deanonymizer_matching_strategy(
            text_to_deanonymize, self.deanonymizer_mapping
        )

        return text_to_deanonymize

    def reset_deanonymizer_mapping(self) -> None:
        """Reset the deanonymizer mapping"""
        self._deanonymizer_mapping = DeanonymizerMapping()

    def save_deanonymizer_mapping(self, file_path: Union[Path, str]) -> None:
        """Save the deanonymizer mapping to a JSON or YAML file.

        Args:
            file_path: Path to file to save the mapping to.

        Example:
        .. code-block:: python

            anonymizer.save_deanonymizer_mapping(file_path="path/mapping.json")
        """
        save_path = Path(file_path)

        if save_path.suffix not in [".json", ".yaml"]:
            raise ValueError(f"{save_path} must have an extension of .json or .yaml")

        # Make sure parent directories exist
        save_path.parent.mkdir(parents=True, exist_ok=True)

        if save_path.suffix == ".json":
            with open(save_path, "w") as f:
                json.dump(self.deanonymizer_mapping, f, indent=2)
        elif save_path.suffix.endswith((".yaml", ".yml")):
            with open(save_path, "w") as f:
                yaml.dump(self.deanonymizer_mapping, f, default_flow_style=False)

    def load_deanonymizer_mapping(self, file_path: Union[Path, str]) -> None:
        """Load the deanonymizer mapping from a JSON or YAML file.

        Args:
            file_path: Path to file to load the mapping from.

        Example:
        .. code-block:: python

            anonymizer.load_deanonymizer_mapping(file_path="path/mapping.json")
        """
        load_path = Path(file_path)

        if load_path.suffix not in [".json", ".yaml"]:
            raise ValueError(f"{load_path} must have an extension of .json or .yaml")

        if load_path.suffix == ".json":
            with open(load_path, "r") as f:
                loaded_mapping = json.load(f)
        elif load_path.suffix.endswith((".yaml", ".yml")):
            with open(load_path, "r") as f:
                loaded_mapping = yaml.load(f, Loader=yaml.FullLoader)

        self._deanonymizer_mapping.update(loaded_mapping)
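

# --- Illustrative sketch (not part of the original module) -------------------
# Round trip with ``PresidioReversibleAnonymizer``: anonymize, persist the
# mapping, then deanonymize in a fresh instance via the public ``deanonymize``
# method inherited from ReversibleAnonymizerBase. The file name
# "deanonymizer_mapping.json" is arbitrary; only .json / .yaml extensions are
# accepted by ``save_deanonymizer_mapping`` above.
def _example_reversible_round_trip() -> str:
    anonymizer = PresidioReversibleAnonymizer(analyzed_fields=["PERSON"])

    anonymized = anonymizer.anonymize("My name is John Doe. Hi John Doe!")

    # Persist the entity mapping so it can be reused in another session.
    anonymizer.save_deanonymizer_mapping("deanonymizer_mapping.json")

    # Later (possibly in another process), load the mapping back and restore
    # the original values from the anonymized text.
    restored = PresidioReversibleAnonymizer(analyzed_fields=["PERSON"])
    restored.load_deanonymizer_mapping("deanonymizer_mapping.json")
    return restored.deanonymize(anonymized)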