add new transformers model for French + update entities
- README.md +3 -3
- __pycache__/presidio_helpers.cpython-310.pyc +0 -0
- __pycache__/presidio_nlp_engine_config.cpython-310.pyc +0 -0
- __pycache__/transformers_class.cpython-310.pyc +0 -0
- app.py +10 -4
- presidio_helpers.py +2 -1
- presidio_nlp_engine_config.py +94 -105
- recognizers.yaml +11 -11
- requirements.txt +0 -2
- transformers_class.py +52 -0
README.md
CHANGED
@@ -1,6 +1,6 @@
 ---
-title: Anonymizer
-emoji:
+title: Aliae Anonymizer
+emoji: 😻
 colorFrom: gray
 colorTo: gray
 sdk: streamlit
@@ -9,4 +9,4 @@ app_file: app.py
 pinned: false
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/presidio_helpers.cpython-310.pyc
CHANGED
Binary files a/__pycache__/presidio_helpers.cpython-310.pyc and b/__pycache__/presidio_helpers.cpython-310.pyc differ
__pycache__/presidio_nlp_engine_config.cpython-310.pyc
CHANGED
Binary files a/__pycache__/presidio_nlp_engine_config.cpython-310.pyc and b/__pycache__/presidio_nlp_engine_config.cpython-310.pyc differ
__pycache__/transformers_class.cpython-310.pyc
ADDED
Binary file (1.81 kB).
app.py
CHANGED
@@ -56,7 +56,7 @@ st_ta_key = st_ta_endpoint = ""
 
 model_list = [
     "spaCy/en_core_web_lg",
-    "spaCy/
+    "spaCy/fr_core_news_lg",
 ]
 # "flair/ner-english-large",
 #
@@ -78,7 +78,7 @@ lang = st.sidebar.selectbox(
 
 # Extract model package.
 # st_model_package = st_model.split("/")[0]
-
+
 
 # # Remove package prefix (if needed)
 # st_model = (
@@ -87,8 +87,14 @@ st_model_package = 'spaCy'
 # else "/".join(st_model.split("/")[1:])
 # )
 st_model = 'en_core_web_lg'
-
-
+st_model_package = "spaCy"
+
+if lang == 'en':
+    st_model_package = "spaCy"
+    st_model = 'en_core_web_lg'
+elif lang == 'fr':
+    st_model_package = "HuggingFace"
+    st_model = 'fr_core_news_lg'
 
 # if st_model == "Other":
 # st_model_package = st.sidebar.selectbox(
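For readers tracing the control flow: a hypothetical sketch (the dispatch helper below is assumed for illustration, not shown in this commit) of how the `st_model_package` / `st_model` pair selects an engine factory downstream:

def engine_for(model_package: str, model_path: str):
    # Hypothetical dispatch; the app's real wiring lives in presidio_helpers.py.
    if model_package == "spaCy":
        return create_nlp_engine_with_spacy(model_path)
    if model_package == "HuggingFace":
        return create_nlp_engine_with_transformers(model_path)
    raise ValueError(f"Unsupported model package: {model_package}")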
presidio_helpers.py
CHANGED
@@ -24,7 +24,7 @@ from presidio_anonymizer.entities import OperatorConfig
 from presidio_nlp_engine_config import (
     create_nlp_engine_with_spacy,
     # create_nlp_engine_with_flair,
-
+    create_nlp_engine_with_transformers,
     # create_nlp_engine_with_azure_text_analytics,
 )
 
@@ -99,6 +99,7 @@ def get_supported_entities(
     # model_family, model_path, ta_key, ta_endpoint
     # ).get_supported_entities() + ["GENERIC_PII"]
     return ["PERSON", "IBAN_CODE", "PHONE_NUMBER", "CREDIT_CARD", "CRYPTO", "DATE_TIME", "EMAIL_ADDRESS", "IP_ADDRESS", "NRP", "LOCATION", "URL", "FRENCH_SSN", "FRENCH_PASS", "FRENCH_NID"]
+    #
 
 
 @st.cache_data
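A minimal sketch (not part of this commit; the default English engine is used only for illustration) of how such an entity list constrains a Presidio analysis call:

from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()  # the app wires its own engine/registry instead
results = analyzer.analyze(
    text="Call me at +1 212 555 0123",
    entities=["PHONE_NUMBER", "EMAIL_ADDRESS"],  # a subset of the list above
    language="en",
)
print([r.entity_type for r in results])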
presidio_nlp_engine_config.py
CHANGED
@@ -3,6 +3,7 @@ import logging
 import spacy
 from presidio_analyzer import RecognizerRegistry
 from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
+from transformers_class import TransformerRecognizer
 
 logger = logging.getLogger("presidio-streamlit")
 
@@ -34,108 +35,96 @@ def create_nlp_engine_with_spacy(
     return nlp_engine, registry
 
 
-# […] (a long block of deleted commented-out engine setup, not recoverable here)
-# ta_recognizer = TextAnalyticsWrapper(ta_endpoint=ta_endpoint, ta_key=ta_key)
-# nlp_configuration = {
-#     "nlp_engine_name": "spacy",
-#     "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
-# }
-#
-# nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
-#
-# registry.add_recognizer(ta_recognizer)
-# registry.remove_recognizer("SpacyRecognizer")
-#
-# return nlp_engine, registry
+def create_nlp_engine_with_transformers(
+    model_path: str,
+) -> Tuple[NlpEngine, RecognizerRegistry]:
+    """
+    Instantiate an NlpEngine with a TransformersRecognizer and a small spaCy model.
+    The TransformersRecognizer returns results from transformers models; the spaCy
+    model returns NlpArtifacts such as POS tags and lemmas.
+    :param model_path: HuggingFace model path.
+    """
+
+    # if not spacy.util.is_package("en_core_web_sm"):
+    #     spacy.cli.download("en_core_web_sm")
+    # # Using a small spaCy model + a HF NER model
+    # transformers_recognizer = TransformersRecognizer(model_path=model_path)
+    #
+    # if model_path == "StanfordAIMI/stanford-deidentifier-base":
+    #     transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
+    # elif model_path == "obi/deid_roberta_i2b2":
+    #     transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
+    # else:
+    #     print(f"Warning: Model has no configuration, loading default.")
+    #     transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
+
+    # Use a small spaCy model; no need for both spaCy and HF models.
+    # The transformers model is used here as a recognizer, not as an NlpEngine.
+    if not spacy.util.is_package(model_path):
+        spacy.cli.download(model_path)
+
+    nlp_configuration = {
+        "nlp_engine_name": "spacy",
+        "models": [{"lang_code": model_path.split('_')[0], "model_name": model_path}],
+    }
+
+    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+    registry = RecognizerRegistry()
+    registry = load_predefined_recognizers(registry)
+
+    mapping_labels = {"PER": "PERSON", "LOC": "LOCATION"}
+    model_name = "AliaeAI/camembert_anonymizer_production_v2"  # alternatives: "Jean-Baptiste/camembert-ner", "AliaeAI/camembert_anonymizer_production"
+    transformers_recognizer = TransformerRecognizer(model_name, mapping_labels)
+
+    registry.add_recognizer(transformers_recognizer)
+    registry.remove_recognizer("SpacyRecognizer")
+
+    return nlp_engine, registry
+
+
+from presidio_analyzer.predefined_recognizers import PhoneRecognizer, EmailRecognizer, CreditCardRecognizer, CryptoRecognizer, DateRecognizer, IpRecognizer, IbanRecognizer, UrlRecognizer
+import phonenumbers
+
+def load_predefined_recognizers(registry, lang='fr'):
+    # phone number
+    phone_recognizer_fr = PhoneRecognizer(supported_language=lang, supported_regions=phonenumbers.SUPPORTED_REGIONS, context=['téléphone'])
+    registry.add_recognizer(phone_recognizer_fr)
+
+    # email
+    email_recognizer_fr = EmailRecognizer(supported_language=lang, context=["email", "mail", "e-mail"])
+    registry.add_recognizer(email_recognizer_fr)
+
+    # credit card
+    creditcard_recognizer_fr = CreditCardRecognizer(supported_language=lang, context=["crédit", "carte", "carte de crédit"])
+    registry.add_recognizer(creditcard_recognizer_fr)
+
+    # crypto
+    crypto_recognizer_fr = CryptoRecognizer(supported_language=lang, context=["crypto"])
+    registry.add_recognizer(crypto_recognizer_fr)
+
+    # date time
+    date_recognizer_fr = DateRecognizer(supported_language=lang, context=["mois", "date", "jour", "année"])
+    registry.add_recognizer(date_recognizer_fr)
+
+    # ip address
+    ip_recognizer_fr = IpRecognizer(supported_language=lang, context=["IP", "ip"])
+    registry.add_recognizer(ip_recognizer_fr)
+
+    # iban
+    iban_recognizer_fr = IbanRecognizer(supported_language=lang, context=["IBAN", "iban", "bancaire", "compte"])
+    registry.add_recognizer(iban_recognizer_fr)
+
+    # URL
+    url_recognizer_fr = UrlRecognizer(supported_language=lang, context=["site", "web"])
+    registry.add_recognizer(url_recognizer_fr)
+
+    # load from yaml
+    registry.add_recognizers_from_yaml("recognizers.yaml")
+
+    return registry
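For orientation, a minimal usage sketch (not part of the commit; downloads models on first run) wiring this factory into Presidio's AnalyzerEngine:

from presidio_analyzer import AnalyzerEngine
from presidio_nlp_engine_config import create_nlp_engine_with_transformers

# Build the French pipeline added in this commit and run one analysis pass.
nlp_engine, registry = create_nlp_engine_with_transformers("fr_core_news_lg")
analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry,
                          supported_languages=["fr"])
results = analyzer.analyze(text="Jean Dupont habite à Paris.", language="fr")
for r in results:
    print(r.entity_type, r.start, r.end, round(r.score, 2))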
recognizers.yaml
CHANGED
@@ -1,15 +1,15 @@
 recognizers:
-  -
-    name: "FRENCH_NID"
-    supported_language: "fr"
-    patterns:
-      -
-        name: "FRENCH_NID"
-        regex: "[0-9]{12}|([A-Z]|[0-9]){9}"
-        score: 0.5
-    context:
-      - national
-    supported_entity: "FRENCH_NID"
+  # -
+  #   name: "FRENCH_NID"
+  #   supported_language: "fr"
+  #   patterns:
+  #     -
+  #       name: "FRENCH_NID"
+  #       regex: "[0-9]{12}|([A-Z]|[0-9]){9}"
+  #       score: 0.5
+  #   context:
+  #     - national
+  #   supported_entity: "FRENCH_NID"
   -
     name: "FRENCH_NID"
     supported_language: "en"
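A quick, illustrative check of what the FRENCH_NID regex accepts (a 12-digit number, or 9 alphanumeric characters):

import re

# Same pattern as in the YAML above.
nid = re.compile(r"[0-9]{12}|([A-Z]|[0-9]){9}")
print(bool(nid.fullmatch("123456789012")))  # True: 12 digits
print(bool(nid.fullmatch("AB123CD45")))     # True: 9 alphanumeric characters
print(bool(nid.fullmatch("12345")))         # False: matches neither alternative

Note that Presidio scans text with search semantics rather than a full match, so these calls only illustrate the pattern's shape.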
requirements.txt
CHANGED
@@ -7,7 +7,5 @@ python-dotenv
 st-annotated-text
 torch
 transformers
-flair
-openai
 spacy
 azure-ai-textanalytics
transformers_class.py
ADDED
@@ -0,0 +1,52 @@
+from transformers import pipeline
+from presidio_analyzer import (
+    RecognizerResult,
+    EntityRecognizer,
+    AnalysisExplanation,
+)
+from presidio_analyzer.nlp_engine import NlpEngine, SpacyNlpEngine, NlpArtifacts
+
+class TransformerRecognizer(EntityRecognizer):
+    def __init__(
+        self,
+        model_id_or_path,
+        mapping_labels,
+        aggregation_strategy="simple",
+        supported_language="fr",
+        ignore_labels=["O", "MISC"],
+    ):
+        # init the transformers pipeline for the given model id or path
+        self.pipeline = pipeline(
+            "token-classification", model=model_id_or_path, aggregation_strategy=aggregation_strategy, ignore_labels=ignore_labels
+        )
+        # map model labels to Presidio entity labels
+        self.label2presidio = mapping_labels
+
+        # pass the supported entities up to the parent class
+        super().__init__(supported_entities=list(self.label2presidio.values()), supported_language=supported_language)
+
+    def load(self) -> None:
+        """No loading is required."""
+        pass
+
+    def analyze(
+        self, text: str, entities=None, nlp_artifacts: NlpArtifacts = None
+    ):
+        """
+        Extract entities using the transformers pipeline.
+        """
+        results = []
+
+        predicted_entities = self.pipeline(text)
+        if len(predicted_entities) > 0:
+            for e in predicted_entities:
+                if e["entity_group"] not in self.label2presidio:
+                    continue
+                converted_entity = self.label2presidio[e["entity_group"]]
+                if entities is None or converted_entity in entities:  # None check first avoids `in None` TypeError
+                    results.append(
+                        RecognizerResult(
+                            entity_type=converted_entity, start=e["start"], end=e["end"], score=e["score"]
+                        )
+                    )
+        return results
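A standalone smoke test (illustrative; it substitutes a public French NER model and downloads it on first run) for the new recognizer:

from transformers_class import TransformerRecognizer

# Label mapping mirrors the one used in presidio_nlp_engine_config.py.
recognizer = TransformerRecognizer(
    "Jean-Baptiste/camembert-ner",  # public model named in the commit's comments
    {"PER": "PERSON", "LOC": "LOCATION"},
)
for r in recognizer.analyze("Marie Dupont travaille à Lyon."):
    print(r.entity_type, r.start, r.end, round(r.score, 2))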