Spaces:
Sleeping
Sleeping
| import os | |
| from typing import List, Optional | |
| import logging | |
| import dotenv | |
| from azure.ai.textanalytics import TextAnalyticsClient | |
| from azure.core.credentials import AzureKeyCredential | |
| from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation | |
| from presidio_analyzer.nlp_engine import NlpArtifacts | |
| # Module-level logger, registered under the name "presidio-streamlit". | |
| logger = logging.getLogger("presidio-streamlit") | |
class AzureAIServiceWrapper(EntityRecognizer):
    """Presidio recognizer that delegates PII detection to Azure Text Analytics.

    The recognizer sends the text to ``TextAnalyticsClient.recognize_pii_entities``
    and converts every returned entity whose category is in
    ``supported_entities`` into a Presidio ``RecognizerResult``.
    """

    # Imported in the class body so the enum is available while the class-level
    # constant below is built. The public package export is used instead of the
    # private ``_models`` module.
    from azure.ai.textanalytics import PiiEntityCategory

    # All category values of the SDK's PiiEntityCategory enum; this is the
    # default entity set when the caller does not pass one.
    TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]

    def __init__(
        self,
        supported_entities: Optional[List[str]] = None,
        supported_language: str = "en",
        ta_client: Optional[TextAnalyticsClient] = None,
        ta_key: Optional[str] = None,
        ta_endpoint: Optional[str] = None,
    ):
        """Create the recognizer and, if needed, its Text Analytics client.

        :param supported_entities: Entity categories to report. Falls back to
            ``TA_SUPPORTED_ENTITIES`` when ``None`` or empty.
        :param supported_language: Language code passed to the service.
        :param ta_client: Ready-to-use client. When omitted, one is built from
            ``ta_key`` and ``ta_endpoint``.
        :param ta_key: Key used to build the client when ``ta_client`` is
            not given.
        :param ta_endpoint: Endpoint used to build the client when
            ``ta_client`` is not given.
        """
        if not supported_entities:
            supported_entities = self.TA_SUPPORTED_ENTITIES

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Azure AI Language PII",
        )

        self.ta_key = ta_key
        self.ta_endpoint = ta_endpoint

        # Only build a client when the caller did not inject one.
        if not ta_client:
            ta_client = self.__authenticate_client(ta_key, ta_endpoint)
        self.ta_client = ta_client

    @staticmethod
    def __authenticate_client(key: str, endpoint: str):
        """Build a ``TextAnalyticsClient`` from a key and an endpoint.

        Declared as a static method: it takes no ``self`` but is invoked via
        ``self.``, which would otherwise pass the instance as an extra
        positional argument and raise ``TypeError``.

        :param key: Key wrapped in an ``AzureKeyCredential``.
        :param endpoint: Endpoint passed to the client.
        :return: The constructed ``TextAnalyticsClient``.
        """
        ta_credential = AzureKeyCredential(key)
        text_analytics_client = TextAnalyticsClient(
            endpoint=endpoint, credential=ta_credential
        )
        return text_analytics_client

    def analyze(
        self,
        text: str,
        entities: Optional[List[str]] = None,
        nlp_artifacts: Optional[NlpArtifacts] = None,
    ) -> List[RecognizerResult]:
        """Detect PII in ``text`` using the Text Analytics client.

        :param text: Text to analyze; sent to the service as a single document.
        :param entities: Accepted for interface compatibility. It is not used
            to narrow the results here; results are filtered against
            ``self.supported_entities`` only.
        :param nlp_artifacts: Accepted for interface compatibility; not used.
        :return: One ``RecognizerResult`` per returned entity whose category
            is in ``self.supported_entities``.
        """
        response = self.ta_client.recognize_pii_entities(
            [text], language=self.supported_language
        )

        recognizer_results = []
        for doc in response:
            if doc.is_error:
                # Error documents carry no entities; record them instead of
                # dropping them silently.
                logger.warning(
                    "Text Analytics returned an error document: %s", doc.error
                )
                continue

            for entity in doc.entities:
                if entity.category not in self.supported_entities:
                    continue

                analysis_explanation = self._build_explanation(
                    original_score=entity.confidence_score,
                    entity_type=entity.category,
                )
                recognizer_results.append(
                    RecognizerResult(
                        entity_type=entity.category,
                        start=entity.offset,
                        # End is derived from the matched text's length.
                        end=entity.offset + len(entity.text),
                        score=entity.confidence_score,
                        analysis_explanation=analysis_explanation,
                    )
                )

        return recognizer_results

    @staticmethod
    def _build_explanation(
        original_score: float, entity_type: str
    ) -> AnalysisExplanation:
        """Build the ``AnalysisExplanation`` attached to a result.

        Declared as a static method because it takes no ``self`` but is
        invoked via ``self.`` from ``analyze``.

        :param original_score: Confidence score reported for the entity.
        :param entity_type: Category of the detected entity.
        :return: Explanation naming this recognizer class and the entity type.
        """
        explanation = AnalysisExplanation(
            # ``__name__`` of the class itself; ``__class__.__name__`` would
            # yield the metaclass name instead.
            recognizer=AzureAIServiceWrapper.__name__,
            original_score=original_score,
            textual_explanation=f"Identified as {entity_type} by Text Analytics",
        )
        return explanation

    def load(self) -> None:
        """Do nothing; this recognizer has no resources to load."""
        pass
if __name__ == "__main__":
    # Manual demo: build an analyzer engine through presidio_helpers using the
    # Azure Text Analytics model and run it once over a sample text.
    import presidio_helpers

    # Load environment variables (TA_KEY, TA_ENDPOINT) from a .env file.
    dotenv.load_dotenv()

    # Sample text containing several kinds of PII.
    text = """
Here are a few example sentences we currently support:
Hello, my name is David Johnson and I live in Maine.
My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.
My passport: 191280342 and my phone number: (212) 555-1234.
This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
Kate's social security number is 078-05-1126. Her driver license? it is 1234567A.
"""

    # Resolve the factory first, then the credentials, mirroring the order in
    # which the single call expression would evaluate them.
    build_engine = presidio_helpers.analyzer_engine
    azure_key = os.environ["TA_KEY"]
    azure_endpoint = os.environ["TA_ENDPOINT"]

    analyzer = build_engine(
        model_path="Azure Text Analytics PII",
        ta_key=azure_key,
        ta_endpoint=azure_endpoint,
    )

    # Run the analysis; the result is not used further.
    analyzer.analyze(text=text, language="en")