|
|
import logging
import os
import sys
from typing import Dict, Any

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
|
|
|
|
|
|
|
def predict(text: str) -> Dict[str, Any]:
    """Classify text for PII detection.

    Returns a mapping from label to probability suitable for ``gr.Label``:
    ``{"Asking for PII": p0, "Giving PII": p1}``.  On empty input or an
    inference failure, a single-key dict is returned instead so the UI
    always has something to display.

    Relies on the module-level ``tokenizer`` and ``model`` loaded in the
    ``__main__`` block.
    """
    if not text or text.strip() == "":
        return {"No input provided": 0.0}

    # NOTE(review): this logs raw user text; since the app classifies PII,
    # the log itself may end up containing PII — consider redacting.
    # Lazy %-args avoid formatting the message when INFO is disabled.
    logging.info("User input: %s", text)

    try:
        inputs = tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            max_length=512,
            truncation=True,
        )

        # Inference only: disable autograd bookkeeping.
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        # Sigmoid (not softmax): the two labels are scored independently,
        # so each gets its own probability.
        probabilities = torch.sigmoid(logits)
        probs = probabilities.squeeze().tolist()

        results = {
            "Asking for PII": float(probs[0]),
            "Giving PII": float(probs[1]),
        }
        return results
    except Exception as e:
        # Surface the failure in the UI instead of crashing the demo,
        # but record the traceback so the error is not silently lost.
        logging.exception("Inference failed")
        return {"Error": str(e)}
|
|
|
|
|
|
|
|
|
|
|
# Sample prompts for the UI: plain PII requests/disclosures alongside
# obfuscated spellings (leetspeak, scrambled words, symbol injection).
_example_texts = (
    "what's your blue app id?",
    "I live at 901 Roosevelt St, Redwood City",
    "what's you ph0ne rebmun?",
    "yellow gh>>ost app id? let's chat there",
    "let's z0000m?",
    "Let’s meet at the Starbuck close to Stanford",
)
# gr.Interface expects one list per example row.
examples = [[t] for t in _example_texts]
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Log to stderr/stdout so messages show up in the Space container logs.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[logging.StreamHandler()]
    )

    model_id = "Roblox/roblox-pii-classifier"

    # Optional token for private/gated models (set as a Space secret).
    HF_TOKEN = os.getenv("HF_TOKEN")

    print(f"Loading model: {model_id}")
    try:
        if HF_TOKEN:
            print("Using HF_TOKEN from environment/secrets")
        else:
            print("No HF_TOKEN found, attempting without authentication...")
        # ``token=None`` is the from_pretrained default, so one call path
        # covers both the authenticated and anonymous cases.
        model = AutoModelForSequenceClassification.from_pretrained(
            model_id, token=HF_TOKEN or None
        )
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN or None)

        model.eval()  # inference mode: disables dropout etc.
        print("Model loaded successfully!")
    except Exception as e:
        print(f"Failed to load model: {e}")
        if not HF_TOKEN:
            print("\n⚠️ For private models, you need to set HF_TOKEN as a Space secret:")
            print("  1. Go to your Space Settings")
            print("  2. Add a new secret named 'HF_TOKEN'")
            print("  3. Set your Hugging Face token as the value")
        # sys.exit over the site-injected exit(): the latter is only
        # guaranteed to exist in interactive sessions.
        sys.exit(1)

    demo = gr.Interface(
        fn=predict,
        inputs=gr.Textbox(
            lines=3,
            placeholder="Enter text to analyze for PII content...",
            label="Input Text"
        ),
        outputs=gr.Label(
            num_top_classes=2,
            label="Classification Results"
        ),
        title="PII Detection Demo",
        description="This model detects whether text is asking for or giving personal information (PII).",
        examples=examples,
        flagging_mode="never",
    )

    demo.launch()