# Page-scrape residue (not part of the program): the Spaces status banner read "Runtime error".
import json
import os
import random
import subprocess
import sys

import gradio as gr
import spacy
from huggingface_hub import snapshot_download
from spacy import displacy
from spacy.tokens import Span
# download spacy model --
# Invoke the spaCy CLI through the interpreter that is running this app
# (sys.executable) so the model is installed into the same environment, and pass
# the command as an argument list so no shell is involved.
# A failed download is reported but not fatal: the model may already be installed.
if subprocess.run(
    [sys.executable, "-m", "spacy", "download", "en_core_web_sm"]
).returncode != 0:
    print("Warning: downloading spaCy model 'en_core_web_sm' failed.")
| # # set up colors for PII types --- | |
| # options = { | |
| # "colors": { | |
| # "NAME_STUDENT": "#7FDBFF", # Soft blue | |
| # "EMAIL": "#008080", # Dark cyan | |
| # "USERNAME": "#C3B1E1", # Pastel violet | |
| # "ID_NUM": "#2ECC40", # Medium green | |
| # "PHONE_NUM": "#FF851B", # Deep orange | |
| # "URL_PERSONAL": "#4682B4", # Steel blue | |
| # "STREET_ADDRESS": "#808000", # Muted olive | |
| # } | |
| # } | |
# Highlight colour for each PII label; handed to displacy.render as its options.
options = {
    "colors": dict(
        NAME_STUDENT="#6EB5FF",  # Lighter blue
        EMAIL="#42D4B5",  # Light teal
        USERNAME="#D8B4E2",  # Light lavender
        ID_NUM="#7AE88F",  # Light green
        PHONE_NUM="#FFB87D",  # Light peach
        URL_PERSONAL="#C9B4E2",  # Pale purple
        STREET_ADDRESS="#B4B77F",  # Light olive
    ),
}
| # download datamix --- | |
def download_data():
    """Fetch a snapshot of the ``rbiswasfc/pii-datamix`` dataset repo into ``./data``."""
    snapshot_args = {
        "repo_id": "rbiswasfc/pii-datamix",
        "repo_type": "dataset",
        "local_dir": "./data",
    }
    snapshot_download(**snapshot_args)
    print("Data downloaded!")


# Runs at import time: the data file is read immediately below.
download_data()
# load data ---
# Pin the encoding: without it open() decodes with the locale default, which
# can mis-read or reject non-ASCII text in the JSON on non-UTF-8 systems.
with open("./data/datamix.json", encoding="utf-8") as f:
    data = json.load(f)

# The top-level keys of the datamix are the selectable subsets.
subsets = list(data.keys())

# PII choices are the labels of the colour map plus "Random", which
# render_sample treats as "do not filter by PII type".
pii_types = list(options["colors"].keys())
pii_types.append("Random")

# Only nlp.vocab is used by render_sample (to build a Doc from pre-tokenised text).
nlp = spacy.load("en_core_web_sm")
| # render sample -- | |
def _bio_to_spans(labels):
    """Collapse BIO tags into ``(start, end, label)`` token spans (end exclusive).

    A ``B-`` tag opens a span, closing any span that is still open. An ``I-``
    tag extends the open span and is ignored when no span is open. Any other
    tag closes the open span. The span label is taken from its opening ``B-``
    tag with the two-character prefix removed.
    """
    spans = []
    start = None  # index of the opening B- tag, or None when outside a span
    end = 0
    for index, label in enumerate(labels):
        if label.startswith('B-'):
            if start is not None:
                spans.append((start, end, labels[start][2:]))
            start, end = index, index + 1
        elif label.startswith('I-') and start is not None:
            end = index + 1
        elif start is not None:
            spans.append((start, end, labels[start][2:]))
            start = None
    # Flush a span that runs to the end of the sequence.
    if start is not None:
        spans.append((start, end, labels[start][2:]))
    return spans


def render_sample(subset, pii_type):
    """Pick a random sample from ``subset`` and render its PII entities as HTML.

    Args:
        subset: Key into the module-level ``data`` mapping.
        pii_type: ``"Random"`` to draw from every sample in the subset;
            otherwise only samples whose ``'piis'`` entry contains this value
            are eligible.

    Returns:
        A pair ``({'document': <sample['document']>}, markup)`` where ``markup``
        is the output of ``displacy.render`` in ``"ent"`` style.

    Raises:
        gr.Error: If no sample in the subset matches ``pii_type``. The previous
            retry-until-match loop never terminated in that case.
    """
    candidates = data[subset]
    if pii_type != "Random":
        # Filtering first and choosing once gives the same uniform draw over
        # matching samples as rejection sampling, but always terminates.
        candidates = [item for item in candidates if pii_type in item['piis']]
    if not candidates:
        raise gr.Error(
            f"No samples in subset {subset!r} contain PII type {pii_type!r}."
        )
    sample = random.choice(candidates)

    # Log which document was drawn.
    print("---" * 10)
    print(sample['document'])
    print("---" * 10)

    # Build the Doc directly from the stored tokens so the token indices line
    # up with the per-token labels.
    doc = spacy.tokens.Doc(
        nlp.vocab,
        words=sample['tokens'],
        spaces=sample['trailing_whitespace'],
    )
    doc.ents = [
        Span(doc, start, end, label)
        for start, end, label in _bio_to_spans(sample['labels'])
    ]

    output = displacy.render(doc, style="ent", jupyter=False, options=options)
    return {'document': sample['document']}, output
# app layout & callback ---
# Two dropdowns choose the data subset and the PII type to focus on; the
# "Sample" button calls render_sample and shows its two return values in the
# JSON and HTML components.
# NOTE(review): indentation was lost in this copy of the file. The nesting
# below (both dropdowns inside the Row, the other components directly under
# Blocks) is a reconstruction -- confirm against the original layout.
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
with gr.Blocks() as demo:
    with gr.Row():
        subset_dropdown = gr.Dropdown(
            subsets,
            value=subsets[0],
            label="Subset",
            info="Select data subset..."
        )
        focus_pii = gr.Dropdown(
            pii_types,
            value="Random",
            label="PII Focus",
            info="Select a PII type to focus on..."
        )
    sample_btn = gr.Button("Sample")
    document_id_display = gr.JSON(label="Document ID")
    sample_display = gr.HTML(label="Example")
    # callback ---
    # Inputs and outputs are positional: they map onto render_sample's
    # (subset, pii_type) parameters and its (dict, markup) return pair.
    sample_btn.click(
        fn=render_sample,
        inputs=[subset_dropdown, focus_pii],
        outputs=[document_id_display, sample_display],
    )
# launch app ---
demo.launch()