Spaces:
Sleeping
Sleeping
| import logging | |
| import os | |
| from pathlib import Path | |
| from typing import List, Tuple | |
| import gradio as gr | |
| import pandas as pd | |
| import spacy | |
| import torch | |
| from dante_tokenizer import DanteTokenizer | |
| from transformers import AutoModelForTokenClassification, AutoTokenizer | |
| from preprocessing import expand_contractions | |
# Load the Portuguese spaCy pipeline, installing it on first use when the
# package is not available in the environment.
try:
    nlp = spacy.load("pt_core_news_sm")
except OSError:
    # spaCy signals a missing pipeline package with OSError; other failures
    # would not be fixed by a download, so they are left to propagate.
    # `spacy.cli.download` installs with the running interpreter instead of
    # whatever `python` happens to be first on PATH.
    spacy.cli.download("pt_core_news_sm")
    nlp = spacy.load("pt_core_news_sm")

dt_tokenizer = DanteTokenizer()

# Key of the model that is loaded at start-up and used as the fallback.
default_model = "News"

# UI display name -> model identifier passed to `from_pretrained`.
model_choices = {
    "News": "Emanuel/porttagger-news-base",
    "Tweets (stock market)": "Emanuel/porttagger-tweets-base",
    "Oil and Gas (academic texts)": "Emanuel/porttagger-oilgas-base",
    "Multigenre": "Emanuel/porttagger-base",
}

# UI display name -> callable that splits raw text into words before tagging.
# The keys must stay in sync with `model_choices`.
pre_tokenizers = {
    "News": nlp,
    "Tweets (stock market)": dt_tokenizer.tokenize,
    "Oil and Gas (academic texts)": nlp,
    "Multigenre": nlp,
}

# NOTE: this is the root logger, so the DEBUG level applies to every library
# that logs through the standard `logging` module as well.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
class MyApp:
    """Hold the currently selected tagger model and its tokenizers.

    Attributes are replaced as a group by `load_model`, which is also used as
    the callback of the model drop-down in the UI.
    """

    def __init__(self) -> None:
        self.model = None
        self.tokenizer = None
        self.pre_tokenizer = None
        self.load_model()

    def load_model(self, model_name: str = default_model) -> None:
        """Load the model, tokenizer and pre-tokenizer registered as *model_name*.

        Args:
            model_name: A key of `model_choices`. Unknown names are logged and
                replaced by `default_model`.
        """
        if model_name not in model_choices:
            logger.error("Selected model is not supported, resetting to the default model.")
            model_name = default_model

        model_id = model_choices[model_name]
        # Resolve everything into locals first: if any step raises, the
        # previously loaded model/tokenizer pair stays in place instead of
        # being left half-replaced.
        pre_tokenizer = pre_tokenizers[model_name]
        model = AutoModelForTokenClassification.from_pretrained(model_id)
        tokenizer = AutoTokenizer.from_pretrained(model_id)

        self.model = model
        self.tokenizer = tokenizer
        self.pre_tokenizer = pre_tokenizer


# Single shared application state; the default model is loaded at import time.
myapp = MyApp()
def predict(text, logger=None) -> Tuple[List[str], List[str], List[str]]:
    """Tag *text* with the model currently held by `myapp`.

    Args:
        text: Raw sentence to tag.
        logger: Optional logger for progress messages; nothing is logged when
            it is None.

    Returns:
        A ``(tokens, labels, scores)`` tuple. ``tokens`` are the words produced
        by the pre-tokenizer, ``labels`` the predicted tag names and ``scores``
        the winning softmax probability as a percentage formatted with two
        decimals. ``labels`` and ``scores`` always have the same length, but
        that length is not guaranteed to equal ``len(tokens)``.
    """
    doc = myapp.pre_tokenizer(text)
    # The pre-tokenizer may yield plain strings or objects exposing `.text`.
    tokens = [token if isinstance(token, str) else token.text for token in doc]

    # `logger` is optional, so it must be checked before every use.
    if logger is not None:
        logger.info("Starting predictions for sentence: %s", text)
    print("Using model {}".format(myapp.model.config.__dict__["_name_or_path"]))

    input_tokens = myapp.tokenizer(
        tokens,
        return_tensors="pt",
        is_split_into_words=True,
        return_offsets_mapping=True,
        return_special_tokens_mask=True,
    )
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output = myapp.model(input_tokens["input_ids"])

    id2label = myapp.model.config.id2label
    i_token = 0
    labels = []
    scores = []
    for off, is_special_token, pred in zip(
        input_tokens["offset_mapping"][0],
        input_tokens["special_tokens_mask"][0],
        output.logits[0],
    ):
        # Keep one prediction per word: drop special tokens and pieces whose
        # start offset is non-zero (presumably continuation sub-tokens, since
        # offsets restart at each pre-split word -- confirm with tokenizer docs).
        if is_special_token or off[0] > 0:
            continue

        label = id2label[int(pred.argmax(dim=-1))]
        if logger is not None:
            logger.info("%s, %s, %s", off, tokens[i_token], label)
        labels.append(label)
        confidence = float(torch.softmax(pred, dim=-1).max())
        scores.append("{:.2f}".format(100 * confidence))
        i_token += 1

    return tokens, labels, scores
def text_analysis(text):
    """Tag a single sentence and build the updates for both output widgets.

    Args:
        text: Raw sentence typed by the user.

    Returns:
        A dict mapping the `output_highlighted` and `output_df` components to
        update payloads that make them visible and set their values.
    """
    text = expand_contractions(text)
    tokens, labels, scores = predict(text, logger)

    # The DataFrame needs equally long columns, but predict() does not
    # guarantee one label per token: pad when labels are missing and trim
    # when there are too many.
    n_tokens = len(tokens)
    missing = n_tokens - len(labels)
    if missing > 0:
        labels = labels + [None] * missing
        # Same string format as the real scores, so the column has one type.
        scores = scores + ["0.00"] * missing
    elif missing < 0:
        labels = labels[:n_tokens]
        scores = scores[:n_tokens]

    tagged = pd.DataFrame(
        {
            "token": tokens,
            "tag": labels,
            "confidence": scores,
        }
    )

    # Interleave unlabelled spaces so the highlighted words do not run together.
    pos_tokens = []
    for token, label in zip(tokens, labels):
        pos_tokens.extend([(token, label), (" ", None)])

    # `update(...)` only builds a payload; it has an effect solely when it is
    # returned to Gradio, which is why it is called here and nowhere else.
    return {
        output_highlighted: output_highlighted.update(visible=True, value=pos_tokens),
        output_df: output_df.update(visible=True, value=tagged),
    }
def batch_analysis(input_file):
    """Tag every sentence of an uploaded text file and write a CoNLL-U file.

    Each input line is split into sentences with the spaCy pipeline, so a
    newline always forces a sentence boundary.

    Args:
        input_file: Uploaded file object; only its ``name`` attribute (a path
            on disk) is used. The file is read as UTF-8.

    Returns:
        A dict mapping the `output_file` component to an update payload that
        makes it visible and points it at the written file.
    """
    with open(input_file.name, encoding="utf-8") as in_f:
        lines = in_f.read().split("\n")
    name = Path(input_file.name).stem

    sents = []
    for line in lines:
        for span in nlp(line).sents:
            sentence = str(span).strip()
            # Whitespace-only spans carry nothing to tag.
            if sentence:
                sents.append(sentence)

    # CoNLL-U rows have 10 tab-separated fields. ID, FORM, LEMMA and UPOS are
    # written explicitly, leaving XPOS, FEATS, HEAD, DEPREL, DEPS and MISC.
    unspecified_fields = "\t_" * 6

    conllu_output = []
    for sent_number, sent in enumerate(sents, start=1):
        sent = expand_contractions(sent)
        conllu_output.append("# sent_id = {}-{}\n".format(name, sent_number))
        conllu_output.append("# text = {}\n".format(sent))
        tokens, labels, _scores = predict(sent, logger)
        for token_id, (token, label) in enumerate(zip(tokens, labels), start=1):
            conllu_output.append(
                "{}\t{}\t_\t{}".format(token_id, token, label) + unspecified_fields + "\n"
            )
        # Blank line terminates the sentence block.
        conllu_output.append("\n")

    # NOTE: fixed path in the working directory, so simultaneous requests
    # overwrite each other's result.
    output_filename = "output.conllu"
    with open(output_filename, "w", encoding="utf-8") as out_f:
        out_f.writelines(conllu_output)

    return {output_file: output_file.update(visible=True, value=output_filename)}
# Static assets for the page, read once at import time. `read_text` closes the
# file for us and the explicit encoding keeps the result locale-independent.
css = Path("style.css").read_text(encoding="utf-8")
top_html = Path("top.html").read_text(encoding="utf-8")
bottom_html = Path("bottom.html").read_text(encoding="utf-8")
# Page layout: header, model selector, one tab per input mode, footer.
with gr.Blocks(css=css) as demo:
    gr.HTML(top_html)

    select_model = gr.Dropdown(
        choices=list(model_choices.keys()),
        label="Tagger model",
        value=default_model,
    )
    # Swap the shared model whenever another entry is picked.
    select_model.change(myapp.load_model, inputs=[select_model])

    with gr.Tab("Single sentence"):
        text = gr.Textbox(placeholder="Enter your text here...", label="Input")
        examples = gr.Examples(
            examples=[
                [
                    "A população não poderia ter acesso a relatórios que explicassem, por exemplo, os motivos exatos de atrasos em obras de linhas e estações."
                ],
                [
                    "Filme 'Star Wars : Os Últimos Jedi' ganha trailer definitivo; assista."
                ],
            ],
            inputs=[text],
            label="Select an example",
        )
        # Both outputs start hidden; text_analysis reveals them with its result.
        output_highlighted = gr.HighlightedText(label="Colorful output", visible=False)
        output_df = gr.Dataframe(label="Tabular output", visible=False)
        submit_btn = gr.Button("Tag it")
        submit_btn.click(
            fn=text_analysis, inputs=text, outputs=[output_highlighted, output_df]
        )

    with gr.Tab("Multiple sentences"):
        # NOTE(review): the glyph before "Upload" appears to be a private-use
        # code point, possibly a mangled emoji -- confirm the intended character.
        gr.HTML(
            """
            <p align="justify">
             Upload a plain text file with sentences in it.
            Find below an example of what we expect the content of the file to look like.
            Sentences are automatically split by spaCy's sentencizer.
            To force an explicit segmentation, manually separate the sentences using a new line for each one.</p>
            """
        )
        # Kept flush-left so the fenced block renders regardless of dedenting.
        gr.Markdown(
            """
```
Então ele hesitou, quase como se estivesse surpreso com as próprias palavras, e recitou:
– Vá e não tornes a pecar!
Baley, sorrindo de repente, pegou no cotovelo de R. Daneel e eles saíram juntos pela porta.
```
"""
        )
        input_file = gr.File(label="Upload your input file here...")
        # Hidden until batch_analysis has produced a file to download.
        output_file = gr.File(label="Tagged file", visible=False)
        submit_btn_batch = gr.Button("Tag it")
        submit_btn_batch.click(
            fn=batch_analysis, inputs=input_file, outputs=output_file
        )

    gr.HTML(bottom_html)

demo.launch(debug=True)