Spaces:
Sleeping
Sleeping
| from nltk.tokenize import sent_tokenize | |
| import pandas as pd | |
| ###################### | |
| # prerequisite: | |
| # 1. pip install transformers | |
| # 2. Define tokenizer + MAX_LEN | |
| # 3. Construct DistillBERTClass_SL class | |
| # 4. Construct Triage_SL class | |
| # 5. Define predict_SL function | |
| # 6. Load model_SL & call eval() | |
| # 7. Pre_define predict_params_SL | |
| #################### | |
| from transformers import DistilBertTokenizer | |
| tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') | |
| import torch | |
| """### DataSet Class -- Triage_SL""" | |
| from torch.utils.data import Dataset, DataLoader | |
class Triage_SL(Dataset):
    """Map-style dataset that tokenizes one sentence per row for prediction.

    Each item is a dict holding the token ids and the attention mask of one
    sentence, both padded or truncated to ``max_len``. No target/label is
    returned: this dataset is used for inference only.

    Args:
        dataframe: pandas DataFrame with a ``sentence`` column.
        tokenizer: object exposing a Hugging Face style ``encode_plus``
            method that returns ``input_ids`` and ``attention_mask``.
        max_len: length every encoded sentence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer  # used in __getitem__
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the encoded sentence stored at position ``index``.

        Returns:
            dict with ``'ids'`` and ``'mask'``, both ``torch.long`` tensors.

        Raises:
            IndexError: if ``index`` is outside the dataset.
        """
        # IndexError (not StopIteration) is the sequence-protocol signal for
        # "past the end"; a StopIteration leaking out of __getitem__ turns
        # into RuntimeError when the lookup happens inside a generator.
        if not -len(self) <= index < len(self):
            raise IndexError(
                f"index {index} is out of range for dataset of size {len(self)}"
            )

        # Positional lookup, so a DataFrame whose index labels are not
        # 0..n-1 (e.g. after filtering) is still addressed correctly.
        sent = str(self.data["sentence"].iloc[index])
        # Collapse every run of whitespace into a single space.
        sent = " ".join(sent.split())

        # The tokenizer splits the sentence into tokens, adds the special
        # [CLS]/[SEP] tokens, maps tokens to ids, pads or truncates to
        # max_len, and builds the attention mask that separates real tokens
        # from padding.
        inputs = self.tokenizer.encode_plus(
            sent,
            text_pair=None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Replaces the deprecated ``pad_to_max_length=True``.
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        }

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return self.len
def essay_to_sent_df(essay):
    """Split an essay into sentences and return them as a one-column DataFrame.

    The essay is first cut into lines on ``'\\n'``; empty lines are dropped and
    every remaining line is handed to ``sent_tokenize``. The resulting
    sentences are stored, in order, in a ``sentence`` column.
    """
    rows = [
        sentence
        for line in essay.split('\n')
        if line
        for sentence in sent_tokenize(line)
    ]
    return pd.DataFrame(rows, columns=['sentence'])
# Length (in tokens) that every sentence is padded or truncated to before it
# is fed to the model.
MAX_LEN = 512

# ### Predefine predict_params_SL
# Keyword arguments for the prediction DataLoader. Shuffling stays off so the
# predicted labels come back in the same order as the input sentences.
PREDICT_BATCH_SIZE = 1
predict_params_SL = dict(
    batch_size=PREDICT_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

# ### Predict Fn -- predict_SL
# Shared sigmoid module used to turn logits into probabilities.
sigmoid = torch.nn.Sigmoid()
def predict_SL(model, validation_loader):
    """Run ``model`` over every batch and return one hard 0/1 label per sample.

    Args:
        model: callable module taking ``(ids, mask)`` and returning a mapping
            with a ``"logits"`` entry. Assumed to emit a single logit per
            sample (binary classification) -- confirm against the model class.
        validation_loader: iterable of batches, each a dict with ``'ids'`` and
            ``'mask'`` tensors.

    Returns:
        list of floats (``0.0`` or ``1.0``), one per sample, in loader order.
        A label is ``1.0`` when the sigmoid of the logit is strictly above 0.5.
    """
    epoch_val_outputs = []
    cpu_device = 'cpu'
    model.eval()
    with torch.no_grad():
        for data in validation_loader:
            ids = data['ids'].to(cpu_device, dtype=torch.long)
            mask = data['mask'].to(cpu_device, dtype=torch.long)
            logits = model(ids, mask)["logits"]
            # Flatten instead of squeeze() + item(): item() only works on a
            # single-element tensor, so the old form crashed for any batch
            # size other than 1.
            labels = (torch.sigmoid(logits) > 0.5).float().reshape(-1)
            epoch_val_outputs.extend(labels.tolist())
    return epoch_val_outputs
def predict_mainidea_sent_old(paragraph, model):
    """Label every sentence of ``paragraph`` with the model's prediction.

    The text is split into sentences, each sentence is encoded and run through
    ``model`` on the CPU, and the predicted labels are printed and returned
    next to their sentences.

    Returns:
        DataFrame with a ``label`` column (the prediction as a string) and a
        ``sentence`` column.
    """
    # Build the sentence-level dataset and its loader.
    sentences = essay_to_sent_df(paragraph)
    loader = DataLoader(
        Triage_SL(sentences, tokenizer, MAX_LEN),
        **predict_params_SL,
    )

    # Inference runs on the CPU.
    model.to('cpu')

    labels = predict_SL(model, loader)
    print(labels)

    rows = [(str(label), text) for label, text in zip(labels, sentences.sentence)]
    return pd.DataFrame(rows, columns=['label', 'sentence'])