Spaces:
Runtime error
Runtime error
| import torch | |
| from torch.utils.data import Dataset | |
| from transformers import RobertaTokenizer | |
class CommentDataset(Dataset):
    """Dataset of tokenized comment texts with scalar metadata features.

    Each item is built from one row of ``dataframe``, which must provide the
    columns ``text``, ``helpfulness_score``, ``toxicity_score``,
    ``readability_score`` and ``is_anomalous``. The column values are copied
    into plain Python lists at construction time.

    Args:
        dataframe: Table-like object whose columns support ``.tolist()``
            (presumably a pandas DataFrame -- confirm against the caller).
        max_len: Length passed to the tokenizer as ``max_length``; texts are
            truncated and padded to this length.
        tokenizer: Optional ready-made tokenizer. It is called exactly like
            the default one (``padding``, ``truncation``, ``max_length``,
            ``return_tensors="pt"``) and must return a mapping with
            ``input_ids`` and ``attention_mask`` tensors. Passing one lets
            several datasets share a single tokenizer instead of loading it
            once per dataset.
        model_name: Checkpoint name given to
            ``RobertaTokenizer.from_pretrained`` when ``tokenizer`` is not
            supplied.

    Raises:
        KeyError: If one of the required columns is missing.
    """

    def __init__(self, dataframe, max_len=128, tokenizer=None,
                 model_name="roberta-base"):
        self.texts = dataframe["text"].tolist()
        self.labels = dataframe["helpfulness_score"].tolist()
        self.toxicity = dataframe["toxicity_score"].tolist()
        self.readability = dataframe["readability_score"].tolist()
        self.anomaly = dataframe["is_anomalous"].tolist()
        # Only load the pretrained tokenizer when the caller did not inject
        # one; loading is comparatively expensive and identical for every
        # split of the same data.
        if tokenizer is None:
            tokenizer = RobertaTokenizer.from_pretrained(model_name)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self) -> int:
        """Return the number of comments in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx) -> dict:
        """Tokenize the comment at ``idx`` and bundle it with its features.

        Returns:
            A dict with the keys ``input_ids`` and ``attention_mask`` (the
            tokenizer outputs with their leading dimension squeezed away),
            ``metadata`` (float tensor holding toxicity, readability and
            anomaly flag, in that order) and ``label`` (float scalar tensor
            holding the helpfulness score).
        """
        encoded = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        # Metadata features: pack the per-comment scalars into one vector.
        metadata = torch.tensor(
            [
                self.toxicity[idx],
                self.readability[idx],
                self.anomaly[idx],
            ],
            dtype=torch.float,
        )
        return {
            # The tokenizer is asked for "pt" tensors for a single text, so
            # dimension 0 is a batch axis of size one; drop it so a
            # DataLoader can stack items itself.
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "metadata": metadata,
            "label": torch.tensor(self.labels[idx], dtype=torch.float),
        }