File size: 1,343 Bytes
d75e318
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import torch
from torch.utils.data import Dataset
from transformers import RobertaTokenizer

class CommentDataset(Dataset):
    """Map-style dataset pairing tokenized comment text with scalar metadata.

    Every item is a dict with four entries:

    * ``input_ids`` / ``attention_mask``: the tokenizer outputs for the
      comment text, with the leading dimension squeezed away (the tokenizer
      is called with ``return_tensors="pt"``).
    * ``metadata``: float tensor ``[toxicity, readability, anomaly]``.
    * ``label``: float scalar tensor holding the helpfulness score.

    Args:
        dataframe: Table indexable by column name whose columns expose
            ``.tolist()`` (a pandas DataFrame, judging by usage). It must
            provide the columns ``text``, ``helpfulness_score``,
            ``toxicity_score``, ``readability_score`` and ``is_anomalous``.
        max_len: Forwarded to the tokenizer as ``max_length``; texts are
            padded with ``padding='max_length'`` and truncated.
        tokenizer: Optional ready-made tokenizer. It is called as
            ``tokenizer(text, padding=..., truncation=..., max_length=...,
            return_tensors="pt")`` and must return a mapping containing
            ``input_ids`` and ``attention_mask`` tensors. Passing one lets
            several datasets (train/val/test) share a single tokenizer
            instead of each loading its own copy.
        tokenizer_name: Checkpoint handed to
            ``RobertaTokenizer.from_pretrained`` when ``tokenizer`` is not
            supplied. Defaults to ``"roberta-base"``.
    """

    def __init__(self, dataframe, max_len=128, *, tokenizer=None,
                 tokenizer_name="roberta-base"):
        # Columns are copied into plain Python lists once so that
        # __getitem__ only performs list indexing.
        self.texts = dataframe["text"].tolist()
        self.labels = dataframe["helpfulness_score"].tolist()
        self.toxicity = dataframe["toxicity_score"].tolist()
        self.readability = dataframe["readability_score"].tolist()
        self.anomaly = dataframe["is_anomalous"].tolist()

        # Only load a tokenizer when the caller did not inject one.
        if tokenizer is None:
            tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of comments in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the comment at ``idx`` and bundle it with its features.

        Returns:
            dict with keys ``input_ids``, ``attention_mask``, ``metadata``
            and ``label`` (see the class docstring).
        """
        encoded = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

        # Metadata features: the three scalar side-signals packed into one
        # float vector, in the fixed order toxicity, readability, anomaly.
        metadata = torch.tensor(
            [
                self.toxicity[idx],
                self.readability[idx],
                self.anomaly[idx],
            ],
            dtype=torch.float,
        )
        label = torch.tensor(self.labels[idx], dtype=torch.float)

        return {
            # squeeze(0) drops the leading dimension of the tokenizer output
            # so each sample is returned without a batch axis.
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "metadata": metadata,
            "label": label,
        }