import re
import spacy
import random
import pandas as pd

# NOTE: this uses the legacy torchtext "data" API (Field/Example/Dataset);
# on torchtext 0.9-0.11 the same classes live under torchtext.legacy.data,
# and they were removed entirely in 0.12+.
from torchtext import data
from spacy.lang.ar import Arabic
from spacy.tokenizer import Tokenizer

# Reading data into a pandas DataFrame
df = pd.read_csv(
    "data/arabic2english.txt",
    delimiter="\t",
    names=["eng", "ar"],
)
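
# arabic2english.txt is assumed to hold one tab-separated pair per line:
# an English sentence (column "eng") followed by its Arabic translation
# (column "ar"), matching the column names above.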

# Loading English language model from spaCy
spacy_eng = spacy.load("en_core_web_sm")

# Creating a blank Arabic language pipeline from spaCy (no trained model needed)
arab = Arabic()

# Creating a tokenizer for Arabic text using the blank pipeline's vocabulary
ar_Tokenizer = Tokenizer(arab.vocab)


def engTokenizer(text):
    """
    Tokenizes English text using spaCy tokenizer.

    Args:
        text (str): The input English text.

    Returns:
        list: List of tokens.
    """
    return [word.text for word in spacy_eng.tokenizer(text)]


def arTokenizer(sentence):
    """
    Cleans and tokenizes an Arabic sentence.

    Periods, quotes, plus signs, and newlines are replaced with spaces and
    whitespace is collapsed before the spaCy Arabic tokenizer is applied.

    Args:
        sentence (str): The input Arabic sentence.

    Returns:
        list: List of tokens.
    """
    return [
        word.text
        for word in ar_Tokenizer(
            re.sub(r"\s+", " ", re.sub(r"[\.\'\"\n+]", " ", sentence)).strip()
        )
    ]
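
# Quick sanity check (illustrative, not part of the original script):
# engTokenizer("How are you?") returns something like ["How", "are", "you", "?"],
# while arTokenizer first strips periods/quotes/newlines and collapses
# whitespace, then tokenizes with the blank Arabic vocabulary.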


# Defining fields for source and target languages using torchtext
SRC = data.Field(
    tokenize=engTokenizer, batch_first=False, init_token="<sos>", eos_token="<eos>"
)
TRG = data.Field(
    tokenize=arTokenizer,
    batch_first=False,
    # tokenizer_language is only consulted when tokenize="spacy"; it is
    # harmless here because a custom tokenizer callable is supplied.
    tokenizer_language="ar",
    init_token="بداية",  # Arabic for "beginning" -- start-of-sequence token
    eos_token="نهاية",  # Arabic for "end" -- end-of-sequence token
)


class TextDataset(data.Dataset):
    """
    Custom dataset class for text data.

    Args:
        df (pandas.DataFrame): DataFrame containing source and target language data.
        src_field (torchtext.data.Field): Field for source language.
        target_field (torchtext.data.Field): Field for target language.
        is_test (bool): Flag indicating if the dataset is for testing (accepted but not used here).

    Attributes:
        fields (list): List of tuples containing field names and corresponding Field objects.
        samples (list): List of data examples.

    """

    def __init__(self, df, src_field, target_field, is_test=False, **kwargs):
        fields = [("eng", src_field), ("ar", target_field)]
        samples = []
        for _, row in df.iterrows():
            eng = row.eng
            ar = row.ar
            samples.append(data.Example.fromlist([eng, ar], fields))

        super().__init__(samples, fields, **kwargs)

    def __len__(self):
        """
        Get the number of samples in the dataset.

        Returns:
            int: Number of samples.
        """
        # The parent torchtext Dataset stores the examples as self.examples.
        return len(self.examples)

    def __getitem__(self, idx):
        """
        Get a sample from the dataset.

        Args:
            idx (int): Index of the sample.

        Returns:
            torchtext.data.Example: Sample at the specified index.
        """
        return self.examples[idx]


# Creating a TextDataset instance
torchdataset = TextDataset(df, SRC, TRG)

# Splitting the dataset into training and validation sets with a fixed seed
random.seed(32)
train_data, valid_data = torchdataset.split(
    split_ratio=0.8, random_state=random.getstate()
)

# Building vocabularies for source and target languages; tokens seen fewer
# than twice in the training split fall back to <unk>
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)
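
# --- Minimal usage sketch (not part of the original script) -----------------
# Assuming the legacy torchtext API used above, the split datasets and built
# vocabularies could feed a training loop via BucketIterator; BATCH_SIZE and
# the device choice below are illustrative assumptions.
import torch

BATCH_SIZE = 64  # assumed hyperparameter
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_iter, valid_iter = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda ex: len(ex.eng),  # bucket sentences of similar length
    sort_within_batch=True,
    device=device,
)

for batch in train_iter:
    # With batch_first=False, tensors are shaped [seq_len, batch_size] and
    # hold indices into SRC.vocab / TRG.vocab respectively.
    src = batch.eng
    trg = batch.ar
    break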