import re
import spacy
import random
import pandas as pd
from torchtext import data  # legacy torchtext API (moved to torchtext.legacy.data in torchtext 0.9+)
from spacy.lang.ar import Arabic
from spacy.tokenizer import Tokenizer
# Reading data into a pandas DataFrame
df = pd.read_csv(
    "data/arabic2english.txt",
    delimiter="\t",
    names=["eng", "ar"],
)
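# The corpus file is assumed to be tab-separated, one English/Arabic sentence
# pair per line, e.g. "Hi.\tمرحبا" (illustrative example).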
# Loading English language model from spaCy
spacy_eng = spacy.load("en_core_web_sm")
# Creating an instance of Arabic language model from spaCy
arab = Arabic()
# Creating a tokenizer for Arabic text using the Arabic language model
ar_Tokenizer = Tokenizer(arab.vocab)
def engTokenizer(text):
    """
    Tokenizes English text using the spaCy tokenizer.

    Args:
        text (str): The input English text.

    Returns:
        list: List of tokens.
    """
    return [word.text for word in spacy_eng.tokenizer(text)]
def arTokenizer(sentence):
    """
    Tokenizes an Arabic sentence using the spaCy tokenizer.

    Args:
        sentence (str): The input Arabic sentence.

    Returns:
        list: List of tokens.
    """
    # Strip periods, quotes, '+' signs and newlines, collapse repeated
    # whitespace, then tokenize the cleaned sentence
    return [
        word.text
        for word in ar_Tokenizer(
            re.sub(r"\s+", " ", re.sub(r"[\.\'\"\n+]", " ", sentence)).strip()
        )
    ]
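# Quick illustration of the two tokenizers (the outputs shown are what the
# default models typically produce; exact tokens depend on the installed
# spaCy version):
#   engTokenizer("How are you?")  ->  ["How", "are", "you", "?"]
#   arTokenizer("كيف حالك")       ->  ["كيف", "حالك"]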
# Defining fields for source and target languages using torchtext
SRC = data.Field(
    tokenize=engTokenizer, batch_first=False, init_token="<sos>", eos_token="<eos>"
)
TRG = data.Field(
    tokenize=arTokenizer,
    batch_first=False,
    tokenizer_language="ar",
    init_token="بداية",
    eos_token="نهاية",
)
class TextDataset(data.Dataset):
    """
    Custom dataset class for text data.

    Args:
        df (pandas.DataFrame): DataFrame containing source and target language data.
        src_field (torchtext.data.Field): Field for the source language.
        target_field (torchtext.data.Field): Field for the target language.
        is_test (bool): Flag indicating whether the dataset is for testing.

    Attributes:
        fields (list): List of (name, Field) tuples.
        examples (list): List of data examples (populated by the parent class).
    """

    def __init__(self, df, src_field, target_field, is_test=False, **kwargs):
        fields = [("eng", src_field), ("ar", target_field)]
        samples = []
        for _, row in df.iterrows():
            samples.append(data.Example.fromlist([row.eng, row.ar], fields))
        super().__init__(samples, fields, **kwargs)
    def __len__(self):
        """
        Get the number of samples in the dataset.

        Returns:
            int: Number of samples.
        """
        # data.Dataset stores the examples passed to its constructor as self.examples
        return len(self.examples)

    def __getitem__(self, idx):
        """
        Get a sample from the dataset.

        Args:
            idx (int): Index of the sample.

        Returns:
            torchtext.data.Example: Sample at the specified index.
        """
        return self.examples[idx]
# Creating a TextDataset instance
torchdataset = TextDataset(df, SRC, TRG)
# Splitting the dataset into training and validation sets
# Seed the RNG so the shuffle used by split() is reproducible
random.seed(32)
train_data, valid_data = torchdataset.split(
    split_ratio=0.8, random_state=random.getstate()
)
# Building vocabularies for source and target languages
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)
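# A minimal sketch of how these datasets could be batched for training with the
# legacy torchtext BucketIterator, which groups sentences of similar length to
# reduce padding; the batch size and device choice here are assumptions.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=64,
    sort_key=lambda x: len(x.eng),
    sort_within_batch=True,
    device=device,
)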