import re
import spacy
import random
import pandas as pd
from torchtext import data
from spacy.lang.ar import Arabic
from spacy.tokenizer import Tokenizer
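
# Note: `from torchtext import data` is the legacy Field/Example/Dataset API.
# In torchtext 0.9-0.11 the same classes live under torchtext.legacy.data, and
# they were removed in later releases, so this script assumes an older torchtext.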

# Reading data into a pandas DataFrame
df = pd.read_csv(
    "data/arabic2english.txt",
    delimiter="\t",
    names=["eng", "ar"],
)
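
# Assumed file format (illustration, not from the source): each line holds an
# English sentence and its Arabic translation separated by a tab, e.g.
#   How are you?\tكيف حالك؟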

# Loading the English language model from spaCy
spacy_eng = spacy.load("en_core_web_sm")

# Creating a blank Arabic language object from spaCy
arab = Arabic()

# Creating a tokenizer for Arabic text using the Arabic vocab
ar_Tokenizer = Tokenizer(arab.vocab)
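# Note: a spaCy Tokenizer built from a bare vocab, with no prefix/suffix/infix
# rules, only splits on whitespace, so Arabic tokenization here is whitespace-based.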


def engTokenizer(text):
    """
    Tokenizes English text using the spaCy English tokenizer.

    Args:
        text (str): The input English text.

    Returns:
        list: List of tokens.
    """
    return [word.text for word in spacy_eng.tokenizer(text)]


def arTokenizer(sentence):
    """
    Tokenizes an Arabic sentence after stripping quotes, periods, and plus
    signs and collapsing repeated whitespace.

    Args:
        sentence (str): The input Arabic sentence.

    Returns:
        list: List of tokens.
    """
    return [
        word.text
        for word in ar_Tokenizer(
            re.sub(r"\s+", " ", re.sub(r"[\.\'\"\n+]", " ", sentence)).strip()
        )
    ]
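
# Expected behaviour (illustrative examples, not from the source):
#   engTokenizer("I love reading.")  ->  ["I", "love", "reading", "."]
#   arTokenizer("أحب القراءة.")      ->  ["أحب", "القراءة"]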


# Defining fields for the source and target languages using torchtext
SRC = data.Field(
    tokenize=engTokenizer, batch_first=False, init_token="<sos>", eos_token="<eos>"
)
TRG = data.Field(
    tokenize=arTokenizer,
    batch_first=False,
    tokenizer_language="ar",
    init_token="بداية",
    eos_token="نهاية",
)
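# Note: init_token/eos_token ("بداية" = "start", "نهاية" = "end") are prepended
# and appended to every example when the field numericalizes a batch.
# tokenizer_language is only consulted when tokenize="spacy"; with a custom
# callable such as arTokenizer it has no effect.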


class TextDataset(data.Dataset):
    """
    Custom torchtext dataset built from a DataFrame of parallel sentences.

    Args:
        df (pandas.DataFrame): DataFrame containing source and target language data.
        src_field (torchtext.data.Field): Field for the source language.
        target_field (torchtext.data.Field): Field for the target language.
        is_test (bool): Flag indicating whether the dataset is for testing.

    Attributes:
        fields (list): List of (field name, Field) tuples.
        examples (list): List of data examples.
    """

    def __init__(self, df, src_field, target_field, is_test=False, **kwargs):
        fields = [("eng", src_field), ("ar", target_field)]
        samples = []
        for i, row in df.iterrows():
            eng = row.eng
            ar = row.ar
            samples.append(data.Example.fromlist([eng, ar], fields))
        super().__init__(samples, fields, **kwargs)
    def __len__(self):
        """
        Get the number of examples in the dataset.

        Returns:
            int: Number of examples.
        """
        # The base torchtext Dataset stores the examples in self.examples.
        return len(self.examples)

    def __getitem__(self, idx):
        """
        Get an example from the dataset.

        Args:
            idx (int): Index of the example.

        Returns:
            torchtext.data.Example: Example at the specified index.
        """
        return self.examples[idx]


# Creating a TextDataset instance
torchdataset = TextDataset(df, SRC, TRG)

# Splitting the dataset into training and validation sets (80/20).
# random.seed(32) returns None; the split is still reproducible because seeding
# the global RNG fixes the state that torchtext falls back to.
train_data, valid_data = torchdataset.split(
    split_ratio=0.8, random_state=random.seed(32)
)

# Building vocabularies for the source and target languages from the training
# split, keeping only tokens that appear at least twice
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)
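
# A possible next step (illustrative sketch, not part of the original file):
# batch the splits with the legacy BucketIterator so that sentences of similar
# length end up in the same batch. The batch size and device are assumptions.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=32,
    sort_key=lambda x: len(x.eng),
    sort_within_batch=True,
    device=device,
)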