# Spaces status: Runtime error
| import torch | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer | |
| from datasets import load_dataset | |
| import numpy as np | |
| from sklearn.metrics import accuracy_score, precision_recall_fscore_support | |
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (gold class ids) and
            ``predictions`` (per-class scores, or a tuple whose first
            element holds them).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    # Some models return extra outputs next to the logits; the scores come first.
    scores = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
    preds = scores.argmax(-1)
    # zero_division=0 keeps the value scikit-learn already falls back to for
    # classes that were never predicted, without emitting a warning each time.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
class ArabicDialectTrainer:
    """Fine-tune a pretrained sequence-classification model on Arabic dialects.

    The constructor loads the tokenizer and model, ``prepare_dataset``
    tokenizes a dataset split, and ``train`` runs the training loop and saves
    the resulting model and tokenizer.
    """

    def __init__(self, model_name="CAMeL-Lab/bert-base-arabic-camelbert-msa"):
        """Load the tokenizer and classification model named by ``model_name``.

        Args:
            model_name: Checkpoint identifier passed to ``from_pretrained``.
        """
        # Dialect classification: class id -> country code (18 classes).
        # NOTE(review): assumes these ids match the label encoding of the
        # dataset used for training -- confirm against the dataset card.
        self.dialect_mapping = {
            0: 'OM',   # Oman
            1: 'SD',   # Sudan
            2: 'SA',   # Saudi Arabia
            3: 'KW',   # Kuwait
            4: 'QA',   # Qatar
            5: 'LB',   # Lebanon
            6: 'JO',   # Jordan
            7: 'SY',   # Syria
            8: 'IQ',   # Iraq
            9: 'MA',   # Morocco
            10: 'EG',  # Egypt
            11: 'PL',  # Palestine
            12: 'YE',  # Yemen
            13: 'BH',  # Bahrain
            14: 'DZ',  # Algeria
            15: 'AE',  # United Arab Emirates
            16: 'TN',  # Tunisia
            17: 'LY',  # Libya
        }
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Derive the head size and label names from the mapping so the two
        # cannot drift apart and the saved config carries readable labels.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(self.dialect_mapping),
            id2label=dict(self.dialect_mapping),
            label2id={code: idx for idx, code in self.dialect_mapping.items()},
        )
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    @staticmethod
    def _eval_strategy_kwarg(args_cls):
        """Return the evaluation-schedule keyword that ``args_cls`` accepts.

        Newer transformers releases name the argument ``eval_strategy``;
        older ones only know ``evaluation_strategy``.
        """
        import inspect

        params = inspect.signature(args_cls.__init__).parameters
        return "eval_strategy" if "eval_strategy" in params else "evaluation_strategy"

    def tokenize_data(self, examples):
        """Tokenize ``examples['text']``, padding/truncating to 128 tokens."""
        return self.tokenizer(
            examples['text'],
            padding='max_length',
            truncation=True,
            max_length=128,
        )

    def prepare_dataset(self, dataset):
        """Tokenize ``dataset`` and shape it for training.

        Drops the ``text`` and ``id`` columns, renames ``label`` to
        ``labels`` and switches the output format to torch tensors.

        Returns:
            The tokenized dataset.
        """
        tokenized_dataset = dataset.map(self.tokenize_data, batched=True)
        tokenized_dataset = tokenized_dataset.remove_columns(['text', 'id'])
        tokenized_dataset = tokenized_dataset.rename_column('label', 'labels')
        tokenized_dataset.set_format('torch')
        return tokenized_dataset

    def train(self, train_dataset, eval_dataset=None, output_dir="./trained_model", num_train_epochs=3):
        """Train the model, optionally evaluate it, then save model and tokenizer.

        Args:
            train_dataset: Prepared training split.
            eval_dataset: Optional prepared evaluation split. When given,
                evaluation runs every epoch and the checkpoint with the best
                F1 is reloaded at the end.
            output_dir: Directory for checkpoints and the final model.
            num_train_epochs: Number of training epochs.
        """
        print("تهيئة معلمات التدريب...")
        has_eval = bool(eval_dataset)
        # Pick the keyword the installed transformers version understands;
        # passing the other spelling raises TypeError on some releases.
        strategy_kwargs = {
            self._eval_strategy_kwarg(TrainingArguments): "epoch" if has_eval else "no",
        }
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=32,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=100,
            save_strategy="epoch",
            load_best_model_at_end=has_eval,
            metric_for_best_model="f1" if has_eval else None,
            **strategy_kwargs,
        )
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
        )
        print("بدء التدريب...")
        trainer.train()
        if has_eval:
            print("تقييم النموذج...")
            results = trainer.evaluate()
            print(f"نتائج التقييم: {results}")
        print("حفظ النموذج...")
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
        print("تم حفظ النموذج بنجاح!")
def main():
    """Load the dialect dataset, prepare its splits and run training."""
    print("تحميل مجموعة البيانات...")
    splits = load_dataset("Abdelrahman-Rezk/Arabic_Dialect_Identification")
    dialect_trainer = ArabicDialectTrainer()

    print("تجهيز البيانات للتدريب...")
    # Tuple order keeps the train split processed before the validation split.
    prepared = {
        split_name: dialect_trainer.prepare_dataset(splits[split_name])
        for split_name in ('train', 'validation')
    }

    print("بدء عملية التدريب...")
    dialect_trainer.train(prepared['train'], prepared['validation'])
# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()