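# Fine-tuning bert-base-uncased on GLUE SST-2, following the Hugging Face course.
# Earlier experiments and the Trainer-based version are kept below as comments;
# the active code runs a manual PyTorch training loop.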
# import torch
# from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification
# # Same as before
# checkpoint = "bert-base-uncased"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# sequences = [
#     "I've been waiting for a HuggingFace course my whole life.",
#     "This course is amazing!",
# ]
# batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
# # This is new
# batch["labels"] = torch.tensor([1, 1])
# optimizer = AdamW(model.parameters())
# loss = model(**batch).loss
# loss.backward()
# optimizer.step()
from datasets import load_dataset
# raw_datasets = load_dataset("glue", "sst2")
# raw_datasets
# raw_train_dataset = raw_datasets["train"]
# output = raw_train_dataset[0]['sentence']
# print(output)
# raw_train_dataset = raw_datasets["validation"]
# output = raw_train_dataset[87]
# print(raw_train_dataset.features)
# from transformers import AutoTokenizer
# checkpoint = "bert-base-uncased"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# print(tokenizer(output))
# inputs = tokenizer(output)
# print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))
# inputs = tokenizer("This is the first sentence.")
# print(inputs)
# print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))
# # tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
# # tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
# # inputs = tokenizer("This is the first sentence.", "This is the second one.")
# # inputs = tokenizer.convert_ids_to_tokens(inputs["input_ids"])
# # print(inputs)
# def tokenize_function(example):
#     return tokenizer(example["sentence"], truncation=True)
# tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# print(tokenized_datasets)
# from transformers import DataCollatorWithPadding
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# samples = tokenized_datasets["train"][:8]
# samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence"]}
# print([len(x) for x in samples["input_ids"]])
# batch = data_collator(samples)
# print(batch)
# print({k: v.shape for k, v in batch.items()})
# # Try it yourself
from datasets import load_dataset
raw_datasets = load_dataset("glue", "sst2")
raw_train_dataset = raw_datasets["train"]
output = raw_train_dataset[0]['sentence']
# print(output)
from transformers import AutoTokenizer
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# print(tokenizer(output))
inputs = tokenizer(output)
# print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))
tokenized_dataset = tokenizer(
    output,
    padding=True,
    truncation=True,
)
def tokenize_function(example):
    return tokenizer(example["sentence"], truncation=True)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
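# Dataset.map with batched=True should add the tokenizer's outputs as new columns
# (for a BERT tokenizer: input_ids, token_type_ids and attention_mask) while keeping
# the original sentence, label and idx columns.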
# print(tokenized_datasets)
# from datasets import load_dataset
# from transformers import AutoTokenizer, DataCollatorWithPadding
# raw_datasets = load_dataset("glue", "mrpc")
# checkpoint = "bert-base-uncased"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# def tokenize_function(example):
#     return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
# tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# from transformers import TrainingArguments
# training_args = TrainingArguments("test-trainer")
# from transformers import AutoModelForSequenceClassification
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# from transformers import Trainer
# trainer = Trainer(
#     model,
#     training_args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["validation"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
# )
# predictions = trainer.predict(tokenized_datasets["validation"])
# print(predictions.predictions.shape, predictions.label_ids.shape)
# import numpy as np
# preds = np.argmax(predictions.predictions, axis=-1)
# import evaluate
# metric = evaluate.load("glue", "mrpc")
# metric.compute(predictions=preds, references=predictions.label_ids)
# def compute_metrics(eval_preds):
#     metric = evaluate.load("glue", "mrpc")
#     logits, labels = eval_preds
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)
# training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# trainer = Trainer(
#     model,
#     training_args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["validation"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
# )
# trainer.train()
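# The active code below does the same kind of fine-tuning without the Trainer API,
# using a plain PyTorch training loop (on SST-2 instead of MRPC).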
from transformers import AutoTokenizer, DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names
from torch.utils.data import DataLoader
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)
for batch in train_dataloader:
    break
output = {k: v.shape for k, v in batch.items()}
# print(output)
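# With dynamic padding each batch is padded to its own longest sequence, so the shapes
# are expected to look like {'labels': [8], 'input_ids': [8, seq_len], ...}, with
# seq_len varying from batch to batch.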
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
outputs = model(**batch)
# print(outputs.loss, outputs.logits.shape)
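# Because the batch contains "labels", outputs.loss is a scalar tensor and
# outputs.logits should have shape [8, 2] (batch size x number of labels).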
# transformers' AdamW is deprecated in recent versions; use the PyTorch implementation
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)
from transformers import get_scheduler
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)
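# The "linear" schedule decays the learning rate from 5e-5 to 0 over num_training_steps.
# For SST-2 (~67k training examples) with batch size 8 and 3 epochs this should come to
# roughly 25k steps, which is what the print above reports.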
# The training loop
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
# print(device)
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
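# model.train() above enables dropout during fine-tuning; model.eval() below switches
# it off for evaluation.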
# The evaluation loop
import evaluate
metric = evaluate.load("glue", "sst2")  # match the SST-2 dataset fine-tuned on above
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
metric.compute()
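# metric.compute() returns a dict of scores; for the GLUE "sst2" config this should
# contain just accuracy, e.g. {'accuracy': ...} (the exact value varies between runs).
# When running as a script, wrap the call above in print() to actually see the result,
# since compute() consumes the batches accumulated with add_batch.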