|
|
|
|
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer |
|
|
from datasets import load_dataset |
|
|
import argparse |
|
|
|
|
|
def parse_args(argv=None):
    """Parse the command-line options of the fine-tuning script.

    Args:
        argv: Optional list of argument strings to parse.  When ``None``
            (the default) argparse reads the process command line, which
            keeps existing ``parse_args()`` callers working unchanged.

    Returns:
        An ``argparse.Namespace`` with the attributes ``data``,
        ``output_dir`` and ``epochs``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data",
        type=str,
        default="data_examples/sample_dataset.jsonl",
        help="jsonl with {'code','doc'}",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="./code-explain-viz-model",
        help="directory the trained model and tokenizer are written to",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=1,
        help="number of training epochs",
    )
    # Accepting an explicit argv makes the parser usable from other code and
    # from tests without touching sys.argv.
    return parser.parse_args(argv)
|
|
|
|
|
def preprocess_batch(examples, tokenizer, max_src=512, max_tgt=256):
    """Tokenize a batch of code/doc pairs for seq2seq fine-tuning.

    Every code snippet is prefixed with ``"explain: "`` and tokenized as the
    model input; the matching doc string is tokenized as the target.  Both
    sides are truncated and padded to a fixed length.

    Args:
        examples: Batch mapping with parallel ``"code"`` and ``"doc"`` lists.
        tokenizer: Callable tokenizer that accepts ``text_target=`` and
            returns a mapping containing ``"input_ids"``.  If it exposes a
            non-``None`` ``pad_token_id``, that id is masked out of the
            labels.
        max_src: Fixed token length of the encoded inputs.
        max_tgt: Fixed token length of the encoded targets.

    Returns:
        The input encoding returned by ``tokenizer`` with an added
        ``"labels"`` entry holding the target token ids, where padding
        positions are replaced by ``-100``.
    """
    prompts = ["explain: " + code for code in examples["code"]]
    model_inputs = tokenizer(
        prompts, truncation=True, padding="max_length", max_length=max_src
    )
    targets = tokenizer(
        text_target=examples["doc"],
        truncation=True,
        padding="max_length",
        max_length=max_tgt,
    )

    label_ids = targets["input_ids"]
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is not None:
        # Targets are padded to max_tgt; without masking, the loss would be
        # computed on the padding positions too.  -100 is the label value the
        # Hugging Face seq2seq loss ignores.
        label_ids = [
            [-100 if token == pad_id else token for token in sequence]
            for sequence in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs
|
|
|
|
|
def main():
    """Fine-tune the CodeT5 checkpoint on the JSONL dataset and save it.

    Reads the command-line options, loads the ``Salesforce/codet5-small``
    tokenizer and model, tokenizes the ``train`` split built from the
    ``--data`` file, runs the trainer, and writes the model and tokenizer to
    ``--output_dir``.
    """
    args = parse_args()

    checkpoint = "Salesforce/codet5-small"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    def encode(batch):
        # Bind the loaded tokenizer so the mapper only receives the batch.
        return preprocess_batch(batch, tokenizer)

    train_split = load_dataset("json", data_files={"train": args.data})["train"]
    # Drop the raw text columns so only the tokenized features remain.
    train_dataset = train_split.map(
        encode,
        batched=True,
        remove_columns=train_split.column_names,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=Seq2SeqTrainingArguments(
            output_dir=args.output_dir,
            num_train_epochs=args.epochs,
            per_device_train_batch_size=2,
            save_strategy="epoch",
            logging_steps=50,
        ),
        train_dataset=train_dataset,
    )
    trainer.train()

    # Persist model and tokenizer side by side in the output directory.
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)
    print("Saved model to", args.output_dir)
|
|
|
|
|
# Start training only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|