```python
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from datasets import load_dataset

from shared.utils import load_yaml_config, ensure_dir, print_banner


def main():
    cfg = load_yaml_config("config.yaml")
    print_banner("Training FinanceGPT")

    tokenizer = AutoTokenizer.from_pretrained(cfg["base_model"])
    model = AutoModelForSeq2SeqLM.from_pretrained(cfg["base_model"])

    # LoRA configuration: only the low-rank adapter weights are trained.
    peft_config = LoraConfig(
        r=cfg["train"]["lora_r"],
        lora_alpha=cfg["train"]["lora_alpha"],
        lora_dropout=cfg["train"]["lora_dropout"],
        bias="none",
        task_type="SEQ_2_SEQ_LM",
    )
    model = get_peft_model(model, peft_config)

    dataset = load_dataset("json", data_files="datasets/financegpt_sample.jsonl", split="train")

    def preprocess(batch):
        inputs = tokenizer(batch["question"], truncation=True, padding="max_length", max_length=256)
        labels = tokenizer(batch["answer"], truncation=True, padding="max_length", max_length=256)
        # Replace padding token ids in the labels with -100 so they are ignored by the loss.
        inputs["labels"] = [
            [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
            for seq in labels["input_ids"]
        ]
        return inputs

    tokenized = dataset.map(preprocess, batched=True)

    args = TrainingArguments(
        output_dir="models/financegpt",
        per_device_train_batch_size=cfg["train"]["batch_size"],
        learning_rate=cfg["train"]["lr"],
        num_train_epochs=cfg["train"]["epochs"],
        fp16=torch.cuda.is_available(),
        save_strategy="epoch",
    )

    trainer = Trainer(model=model, args=args, train_dataset=tokenized)
    trainer.train()

    # Persist the LoRA adapter and tokenizer.
    ensure_dir("models/financegpt")
    model.save_pretrained("models/financegpt")
    tokenizer.save_pretrained("models/financegpt")
    print("✓ Model saved at models/financegpt")


if __name__ == "__main__":
    main()
```
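
The script saves only the LoRA adapter weights alongside the tokenizer, so inference needs the base model plus the saved adapter. The sketch below is one way that could look, assuming the same `config.yaml`, `shared.utils` helpers, and `models/financegpt` output path used above; the sample question is purely illustrative.

```python
# Minimal inference sketch (assumes config.yaml and models/financegpt from the training run above).
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel

from shared.utils import load_yaml_config

cfg = load_yaml_config("config.yaml")
tokenizer = AutoTokenizer.from_pretrained("models/financegpt")
base = AutoModelForSeq2SeqLM.from_pretrained(cfg["base_model"])
# Attach the saved LoRA adapter on top of the frozen base weights.
model = PeftModel.from_pretrained(base, "models/financegpt")
model.eval()

# Illustrative question only; any finance-domain prompt would do.
question = "What is the difference between a stock and a bond?"
inputs = tokenizer(question, return_tensors="pt", truncation=True, max_length=256)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```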