"""Evaluate FinanceGPT on a held-out sample and write metrics to disk."""

import json
import os

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from shared.metrics import compute_rouge, compute_bleu, factuality_score
from shared.utils import print_banner


def evaluate_model(model_path="models/financegpt"):
    print_banner("Evaluating FinanceGPT")

    # Load the fine-tuned seq2seq checkpoint and its tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

    # Evaluate on the first 50 examples of the sample dataset.
    dataset = load_dataset(
        "json",
        data_files="datasets/financegpt_sample.jsonl",
        split="train[:50]",
    )

    # Generate a prediction for each question and collect the reference answers.
    preds, refs = [], []
    for row in dataset:
        inputs = tokenizer(row["question"], return_tensors="pt", truncation=True)
        output = model.generate(**inputs, max_new_tokens=64)
        preds.append(tokenizer.decode(output[0], skip_special_tokens=True))
        refs.append(row["answer"])

    # Merge all metric dicts into a single flat results dict.
    results = {}
    results.update(compute_rouge(preds, refs))
    results.update(compute_bleu(preds, refs))
    results.update(factuality_score(preds, refs))

    # Write results next to the model checkpoint so they stay with the weights,
    # even when a non-default model_path is passed in.
    out_path = os.path.join(model_path, "eval_results.json")
    with open(out_path, "w") as f:
        json.dump(results, f, indent=2)

    print("✅ Evaluation complete:", results)


if __name__ == "__main__":
    evaluate_model()