# File size: 857 Bytes
# beb5479
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
from datasets import load_dataset
import pandas as pd
import os

def load_finance_dataset(
    dataset_id: str = "Abirate/financial_phrasebank",
    split: str = "train[:100]",
    output_path: str = "datasets/financegpt_sample.jsonl",
):
    """
    Build a small demo question/answer dataset and save it as JSON Lines.

    Loads ``split`` of the Hugging Face dataset ``dataset_id``, turns every
    row into a synthetic QA pair, writes the pairs to ``output_path`` (one
    JSON object per line) and returns them.

    The loaded data must expose a ``sentence`` and a ``label`` column:
    the question is a fixed prompt followed by the sentence, and the answer
    is the label converted to a string. These pairs are placeholders for a
    demo, not real summaries. Replace ``dataset_id`` with your own dataset
    or HF dataset ID as needed.

    Args:
        dataset_id: Dataset identifier passed to ``load_dataset``.
        split: Split expression passed to ``load_dataset``; the default
            requests the first 100 rows of the ``train`` split.
        output_path: Destination ``.jsonl`` file. Its parent directory is
            created if it does not exist.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts, one per row.

    Raises:
        KeyError: If the loaded data has no ``sentence`` or ``label`` column.
    """
    dataset = load_dataset(dataset_id, split=split)
    df = pd.DataFrame(dataset)

    # Create synthetic QA pairs for demo. A fresh frame is built so the
    # loaded data is not mutated and only the two output columns exist.
    qa_df = pd.DataFrame(
        {
            "question": "Summarize this financial statement: " + df["sentence"],
            "answer": df["label"].astype(str),
        }
    )

    # A bare filename has an empty dirname; os.makedirs("") would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write straight from the frame instead of round-tripping through a
    # list of dicts and a second DataFrame.
    qa_df.to_json(output_path, orient="records", lines=True)
    print(f"✅ Saved dataset to {output_path}")
    return qa_df.to_dict(orient="records")

# Script entry point: when this file is run directly (not imported),
# build the sample dataset and write it to disk.
if __name__ == "__main__":
    load_finance_dataset()