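"""Preprocess an ad-click dataset CSV into normalized train/test splits.

Usage (script filename assumed):
    python parse.py --csv-path dataset.csv
"""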
import argparse
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler



def parse(csv_path):
    print(f"Location of the file: {csv_path}")

    # Step 1: Load the dataset
    data = pd.read_csv(csv_path)

    # Create the output folder up front: the intermediate CSVs written
    # below would fail if 'data/' did not exist yet
    output_folder = "data"
    os.makedirs(output_folder, exist_ok=True)

    # Drop missing values and duplicates, snapshotting each cleaning
    # stage to a numbered file so the pipeline can be inspected
    data = data.dropna()
    data.to_csv('data/01 dropna.csv', index=False)
    data = data.drop_duplicates()
    data.to_csv('data/02 drop_duplicates.csv', index=False)
    
    # Step 2: Define the feature columns (X) and target column (y)
    feature_cols = ["DateTime", "product", "campaign_id", "webpage_id",
                    "product_category_1", "product_category_2",
                    "user_group_id", "gender", "age_level", "user_depth",
                    "city_development_index", "var_1"]
    X = data[feature_cols].copy()  # copy so later writes don't warn on a slice
    y = data["is_click"]  # Target column

    # Extract datetime features: parse once, drop rows that fail to parse,
    # then derive weekday/month/hour from the parsed column
    X['DateTime'] = pd.to_datetime(X['DateTime'], errors='coerce')
    X = X.dropna(subset=['DateTime'])
    y = y.loc[X.index]  # realign the target after dropping rows from X
    X['weekday'] = X['DateTime'].dt.weekday.astype('int8')
    X['month'] = X['DateTime'].dt.month.astype('int8')
    X['hour'] = X['DateTime'].dt.hour.astype('int8')
    X = X.drop('DateTime', axis=1)

    # Encode the product label as an integer
    le = LabelEncoder()
    X["product"] = le.fit_transform(X["product"])
    # Map gender labels to numbers; the original mapping handled the
    # abbreviated 'M', so 'F' is assumed to occur as well
    X['gender'] = X['gender'].map({'Female': 1, 'F': 1,
                                   'Male': 0, 'M': 0})
    
    
    # Normalize numerical features to [0, 1]
    # Note: fitting the scaler before the train/test split leaks test-set
    # statistics into training; kept here to preserve the pipeline's shape
    scaler = MinMaxScaler()
    numerical_features = ['campaign_id', 'webpage_id', 'user_depth',
                          'product_category_1', 'product_category_2',
                          'user_group_id', 'age_level',
                          'city_development_index', 'var_1']
    X[numerical_features] = scaler.fit_transform(X[numerical_features])
    data = pd.concat([X, y.to_frame(name="is_click")], axis=1)
    data.to_csv('data/03 normalize.csv', index=False)
    
    # Step 3: Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
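    # The fixed random_state keeps the 80/20 split reproducible; if is_click
    # turns out to be heavily imbalanced, passing stratify=y would preserve
    # the class ratio in both splits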
    
    # Step 4: Combine X and y back into dataframes for train and test
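    # (pd.concat aligns on the index, and X and y were realigned above,
    # so no missing targets can appear here)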
    train_data = pd.concat([X_train, y_train], axis=1)  # Combine features and target for training data
    test_data = pd.concat([X_test, y_test], axis=1)    # Combine features and target for testing data
    
    # Step 5: Save the train and test sets as CSV files
    # (the 'data' output folder was already created in Step 1)
    train_file_path = os.path.join(output_folder, "train.csv")
    test_file_path = os.path.join(output_folder, "test.csv")
    
    train_data.to_csv(train_file_path, index=False)
    test_data.to_csv(test_file_path, index=False)
    
    print(f"Train and test datasets saved in '{output_folder}' folder.")
    




if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Split a raw CSV into preprocessed train/test sets.")
    parser.add_argument("--csv-path", type=str, required=True,
                        help="Path to the raw dataset CSV")
    args = parser.parse_args()
    parse(args.csv_path)