Spaces:
Runtime error
Runtime error
| import requests | |
| import pandas as pd | |
| from io import StringIO | |
| import streamlit as st | |
| import os | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| import plotly.colors as pc | |
| import numpy as np | |
| from sklearn.metrics import mean_squared_error | |
| from statsmodels.tsa.stattools import acf | |
| from statsmodels.graphics.tsaplots import plot_acf | |
| import matplotlib.pyplot as plt | |
| def get_current_time(): | |
| now = datetime.now() | |
| current_hour = now.hour | |
| current_minute = now.minute | |
| # Return the hour and a boolean indicating if it is after the 10th minute | |
| return current_hour, current_minute >= 10 | |
| ##GET ALL FILES FROM GITHUB | |
| def load_GitHub(github_token, file_name, hour, after_10_min): | |
| url = f'https://raw.githubusercontent.com/margaridamascarenhas/Transparency_Data/main/{file_name}' | |
| headers = {'Authorization': f'token {github_token}'} | |
| response = requests.get(url, headers=headers) | |
| if response.status_code == 200: | |
| csv_content = StringIO(response.text) | |
| df = pd.read_csv(csv_content) | |
| if 'Date' in df.columns: | |
| df['Date'] = pd.to_datetime(df['Date']) # Convert 'Date' column to datetime | |
| df.set_index('Date', inplace=True) # Set 'Date' column as the index | |
| #df.to_csv(file_name) | |
| return df | |
| else: | |
| print(f"Failed to download {file_name}. Status code: {response.status_code}") | |
| return None | |
| def load_forecast(github_token, hour, after_10_min): | |
| predictions_dict = {} | |
| for hour in range(24): | |
| file_name = f'Predictions_{hour}h.csv' | |
| df = load_GitHub(github_token, file_name, hour, after_10_min) | |
| if df is not None: | |
| predictions_dict[file_name] = df | |
| return predictions_dict | |
| def convert_European_time(data, time_zone): | |
| data.index = pd.to_datetime(data.index, utc=True) | |
| data.index = data.index.tz_convert(time_zone) | |
| data.index = data.index.tz_localize(None) | |
| return data | |
| def simplify_model_names(df): | |
| # Define the mapping of complex names to simpler ones | |
| replacements = { | |
| r'\.LightGBMModel\.\dD\.TimeCov\.Temp\.Forecast_elia': '.LightGBM_with_Forecast_elia', | |
| r'\.LightGBMModel\.\dD\.TimeCov\.Temp': '.LightGBM', | |
| r'\.Naive\.\dD': '.Naive', | |
| } | |
| # Apply the replacements | |
| for original, simplified in replacements.items(): | |
| df.columns = df.columns.str.replace(original, simplified, regex=True) | |
| return df | |
| def simplify_model_names_in_index(df): | |
| # Define the mapping of complex names to simpler ones | |
| replacements = { | |
| r'\.LightGBMModel\.\dD\.TimeCov\.Temp\.Forecast_elia': '.LightGBM_with_Forecast_elia', | |
| r'\.LightGBMModel\.\dD\.TimeCov\.Temp': '.LightGBM', | |
| r'\.Naive\.\dD': '.Naive', | |
| } | |
| # Apply the replacements to the DataFrame index | |
| for original, simplified in replacements.items(): | |
| df.index = df.index.str.replace(original, simplified, regex=True) | |
| return df | |
| current_hour, after_10_min = get_current_time() | |
| github_token = st.secrets["GitHub_Token_KUL_Margarida"] | |
| if github_token: | |
| forecast_dict = load_forecast(github_token, current_hour, after_10_min) | |
| historical_forecast=load_GitHub(github_token, 'Historical_forecast.csv') | |
| Data_BE=load_GitHub(github_token, 'BE_Elia_Entsoe_UTC.csv') | |
| Data_FR=load_GitHub(github_token, 'FR_Entsoe_UTC.csv') | |
| Data_NL=load_GitHub(github_token, 'NL_Entsoe_UTC.csv') | |
| Data_DE=load_GitHub(github_token, 'DE_Entsoe_UTC.csv') | |
| Data_BE=convert_European_time(Data_BE, 'Europe/Brussels') | |
| Data_FR=convert_European_time(Data_FR, 'Europe/Paris') | |
| Data_NL=convert_European_time(Data_NL, 'Europe/Amsterdam') | |
| Data_DE=convert_European_time(Data_DE, 'Europe/Berlin') | |
| else: | |
| print("Please enter your GitHub Personal Access Token to proceed.") | |
| def conformal_predictions(data, target, my_forecast): | |
| data['Residuals'] = data[my_forecast] - data[actual_col] | |
| data['Hour'] = data.index.hour | |
| min_date = data.index.min() | |
| for date in data.index.normalize().unique(): | |
| if date >= min_date + pd.DateOffset(days=30): | |
| start_date = date - pd.DateOffset(days=30) | |
| end_date = date | |
| calculation_window = data[start_date:end_date-pd.DateOffset(hours=1)] | |
| quantiles = calculation_window.groupby('Hour')['Residuals'].quantile(0.8) | |
| # Use .loc to safely access and modify data | |
| if date in data.index: | |
| current_day_data = data.loc[date.strftime('%Y-%m-%d')] | |
| for hour in current_day_data['Hour'].unique(): | |
| if hour in quantiles.index: | |
| hour_quantile = quantiles[hour] | |
| idx = (data.index.normalize() == date) & (data.Hour == hour) | |
| data.loc[idx, 'Quantile_80'] = hour_quantile | |
| data.loc[idx, 'Lower_Interval'] = data.loc[idx, my_forecast] - hour_quantile | |
| data.loc[idx, 'Upper_Interval'] = data.loc[idx, my_forecast] + hour_quantile | |
| #data.reset_index(inplace=True) | |
| return data | |
| # Main layout of the app | |
| col1, col2 = st.columns([5, 2]) # Adjust the ratio to better fit your layout needs | |
| with col1: | |
| st.title("Transparency++") | |
| with col2: | |
| upper_space = col2.empty() | |
| upper_space = col2.empty() | |
| col2_1, col2_2 = st.columns(2) # Create two columns within the right column for side-by-side images | |
| with col2_1: | |
| st.image("KU_Leuven_logo.png", width=100) # Adjust the path and width as needed | |
| with col2_2: | |
| st.image("energyville_logo.png", width=100) | |
| upper_space.markdown(""" | |
| | |
| | |
| """, unsafe_allow_html=True) | |
| countries = { | |
| 'Netherlands': 'NL', | |
| 'Germany': 'DE', | |
| 'France': 'FR', | |
| 'Belgium': 'BE', | |
| } | |
| st.sidebar.header('Filters') | |
| st.sidebar.subheader("Select Country") | |
| st.sidebar.caption("Choose the country for which you want to display data or forecasts.") | |
| selected_country = st.sidebar.selectbox('Select Country', list(countries.keys())) | |
| st.sidebar.subheader("Select Date Range ") | |
| st.sidebar.caption("Define the time period over which the accuracy metrics will be calculated.") | |
| st.write() | |
| date_range = st.sidebar.date_input("Select Date Range for Metrics Calculation:", | |
| value=(pd.to_datetime("2024-01-01"), pd.to_datetime(pd.Timestamp('today')))) | |
| # Ensure the date range provides two dates | |
| if len(date_range) == 2: | |
| start_date = pd.Timestamp(date_range[0]) | |
| end_date = pd.Timestamp(date_range[1]) | |
| else: | |
| st.error("Please select a valid date range.") | |
| st.stop() | |
| st.sidebar.subheader("Section") | |
| st.sidebar.caption("Select the type of information you want to explore.") | |
| # Sidebar with radio buttons for different sections | |
| section = st.sidebar.radio('', ['Data', 'Forecasts', 'Insights'],index=1) | |
| country_code = countries[selected_country] | |
| if country_code == 'BE': | |
| data = Data_BE | |
| weather_columns = ['Temperature', 'Wind Speed Onshore', 'Wind Speed Offshore'] | |
| data['Temperature'] = data['temperature_2m_8'] | |
| data['Wind Speed Offshore'] = data['wind_speed_100m_4'] | |
| data['Wind Speed Onshore'] = data['wind_speed_100m_8'] | |
| elif country_code == 'DE': | |
| data = Data_DE | |
| weather_columns = ['Temperature', 'Wind Speed'] | |
| data['Temperature'] = data['temperature_2m'] | |
| data['Wind Speed'] = data['wind_speed_100m'] | |
| elif country_code == 'NL': | |
| data = Data_NL | |
| weather_columns = ['Temperature', 'Wind Speed'] | |
| data['Temperature'] = data['temperature_2m'] | |
| data['Wind Speed'] = data['wind_speed_100m'] | |
| elif country_code == 'FR': | |
| data = Data_FR | |
| weather_columns = ['Temperature', 'Wind Speed'] | |
| data['Temperature'] = data['temperature_2m'] | |
| data['Wind Speed'] = data['wind_speed_100m'] | |
| def add_feature(df2, df_main): | |
| #df_main.index = pd.to_datetime(df_main.index) | |
| #df2.index = pd.to_datetime(df2.index) | |
| df_combined = df_main.combine_first(df2) | |
| last_date_df1 = df_main.index.max() | |
| first_date_df2 = df2.index.min() | |
| if first_date_df2 == last_date_df1 + pd.Timedelta(hours=1): | |
| df_combined = pd.concat([df_main, df2[df2.index > last_date_df1]], axis=0) | |
| #df_combined.reset_index(inplace=True) | |
| return df_combined | |
| #data.index = data.index.tz_localize('UTC') | |
| forecast_columns = [ | |
| 'Load_entsoe','Load_forecast_entsoe','Wind_onshore_entsoe','Wind_onshore_forecast_entsoe','Wind_offshore_entsoe','Wind_offshore_forecast_entsoe','Solar_entsoe','Solar_forecast_entsoe'] | |
| if section == 'Data': | |
| st.header("Data") | |
| st.write(""" | |
| This section allows you to explore and upload your datasets. | |
| You can visualize raw data, clean it, and prepare it for analysis. | |
| """) | |
| st.header('Data Quality') | |
| st.write('The table below presents the data quality metrics for various energy-related datasets, focusing on the percentage of missing values and the occurrence of extreme or nonsensical values for the selected country.') | |
| data_quality=data.iloc[:-28] | |
| if country_code=='BE': | |
| data_quality=data.iloc[:-5*24] | |
| print(data_quality.tail(48)) | |
| # Report % of missing values | |
| missing_values = data_quality[forecast_columns].isna().mean() * 100 | |
| missing_values = missing_values.round(2) | |
| installed_capacities = { | |
| 'FR': { 'Solar': 17419, 'Wind Offshore': 1483, 'Wind Onshore': 22134}, | |
| 'DE': { 'Solar': 73821, 'Wind Offshore': 8386, 'Wind Onshore': 59915}, | |
| 'BE': { 'Solar': 8789, 'Wind Offshore': 2262, 'Wind Onshore': 3053}, | |
| 'NL': { 'Solar': 22590, 'Wind Offshore': 3220, 'Wind Onshore': 6190}, | |
| } | |
| if country_code not in installed_capacities: | |
| st.error(f"Installed capacities not defined for country code '{country_code}'.") | |
| st.stop() | |
| # Report % of extreme, impossible values for the selected country | |
| capacities = installed_capacities[country_code] | |
| extreme_values = {} | |
| for col in forecast_columns: | |
| if 'Solar_entsoe' in col: | |
| extreme_values[col] = ((data_quality[col] < 0) | (data_quality[col] > capacities['Solar'])).mean() * 100 | |
| elif 'Solar_forecast_entsoe' in col: | |
| extreme_values[col] = ((data_quality[col] < 0) | (data_quality[col] > capacities['Solar'])).mean() * 100 | |
| elif 'Wind_onshore_entsoe' in col: | |
| extreme_values[col] = ((data_quality[col] < 0) | (data_quality[col] > capacities['Wind Onshore'])).mean() * 100 | |
| elif 'Wind_onshore_forecast_entsoe' in col: | |
| extreme_values[col] = ((data_quality[col] < 0) | (data_quality[col] > capacities['Wind Onshore'])).mean() * 100 | |
| elif 'Wind_offshore_entsoe' in col: | |
| extreme_values[col] = ((data_quality[col] < 0) | (data_quality[col] > capacities['Wind Offshore'])).mean() * 100 | |
| elif 'Wind_offshore_forecast_entsoe' in col: | |
| extreme_values[col] = ((data_quality[col] < 0) | (data_quality[col] > capacities['Wind Offshore'])).mean() * 100 | |
| elif 'Load_entsoe' in col: | |
| extreme_values[col] = ((data_quality[col] < 0)).mean() * 100 | |
| elif 'Load_forecast_entsoe' in col: | |
| extreme_values[col] = ((data_quality[col] < 0)).mean() * 100 | |
| extreme_values = pd.Series(extreme_values).round(2) | |
| # Combine all metrics into one DataFrame | |
| metrics_df = pd.DataFrame({ | |
| 'Missing Values (%)': missing_values, | |
| 'Extreme/Nonsensical Values (%)': extreme_values, | |
| }) | |
| st.markdown( | |
| """ | |
| <style> | |
| .dataframe {font-size: 45px !important;} | |
| </style> | |
| """, | |
| unsafe_allow_html=True | |
| ) | |
| st.dataframe(metrics_df) | |
| st.write('<b><u>Missing values (%)</u></b>: Percentage of missing values in the dataset', unsafe_allow_html=True) | |
| st.write('<b><u>Extreme/Nonsensical values (%)</u></b>: Values that are considered implausible such as negative or out-of-bound values i.e., (generation<0) or (generation>capacity)', unsafe_allow_html=True) | |
| # Section 2: Forecasts | |
| elif section == 'Forecasts': | |
| st.header('Forecast Quality') | |
| # Time series for last 1 week | |
| st.subheader('Time Series: Last 1 Week') | |
| last_week = data.loc[data.index >= (data.index[-1] - pd.Timedelta(days=7))] | |
| st.write('The below plots show the time series of forecasts vs. observations provided by the ENTSO-E Transparency platform between the selected data range.') | |
| forecast_columns = [ | |
| 'Load_entsoe','Load_forecast_entsoe','Wind_onshore_entsoe','Wind_onshore_forecast_entsoe','Wind_offshore_entsoe','Wind_offshore_forecast_entsoe','Solar_entsoe','Solar_forecast_entsoe'] | |
| num_per_var=2 | |
| if country_code=='BE': | |
| operation_forecast_load=forecast_dict['Predictions_10h.csv'].filter(like='Load_', axis=1) | |
| operation_forecast_res=forecast_dict['Predictions_17h.csv'].filter(regex='^(?!Load_)') | |
| operation_forecast_load.columns = [col.replace('_entsoe.', '_').replace('Naive.7D', 'WeeklyNaiveSeasonal') for col in operation_forecast_load.columns] | |
| operation_forecast_res.columns = [col.replace('_entsoe.', '_').replace('Naive.1D', 'DailyNaiveSeasonal') for col in operation_forecast_res.columns] | |
| Historical_and_Load=add_feature(operation_forecast_load, historical_forecast) | |
| Historical_and_operational=add_feature(operation_forecast_res, Historical_and_Load) | |
| best_forecast = Historical_and_operational.filter(like='Forecast_elia', axis=1) | |
| df_combined = Historical_and_operational.join(Data_BE, how='inner') | |
| last_week_best_forecast = best_forecast.loc[best_forecast.index >= (best_forecast.index[-24] - pd.Timedelta(days=7))] | |
| num_per_var=3 | |
| forecast_columns_line=['Load_entsoe','Load_forecast_entsoe', 'Load_LightGBMModel.7D.TimeCov.Temp.Forecast_elia', 'Wind_onshore_entsoe','Wind_onshore_forecast_entsoe','Wind_onshore_LightGBMModel.1D.TimeCov.Temp.Forecast_elia','Wind_offshore_entsoe','Wind_offshore_forecast_entsoe','Wind_offshore_LightGBMModel.1D.TimeCov.Temp.Forecast_elia','Solar_entsoe','Solar_forecast_entsoe', 'Solar_LightGBMModel.1D.TimeCov.Temp.Forecast_elia'] | |
| else: | |
| forecast_columns_line=forecast_columns | |
| for i in range(0, len(forecast_columns_line), num_per_var): | |
| actual_col = forecast_columns_line[i] | |
| forecast_col = forecast_columns_line[i + 1] | |
| if country_code=='BE': | |
| my_forecast = forecast_columns_line[i + 2] | |
| if forecast_col in data.columns: | |
| fig = go.Figure() | |
| fig.add_trace(go.Scatter(x=last_week.index, y=last_week[actual_col], mode='lines', name='Actual')) | |
| fig.add_trace(go.Scatter(x=last_week.index, y=last_week[forecast_col], mode='lines', name='Forecast ENTSO-E')) | |
| if country_code=='BE': | |
| conformal=conformal_predictions(df_combined, actual_col, my_forecast) | |
| last_week_conformal = conformal.loc[conformal.index >= (conformal.index[-24] - pd.Timedelta(days=7))] | |
| if actual_col =='Load_entsoe': | |
| last_week_conformal = conformal.loc[conformal.index >= (conformal.index[-24] - pd.Timedelta(days=5))] | |
| fig.add_trace(go.Scatter(x=last_week_best_forecast.index, y=last_week_best_forecast[my_forecast], mode='lines', name='Forecast EDS')) | |
| fig.add_trace(go.Scatter( | |
| x=last_week_conformal.index, | |
| y=last_week_conformal['Lower_Interval'], | |
| mode='lines', | |
| line=dict(width=0), | |
| showlegend=False | |
| )) | |
| # Add the upper interval trace and fill to the lower interval | |
| fig.add_trace(go.Scatter( | |
| x=last_week_conformal.index, | |
| y=last_week_conformal['Upper_Interval'], | |
| mode='lines', | |
| line=dict(width=0), | |
| fill='tonexty', # Fill between this trace and the previous one | |
| fillcolor='rgba(68, 68, 68, 0.3)', | |
| name='P10/P90 prediction intervals' | |
| )) | |
| fig.update_layout(title=f'Forecasts vs Actual for {actual_col}', xaxis_title='Date', yaxis_title='Value [MW]') | |
| st.plotly_chart(fig) | |
| def plot_category(df_dict, category_prefix, title): | |
| fig = go.Figure() | |
| # Define base colors for each model | |
| model_colors = { | |
| 'LightGBMModel.TimeCov.Temp.Forecast_elia': '#1f77b4', # Blue | |
| 'LightGBMModel.TimeCov.Temp': '#2ca02c', # Green | |
| 'Naive': '#ff7f0e' # Orange | |
| } | |
| # To keep track of which model has been added to the legend | |
| legend_added = {'LightGBMModel.TimeCov.Temp.Forecast_elia': False, 'LightGBMModel.TimeCov.Temp': False, 'Naive': False} | |
| for file_name, df in df_dict.items(): | |
| # Extract the hour from the filename, assuming the format is "Predictions_Xh.csv" | |
| hour = int(file_name.split('_')[1].replace('h.csv', '')) | |
| filtered_columns = [col for col in df.columns if col.startswith(category_prefix)] | |
| for column in filtered_columns: | |
| # Identify the model type with more precise logic | |
| if 'LightGBMModel' in column: | |
| if 'Forecast_elia' in column: | |
| model_key = 'LightGBMModel.TimeCov.Temp.Forecast_elia' | |
| elif 'TimeCov' in column: | |
| model_key = 'LightGBMModel.TimeCov.Temp' | |
| elif 'Naive' in column: | |
| model_key = 'Naive' | |
| else: | |
| continue # Skip if it doesn't match any model type | |
| # Extract the relevant part of the model name | |
| parts = column.split('.') | |
| model_name_parts = parts[1:] # Skip the variable prefix | |
| model_name = '.'.join(model_name_parts) # Rejoin the parts to form the model name | |
| # Get the base color for the model | |
| base_color = model_colors[model_key] | |
| # Calculate the color shade based on the hour | |
| color_scale = pc.hex_to_rgb(base_color) | |
| scale_factor = 0.3 + (hour / 40) # Adjust scale to ensure the gradient is visible | |
| adjusted_color = tuple(int(c * scale_factor) for c in color_scale) | |
| # Convert to RGBA with transparency for plot lines | |
| line_color = f'rgba({adjusted_color[0]}, {adjusted_color[1]}, {adjusted_color[2]}, 0.1)' # Transparent color for lines | |
| # Combine the hour and the model name for the legend, but only add the legend entry once | |
| show_legend = not legend_added[model_key] | |
| fig.add_trace(go.Scatter( | |
| x=df.index, # Assuming 'Date' is the index, use 'df.index' for x-axis | |
| y=df[column], | |
| mode='lines', | |
| name=model_name if show_legend else None, # Use the model name for the legend, but only once | |
| line=dict(color=base_color if show_legend else line_color), # Use opaque color for legend, transparent for lines | |
| showlegend=show_legend, # Show legend only once per model | |
| legendgroup=model_key # Grouping for consistent legend color | |
| )) | |
| # Mark that this model has been added to the legend | |
| if show_legend: | |
| legend_added[model_key] = True | |
| # Add real values as a separate trace, if provided | |
| filtered_Data_BE_df = Data_BE.loc[df.index] | |
| if filtered_Data_BE_df[f'{category_prefix}_entsoe'].notna().any(): | |
| fig.add_trace(go.Scatter( | |
| x=filtered_Data_BE_df.index, | |
| y=filtered_Data_BE_df[f'{category_prefix}_entsoe'], | |
| mode='lines', | |
| name=f'Actual {category_prefix}', | |
| line=dict(color='black', width=2), # Black line for real values | |
| showlegend=True # Always show this in the legend | |
| )) | |
| # Update layout to position the legend at the top, side by side | |
| fig.update_layout( | |
| title=dict( | |
| text=title, | |
| x=0, # Center the title horizontally | |
| y=1.00, # Slightly lower the title to create more space | |
| xanchor='left', | |
| yanchor='top' | |
| ), | |
| xaxis_title='Date', | |
| yaxis_title='Value', | |
| legend=dict( | |
| orientation="h", # Horizontal legend | |
| yanchor="bottom", # Align to the bottom of the legend box | |
| y=1, # Increase y position to avoid overlap with the title | |
| xanchor="center", # Center the legend horizontally | |
| x=0.5 # Position at the center of the plot | |
| ) | |
| ) | |
| return fig | |
| def calculate_mae(y_true, y_pred): | |
| return np.mean(np.abs(y_true - y_pred)) | |
| def plot_mae_comparison(df_dict, category_prefix, title, real_values_df): | |
| hours = list(range(24)) | |
| if category_prefix=='Load': | |
| model_colors = { | |
| 'LightGBMModel.7D.TimeCov.Temp.Forecast_elia': '#1F77B4', # Blue | |
| 'LightGBMModel.7D.TimeCov.Temp': '#2CA02C', # Green | |
| 'Naive': '#FF7F0E' # Orange | |
| } | |
| else: | |
| model_colors = { | |
| 'LightGBMModel.1D.TimeCov.Temp.Forecast_elia': '#1F77B4', # Blue | |
| 'LightGBMModel.1D.TimeCov.Temp': '#2CA02C', # Green | |
| 'Naive': '#FF7F0E' # Orange | |
| } | |
| fig = go.Figure() | |
| for model_key, base_color in model_colors.items(): | |
| hours_with_data = [] | |
| mae_ratios = [] | |
| for hour in hours: | |
| file_name = f'Predictions_{hour}h.csv' | |
| df = df_dict.get(file_name, None) | |
| if df is None: | |
| continue | |
| if isinstance(df.index, pd.DatetimeIndex): | |
| first_day = df.index.min().normalize() | |
| last_day = df.index.max().normalize() | |
| df = df[df.index.normalize() != first_day] | |
| df = df[df.index.normalize() != last_day] | |
| # Adjusted filtering logic based on actual column names | |
| filtered_columns = [col for col in df.columns if col.startswith(f"{category_prefix}_entsoe") and model_key in col] | |
| if not filtered_columns: | |
| continue | |
| # Assuming only one column matches, otherwise refine the selection logic | |
| model_predictions = df[filtered_columns[0]] | |
| actual_values = real_values_df[f'{category_prefix}_entsoe'] | |
| actual_values = actual_values.dropna() | |
| # Align both series by their common indices | |
| common_indices = model_predictions.index.intersection(actual_values.index) | |
| aligned_model_predictions = model_predictions.loc[common_indices] | |
| aligned_actual_values = actual_values.loc[common_indices] | |
| # Calculate MAE for the model | |
| model_mae = calculate_mae(aligned_actual_values, aligned_model_predictions) | |
| # Calculate MAE for the entsoe forecast | |
| entsoe_forecast = real_values_df[f'{category_prefix}_forecast_entsoe'].loc[common_indices] | |
| entsoe_mae = calculate_mae(aligned_actual_values, entsoe_forecast) | |
| # Calculate MAE ratio | |
| mae_ratio = model_mae / entsoe_mae | |
| mae_ratios.append(mae_ratio) | |
| hours_with_data.append(hour) | |
| # Plot the MAE ratio for this model as points | |
| if mae_ratios: # Only plot if there's data | |
| fig.add_trace(go.Scatter( | |
| x=hours_with_data, # The hours where we have data | |
| y=mae_ratios, | |
| mode='markers+lines', # Plot as points connected by lines | |
| name=model_key, | |
| line=dict(color=base_color), | |
| marker=dict(color=base_color, size=8) # Customize marker size | |
| )) | |
| # Update layout | |
| fig.update_layout( | |
| title=f'{category_prefix}: rMAE<span style="font-size:11px;">ENTSO-E</span> by hour of Forecasting.', | |
| xaxis_title='Hour of Forecast', | |
| yaxis_title='MAE Ratio (Model / entsoe)', | |
| legend=dict( | |
| orientation="h", | |
| yanchor="bottom", | |
| y=1.02, | |
| xanchor="center", | |
| x=0.5 | |
| ) | |
| ) | |
| return fig | |
| def plot_mae_comparison_clock(df_dict, category_prefix, title, real_values_df): | |
| hours = list(range(24)) | |
| if category_prefix=='Load': | |
| model_colors = { | |
| 'LightGBM_with_Forecast_elia': '#1F77B4', # Blue | |
| 'LightGBM': '#2CA02C', # Green | |
| 'Naive': '#FF7F0E' # Orange | |
| } | |
| else: | |
| model_colors = { | |
| 'LightGBM_with_Forecast_elia': '#1F77B4', # Blue | |
| 'LightGBM': '#2CA02C', # Green | |
| 'Naive': '#FF7F0E' # Orange | |
| } | |
| fig = go.Figure() | |
| for model_key, base_color in model_colors.items(): | |
| hours_with_data = [] | |
| mae_ratios = [] | |
| for hour in hours: | |
| file_name = f'Predictions_{hour}h.csv' | |
| df = df_dict.get(file_name, None) | |
| if df is None: | |
| continue | |
| if isinstance(df.index, pd.DatetimeIndex): | |
| first_day = df.index.min().normalize() | |
| last_day = df.index.max().normalize() | |
| df = df[df.index.normalize() != first_day] | |
| df = df[df.index.normalize() != last_day] | |
| filtered_columns = [col for col in df.columns if col.startswith(f"{category_prefix}_entsoe") and model_key in col] | |
| if not filtered_columns: | |
| print(f"No matching columns for {model_key} at hour {hour}. Skipping...") | |
| continue | |
| model_predictions = df[filtered_columns[0]] | |
| actual_values = real_values_df[f'{category_prefix}_entsoe'] | |
| actual_values = actual_values.dropna() | |
| common_indices = model_predictions.index.intersection(actual_values.index) | |
| aligned_model_predictions = model_predictions.loc[common_indices] | |
| aligned_actual_values = actual_values.loc[common_indices] | |
| model_mae = calculate_mae(aligned_actual_values, aligned_model_predictions) | |
| entsoe_forecast = real_values_df[f'{category_prefix}_forecast_entsoe'].loc[common_indices] | |
| entsoe_mae = calculate_mae(aligned_actual_values, entsoe_forecast) | |
| mae_ratio = model_mae / entsoe_mae | |
| mae_ratios.append(mae_ratio) | |
| hours_with_data.append(hour) | |
| if mae_ratios: | |
| fig.add_trace(go.Scatterpolar( | |
| r=mae_ratios + [mae_ratios[0]], # Ensure closure of the polar plot | |
| theta=[h * 15 for h in hours_with_data] + [0], # Ensure closure at 0 degrees | |
| mode='lines+markers', | |
| name=model_key, | |
| line=dict(color=base_color), | |
| marker=dict(color=base_color, size=8) | |
| )) | |
| else: | |
| print(f"No data to plot for {model_key}.") # Debugging print | |
| fig.update_layout( | |
| polar=dict( | |
| radialaxis=dict(visible=True, range=[0, max(max(mae_ratios), 1.0) * 1.1] if mae_ratios else [0, 1.0]), | |
| angularaxis=dict(tickmode='array', tickvals=[h * 15 for h in hours], ticktext=hours) | |
| ), | |
| title=f'{category_prefix}: rMAE<span style="font-size:11px;">ENTSO-E</span> by Hour of Forecasting', | |
| showlegend=True | |
| ) | |
| return fig | |
| if country_code == "BE": | |
| st.header('MAE Ratio Comparison by Forecast Hour') | |
| st.write("These clock-plots shows the relative Mean Absolute Error (rMAE) of different forecasting models compared to the ENTSO-E forecast, by the hour at which the forecast was made. " | |
| "The rMAE is calculated as the ratio of the model's MAE to the ENTSO-E forecast's MAE.") | |
| forecast_dict2 = forecast_dict.copy() | |
| forecast_dict2 = {k: simplify_model_names(v) for k, v in forecast_dict.items()} | |
| mae_comparison_fig = plot_mae_comparison_clock(forecast_dict2, 'Solar', 'rMAE Ratio Comparison for Solar', real_values_df=Data_BE) | |
| st.plotly_chart(mae_comparison_fig) | |
| mae_comparison_fig_wind_onshore = plot_mae_comparison_clock(forecast_dict2, 'Wind_onshore', 'MAE Ratio Comparison for Wind Onshore', real_values_df=Data_BE) | |
| st.plotly_chart(mae_comparison_fig_wind_onshore) | |
| mae_comparison_fig_wind_offshore = plot_mae_comparison_clock(forecast_dict2, 'Wind_offshore', 'MAE Ratio Comparison for Wind Offshore', real_values_df=Data_BE) | |
| st.plotly_chart(mae_comparison_fig_wind_offshore) | |
| mae_comparison_fig_load = plot_mae_comparison_clock(forecast_dict2, 'Load', 'MAE Ratio Comparison for Load', real_values_df=Data_BE) | |
| st.plotly_chart(mae_comparison_fig_load) | |
| # Scatter plots for error distribution | |
| st.subheader('Error Distribution') | |
| st.write('The below scatter plots show the error distribution of all three fields: Solar, Wind and Load between the selected date range') | |
| data_2024 = data[data.index.year > 2023] | |
| for i in range(0, len(forecast_columns), 2): | |
| actual_col = forecast_columns[i] | |
| forecast_col = forecast_columns[i + 1] | |
| if forecast_col in data_2024.columns: | |
| obs = data_2024[actual_col] | |
| pred = data_2024[forecast_col] | |
| error = pred - obs | |
| fig = px.scatter(x=obs, y=pred, labels={'x': 'Observed [MW]', 'y': 'Predicted by ENTSO-E [MW]'}) | |
| fig.update_layout(title=f'Error Distribution for {forecast_col}') | |
| st.plotly_chart(fig) | |
| st.subheader('Accuracy Metrics (Sorted by rMAE):') | |
| output_text = f"The below metrics are calculated from the selected date range from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}. This interval can be adjusted from the sidebar." | |
| st.write(output_text) | |
| if country_code == "BE": | |
| # Combine the two DataFrames on their index | |
| df_combined = Historical_and_operational.join(Data_BE, how='inner') | |
| # List of model columns from historical_forecast | |
| model_columns = historical_forecast.columns | |
| # Initialize dictionaries to store MAE and RMSE results for each variable | |
| results_wind_onshore = {} | |
| results_wind_offshore = {} | |
| results_load = {} | |
| results_solar = {} | |
| # Mapping of variables to their corresponding naive models | |
| naive_models = { | |
| 'Wind_onshore': 'Wind_onshore_DailyNaiveSeasonal', | |
| 'Wind_offshore': 'Wind_offshore_DailyNaiveSeasonal', | |
| 'Load': 'Load_WeeklyNaiveSeasonal', | |
| 'Solar': 'Solar_DailyNaiveSeasonal' | |
| } | |
| # Step 1: Calculate MAE, RMSE, and rMAE for each model | |
| for col in model_columns: | |
| # Extract the variable name by taking everything before the first underscore | |
| base_variable = col.split('_')[0] | |
| # Handle cases where variable names might be combined with multiple parts (e.g., "Load_LightGBMModel...") | |
| if base_variable in ['Wind', 'Load', 'Solar']: | |
| if 'onshore' in col: | |
| variable_name = 'Wind_onshore' | |
| results_dict = results_wind_onshore | |
| elif 'offshore' in col: | |
| variable_name = 'Wind_offshore' | |
| results_dict = results_wind_offshore | |
| else: | |
| variable_name = base_variable | |
| results_dict = results_load if base_variable == 'Load' else results_solar | |
| else: | |
| variable_name = base_variable | |
| # Construct the corresponding `variable_entsoe` column name | |
| entsoe_column = f'{variable_name}_entsoe' | |
| naive_model_col = naive_models.get(variable_name, None) | |
| # Drop NaNs for the specific pair of columns before calculating MAE and RMSE | |
| if entsoe_column in df_combined.columns and naive_model_col in df_combined.columns: | |
| valid_data = df_combined[[col, entsoe_column]].dropna() | |
| valid_naive_data = df_combined[[entsoe_column, naive_model_col]].dropna() | |
| # Calculate MAE and RMSE for the model against the `variable_entsoe` | |
| mae = np.mean(abs(valid_data[col] - valid_data[entsoe_column])) | |
| rmse = np.sqrt(mean_squared_error(valid_data[col], valid_data[entsoe_column])) | |
| # Calculate MAE for the Naive model | |
| mae_naive = np.mean(abs(valid_naive_data[entsoe_column] - valid_naive_data[naive_model_col])) | |
| # Calculate rMAE for the model | |
| rMAE = mae / mae_naive if mae_naive != 0 else np.inf | |
| # Store the results in the corresponding dictionary | |
| results_dict[f'{col}'] = {'MAE': mae, 'RMSE': rmse, 'rMAE': rMAE} | |
| # Step 2: Calculate MAE, RMSE, and rMAE for ENTSO-E forecasts specifically | |
| for variable_name in naive_models.keys(): | |
| entsoe_column = f'{variable_name}_entsoe' | |
| forecast_entsoe_column = f'{variable_name}_forecast_entsoe' | |
| naive_model_col = naive_models[variable_name] | |
| # Ensure that the ENTSO-E forecast is included in the results | |
| if forecast_entsoe_column in df_combined.columns: | |
| valid_data = df_combined[[forecast_entsoe_column, entsoe_column]].dropna() | |
| valid_naive_data = df_combined[[entsoe_column, naive_model_col]].dropna() | |
| # Calculate MAE and RMSE for the ENTSO-E forecast against the actuals | |
| mae_entsoe = np.mean(abs(valid_data[forecast_entsoe_column] - valid_data[entsoe_column])) | |
| rmse_entsoe = np.sqrt(mean_squared_error(valid_data[forecast_entsoe_column], valid_data[entsoe_column])) | |
| # Calculate rMAE for the ENTSO-E forecast | |
| mae_naive = np.mean(abs(valid_naive_data[entsoe_column] - valid_naive_data[naive_model_col])) | |
| rMAE_entsoe = mae_entsoe / mae_naive if mae_naive != 0 else np.inf | |
| # Add the ENTSO-E results to the corresponding dictionary | |
| if variable_name == 'Wind_onshore': | |
| results_wind_onshore[forecast_entsoe_column] = {'MAE': mae_entsoe, 'RMSE': rmse_entsoe, 'rMAE': rMAE_entsoe} | |
| elif variable_name == 'Wind_offshore': | |
| results_wind_offshore[forecast_entsoe_column] = {'MAE': mae_entsoe, 'RMSE': rmse_entsoe, 'rMAE': rMAE_entsoe} | |
| elif variable_name == 'Load': | |
| results_load[forecast_entsoe_column] = {'MAE': mae_entsoe, 'RMSE': rmse_entsoe, 'rMAE': rMAE_entsoe} | |
| elif variable_name == 'Solar': | |
| results_solar[forecast_entsoe_column] = {'MAE': mae_entsoe, 'RMSE': rmse_entsoe, 'rMAE': rMAE_entsoe} | |
| # Convert the dictionaries to DataFrames and sort by rMAE | |
| df_wind_onshore = pd.DataFrame.from_dict(results_wind_onshore, orient='index').sort_values(by='rMAE') | |
| df_wind_offshore = pd.DataFrame.from_dict(results_wind_offshore, orient='index').sort_values(by='rMAE') | |
| df_load = pd.DataFrame.from_dict(results_load, orient='index').sort_values(by='rMAE') | |
| df_solar = pd.DataFrame.from_dict(results_solar, orient='index').sort_values(by='rMAE') | |
| st.write("##### Wind Onshore:") | |
| df_wind_onshore = simplify_model_names_in_index(df_wind_onshore) | |
| st.dataframe(df_wind_onshore) | |
| st.write("##### Wind Offshore:") | |
| df_wind_offshore2 = simplify_model_names_in_index(df_wind_offshore) | |
| st.dataframe(df_wind_offshore) | |
| st.write("##### Load:") | |
| df_load = simplify_model_names_in_index(df_load) | |
| st.dataframe(df_load) | |
| st.write("##### Solar:") | |
| df_solar = simplify_model_names_in_index(df_solar) | |
| st.dataframe(df_solar) | |
| else: | |
| data = data.loc[start_date:end_date] | |
| accuracy_metrics = pd.DataFrame(columns=['MAE', 'rMAE'], index=['Load', 'Solar', 'Wind Onshore', 'Wind Offshore']) | |
| for i in range(0, len(forecast_columns), 2): | |
| actual_col = forecast_columns[i] | |
| forecast_col = forecast_columns[i + 1] | |
| if forecast_col in data.columns: | |
| obs = data[actual_col] | |
| pred = data[forecast_col] | |
| error = pred - obs | |
| mae = round(np.mean(np.abs(error)),2) | |
| if 'Load' in actual_col: | |
| persistence = obs.shift(168) # Weekly persistence | |
| else: | |
| persistence = obs.shift(24) # Daily persistence | |
| # Using the whole year's data for rMAE calculations | |
| rmae = round(mae / np.mean(np.abs(obs - persistence)),2) | |
| row_label = 'Load' if 'Load' in actual_col else 'Solar' if 'Solar' in actual_col else 'Wind Offshore' if 'Wind_offshore' in actual_col else 'Wind Onshore' | |
| accuracy_metrics.loc[row_label] = [mae, rmae] | |
| accuracy_metrics.dropna(how='all', inplace=True)# Sort by rMAE (second column) | |
| accuracy_metrics.sort_values(by=accuracy_metrics.columns[1], ascending=True, inplace=True) | |
| accuracy_metrics = accuracy_metrics.round(4) | |
| col1, col2 = st.columns([3, 2]) | |
| with col1: | |
| st.dataframe(accuracy_metrics) | |
| with col2: | |
| st.markdown(""" | |
| <style> | |
| .big-font { | |
| font-size: 20px; | |
| font-weight: 500; | |
| } | |
| </style> | |
| <div class="big-font"> | |
| Equations | |
| </div> | |
| """, unsafe_allow_html=True) | |
| st.markdown(r""" | |
| $\text{MAE} = \frac{1}{n}\sum_{i=1}^{n}|y_i - \hat{y}_i|$ | |
| $\text{rMAE} = \frac{\text{MAE}}{MAE_{\text{Persistence Model}}}$ | |
| """) | |
| st.subheader('ACF plots of Errors') | |
| st.write('The below plots show the ACF (Auto-Correlation Function) for the errors of all three data fields obtained from ENTSO-E: Solar, Wind and Load.') | |
| for i in range(0, len(forecast_columns), 2): | |
| actual_col = forecast_columns[i] | |
| forecast_col = forecast_columns[i + 1] | |
| if forecast_col in data.columns: | |
| obs = data[actual_col] | |
| pred = data[forecast_col] | |
| error = pred - obs | |
| st.write(f"**ACF of Errors for {actual_col}**") | |
| fig, ax = plt.subplots(figsize=(10, 5)) | |
| plot_acf(error.dropna(), ax=ax) | |
| st.pyplot(fig) | |
| acf_values = acf(error.dropna(), nlags=240) | |
| # Section 3: Insights | |
| elif section == 'Insights': | |
| st.header("Insights") | |
| st.write(""" | |
| This section provides insights derived from the data and forecasts. | |
| You can visualize trends, anomalies, and other important findings. | |
| """) | |
| # Scatter plots for correlation between wind, solar, and load | |
| st.subheader('Correlation between Wind, Solar, and Load') | |
| st.write('The below scatter plots are made for checking whether there exists a correlation between all three data fields obtained from ENTSO-E: Solar, Wind and Load.') | |
| combinations = [('Solar_entsoe', 'Load_entsoe'), ('Wind_onshore_entsoe', 'Load_entsoe'), ('Wind_offshore_entsoe', 'Load_entsoe'), ('Solar_entsoe', 'Wind_onshore_entsoe'), ('Solar_entsoe', 'Wind_offshore_entsoe')] | |
| for x_col, y_col in combinations: | |
| if x_col in data.columns and y_col in data.columns: | |
| # For solar combinations, filter out zero values | |
| if 'Solar_entsoe' in x_col: | |
| filtered_data = data[data['Solar_entsoe'] > 0] | |
| x_values = filtered_data[x_col] | |
| y_values = filtered_data[y_col] | |
| else: | |
| x_values = data[x_col] | |
| y_values = data[y_col] | |
| corr_coef = x_values.corr(y_values) | |
| fig = px.scatter( | |
| x=x_values, | |
| y=y_values, | |
| labels={'x': f'{x_col} [MW]', 'y': f'{y_col} [MW]'}, | |
| title=f'{x_col} vs {y_col} (Correlation: {corr_coef:.2f})', color_discrete_sequence=['grey']) | |
| st.plotly_chart(fig) | |
| st.subheader('Weather vs. Generation/Demand') | |
| st.write('The below scatter plots show the relation between weather parameters (i.e., Temperature, Wind Speed) and the generation/demand data from ENTSO-E.') | |
| for weather_col in weather_columns: | |
| for actual_col in ['Load_entsoe', 'Solar_entsoe', 'Wind_onshore_entsoe', 'Wind_offshore_entsoe']: | |
| if weather_col in data.columns and actual_col in data.columns: | |
| clean_label = actual_col.replace('_entsoe', '') | |
| if weather_col == 'Temperature': | |
| fig = px.scatter(x=data[weather_col], y=data[actual_col], labels={'x': f'{weather_col} (°C)', 'y': f'{clean_label} Generation [MW]'}, color_discrete_sequence=['orange']) | |
| else: | |
| fig = px.scatter(x=data[weather_col], y=data[actual_col], labels={'x': f'{weather_col} (km/h)', 'y': clean_label}) | |
| fig.update_layout(title=f'{weather_col} vs {actual_col}') | |
| st.plotly_chart(fig) | |