import os
import streamlit as st
import pandas as pd
from constants import BIGOS_INFO, PELCRA_INFO, ANALYSIS_INFO, ABOUT_INFO, INSPECTION_INFO
from utils import read_latest_results, basic_stats_per_dimension, retrieve_asr_systems_meta_from_the_catalog, box_plot_per_dimension, get_total_audio_duration, check_impact_of_normalization, calculate_wer_per_meta_category, calculate_wer_per_audio_feature
from app_utils import calculate_height_to_display, filter_dataframe
import matplotlib.pyplot as plt
import numpy as np
from datasets import load_dataset

hf_token = os.getenv('HF_TOKEN')
if hf_token is None:
    raise ValueError("HF_TOKEN environment variable is not set. Please check your secrets settings.")
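
# Note: hf_token is not passed explicitly below; helpers such as read_latest_results
# are assumed to pick HF_TOKEN up from the environment when fetching the private datasets.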

# Tabs
# About - description, references, contact points
# Analysis and insights - questions and answers about the benchmark results
# Leaderboard - BIGOS
# Leaderboard - PELCRA
# TODO - add tabs for other datasets, e.g. hallucinations, children's speech, etc.

st.set_page_config(layout="wide")

about, lead_bigos, lead_bigos_diagnostic, lead_bigos_synth, lead_pelcra, analysis, inspection = st.tabs(
    ["About BIGOS benchmark", "AMU BIGOS-v2 leaderboard", "AMU BIGOS-diagnostic leaderboard", "AMU BIGOS-med leaderboard", "PELCRA4BIGOS leaderboard", "Analysis", "Data and results inspection"])

cols_to_select_all = ["system", "subset", "ref_type", "norm_type", "SER", "MER", "WER", "CER"]

def plot_performance(systems_to_plot, df_per_system_with_type):
    # NOTE: relies on the module-level best/worst system variables set in the
    # "Analysis" tab below; call it only after those have been assigned.
    # Get unique subsets
    subsets = df_per_system_with_type['subset'].unique()

    # Create a color and label map
    color_label_map = {
        free_system_with_best_wer: ('blue', 'Best Free'),
        free_system_with_worst_wer: ('red', 'Worst Free'),
        commercial_system_with_best_wer: ('green', 'Best Paid'),
        commercial_system_with_worst_wer: ('orange', 'Worst Paid')
    }

    # Plot the data
    fig, ax = plt.subplots(figsize=(14, 7))
    bar_width = 0.3
    index = np.arange(len(subsets))

    for i, system in enumerate(systems_to_plot):
        subset_wer = df_per_system_with_type[df_per_system_with_type['system'] == system].set_index('subset')['WER']
        color, label = color_label_map[system]
        ax.bar(index + i * bar_width, subset_wer.loc[subsets], bar_width, label=label + ' - ' + system, color=color)

    # Add labels and title
    ax.set_xlabel('Subset')
    ax.set_ylabel('WER (%)')
    ax.set_title('Comparison of ASR system performance')
    # center the ticks under each group of bars
    ax.set_xticks(index + bar_width * (len(systems_to_plot) - 1) / 2)
    ax.set_xticklabels(subsets, rotation=90, ha='right')
    ax.legend()

    st.pyplot(fig)

def round_to_nearest(value, multiple):
    return multiple * round(value / multiple)
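
# Example: round_to_nearest(37.4, 10) -> 40; used below to pick a clean axis maximum.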

def create_bar_chart(df, systems, metric, norm_type, ref_type='orig', orientation='vertical'):
    df = df[df['norm_type'] == norm_type]
    df = df[df['ref_type'] == ref_type]

    # Prepare the data for the bar chart
    subsets = df['subset'].unique()
    bar_width = 0.2  # Width of the bars

    fig, ax = plt.subplots(figsize=(10, 10))

    max_value_all_systems = 0
    for i, system in enumerate(systems):
        system_data = df[df['system'] == system]
        max_value_for_system = max(system_data[metric])
        if max_value_for_system > max_value_all_systems:
            max_value_all_systems = round_to_nearest(max_value_for_system + 2, 10)

        # Ensure the system data is in the same order as subsets
        values = []
        for subset in subsets:
            subset_value = system_data[system_data['subset'] == subset][metric].values
            if len(subset_value) > 0:
                values.append(subset_value[0])
            else:
                values.append(0)  # Append 0 if the subset value is missing

        if orientation == 'vertical':
            # Plot each system's bars with an offset for vertical orientation
            x_pos = np.arange(len(subsets)) + i * bar_width
            ax.bar(x_pos, values, bar_width, label=system)
            # Add value labels
            for j, value in enumerate(values):
                ax.text(x_pos[j], value + max(values) * 0.03, f'{value}', ha='center', va='bottom', fontsize=6)
        else:
            # Plot each system's bars with an offset for horizontal orientation
            y_pos = np.arange(len(subsets)) + i * bar_width
            ax.barh(y_pos, values, bar_width, label=system)
            # Add value labels
            for j, value in enumerate(values):
                ax.text(value + max(values) * 0.03, y_pos[j], f'{value}', ha='left', va='center', fontsize=6)

    if orientation == 'vertical':
        ax.set_xticks(np.arange(len(subsets)) + bar_width * (len(systems) - 1) / 2)
        ax.set_xticklabels(subsets, rotation=45, ha='right')
        ax.set_ylabel(metric)
    else:
        ax.set_yticks(np.arange(len(subsets)) + bar_width * (len(systems) - 1) / 2)
        ax.set_yticklabels(subsets)
        ax.set_xlabel(metric)

    # Add grid values for the vertical and horizontal bar plots
    if orientation == 'vertical':
        ax.set_yticks(np.linspace(0, max_value_all_systems, 5))
    else:
        ax.set_xticks(np.linspace(0, max_value_all_systems, 5))

    # Put the legend outside the plot, on the right
    plt.legend(loc='upper right', bbox_to_anchor=(1.2, 1), shadow=True, ncol=1)

    st.pyplot(fig)

def create_radar_plot(df, enable_labels, systems, metric, norm_type, ref_type='orig'):
    # NOTE: enable_labels is currently unused; value labels are always drawn.
    df = df[df['norm_type'] == norm_type]
    df = df[df['ref_type'] == ref_type]

    # Prepare the data for the radar plot
    # systems = df['system'].unique()
    subsets = df['subset'].unique()
    num_vars = len(subsets)
    angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
    angles += angles[:1]  # Complete the loop

    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))

    max_value_all_systems = 0
    for system in systems:
        system_data = df[df['system'] == system]
        max_value_for_system = max(system_data[metric])
        if max_value_for_system > max_value_all_systems:
            max_value_all_systems = round_to_nearest(max_value_for_system + 2, 10)

        # Ensure the system data is in the same order as subsets
        values = []
        for subset in subsets:
            subset_value = system_data[system_data['subset'] == subset][metric].values
            if len(subset_value) > 0:
                values.append(subset_value[0])
            else:
                values.append(0)  # Append 0 if the subset value is missing
        values += values[:1]  # Complete the loop

        # Plot each system
        ax.plot(angles, values, label=system)
        ax.fill(angles, values, alpha=0.25)

        # Add value labels
        for angle, value in zip(angles, values):
            ax.text(angle, value + max(values) * 0.01, f'{value}', ha='center', va='center', fontsize=6)

    # Set the tick positions before the labels to keep them aligned on the polar axes
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(subsets)
    ax.set_yticks(np.linspace(0, max_value_all_systems, 5))

    # Put the legend outside the plot, on the right
    plt.legend(loc='upper right', bbox_to_anchor=(1.2, 1), shadow=True, ncol=1)

    st.pyplot(fig)
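
# Example invocation (hypothetical system names), as used in the "Analysis" tab below:
# create_radar_plot(df_per_dataset_all, True, ['whisper_large_v3', 'azure_stt'], 'WER', 'all')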

with about:
    st.title("About BIGOS benchmark")
    st.markdown(ABOUT_INFO, unsafe_allow_html=True)
    # TODO - load and display about BIGOS benchmark

    # Table - evaluated systems # TODO - change to concatenated table
    st.header("Evaluated ASR systems")
    dataset = "amu-cai/pl-asr-bigos-v2-secret"
    split = "test"
    df_per_sample, df_per_dataset = read_latest_results(dataset, split, codename_to_shortname_mapping=None)
    evaluated_systems_list = df_per_sample["system"].unique()
    # print("ASR systems available in the eval results for dataset {}: ".format(dataset), evaluated_systems_list)
    df_evaluated_systems = retrieve_asr_systems_meta_from_the_catalog(evaluated_systems_list)
    codename_to_shortname_mapping = dict(zip(df_evaluated_systems["Codename"], df_evaluated_systems["Shortname"]))
    # print(codename_to_shortname_mapping)

    h_df_systems = calculate_height_to_display(df_evaluated_systems)

    df_evaluated_systems_types_and_count = df_evaluated_systems["Type"].value_counts().reset_index()
    df_evaluated_systems_types_and_count.columns = ["Type", "Count"]
    st.write("Evaluated ASR systems types")
    st.dataframe(df_evaluated_systems_types_and_count, hide_index=True, use_container_width=False)

    st.write("Evaluated ASR systems details")
    # TODO - add info on who created the system (company, institution, team, etc.)
    st.dataframe(df_evaluated_systems, hide_index=True, height=h_df_systems, use_container_width=True)

    # Table - evaluation datasets
    # Table - evaluation metrics
    # Table - evaluation metadata
    # List - references
    # List - contact points
    # List - acknowledgements
    # List - changelog
    # List - FAQ
    # List - TODOs

with lead_bigos:
    # configuration for tab
    dataset = "amu-cai/pl-asr-bigos-v2-secret"
    dataset_short_name = "BIGOS"
    dataset_version = "V2"
    eval_date = "March 2024"
    split = "test"
    norm_type = "all"
    ref_type = "orig"

    # common, reusable part for all tabs presenting leaderboards for specific datasets
    #### DATA LOADING AND AUGMENTATION ####
    df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
    # keep only the ref_type and norm_type we want to analyze
    df_per_sample = df_per_sample_all[(df_per_sample_all["ref_type"] == ref_type) & (df_per_sample_all["norm_type"] == norm_type)]
    df_per_dataset = df_per_dataset_all[(df_per_dataset_all["ref_type"] == ref_type) & (df_per_dataset_all["norm_type"] == norm_type)]

    #### PARAMETERS CALCULATION ####
    evaluated_systems_list = df_per_sample["system"].unique()
    no_of_evaluated_systems = len(evaluated_systems_list)
    no_of_eval_subsets = len(df_per_dataset["subset"].unique())
    no_of_test_cases = len(df_per_sample)
    no_of_unique_recordings = len(df_per_sample["id"].unique())
    total_audio_duration_hours = get_total_audio_duration(df_per_sample)
    no_of_unique_speakers = len(df_per_sample["speaker_id"].unique())

    df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")

    ########### EVALUATION PARAMETERS PRESENTATION ################
    st.title("Leaderboard for {} {}".format(dataset_short_name, dataset_version))
    st.markdown(BIGOS_INFO, unsafe_allow_html=True)
    st.markdown("**Evaluation date:** {}".format(eval_date))
    st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
    st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
    st.markdown("**Number of evaluated system-model-subset combinations:** {}".format(len(df_per_dataset)))
    st.markdown("**Number of unique speakers:** {}".format(no_of_unique_speakers))
    st.markdown("**Number of unique recordings used for evaluation:** {}".format(no_of_unique_recordings))
    st.markdown("**Total size of the dataset:** {:.2f} hours".format(total_audio_duration_hours))
    st.markdown("**Total number of test cases (audio-hypothesis pairs):** {}".format(no_of_test_cases))
    st.markdown("**Dataset:** {}".format(dataset))
    st.markdown("**Dataset version:** {}".format(dataset_version))
    st.markdown("**Split:** {}".format(split))
    st.markdown("**Text reference type:** {}".format(ref_type))
    st.markdown("**Normalization steps:** {}".format(norm_type))

    ########### RESULTS ################
    st.header("WER (Word Error Rate) analysis")
    st.subheader("Average WER for the whole dataset")
    df_wer_avg = basic_stats_per_dimension(df_per_dataset, "WER", "dataset")
    st.dataframe(df_wer_avg)

    st.subheader("Comparison of average WER for free and commercial systems")
    df_wer_avg_free_commercial = basic_stats_per_dimension(df_per_dataset_with_asr_systems_meta, "WER", "Type")
    st.dataframe(df_wer_avg_free_commercial)

    ##################### PER SYSTEM ANALYSIS #########################
    analysis_dim = "system"
    metric = "WER"
    st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
    df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
    h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
    st.dataframe(df_wer_per_system_from_per_dataset, height=h_df_per_system_per_dataset)

    st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
    fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + " [%]")
    st.pyplot(fig, clear_figure=True, use_container_width=True)

    ##################### PER SUBSET ANALYSIS #########################
    analysis_dim = "subset"
    metric = "WER"
    st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
    df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
    h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
    st.dataframe(df_wer_per_system_from_per_dataset, height=h_df_per_system_per_dataset)

    st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
    fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + " [%]")
    st.pyplot(fig, clear_figure=True, use_container_width=True)

    ### IMPACT OF NORMALIZATION ON ERROR RATES #####
    # Calculate the average impact of various norm_types for all datasets and systems
    df_per_dataset_selected_cols = df_per_dataset_all[cols_to_select_all]
    diff_in_metrics = check_impact_of_normalization(df_per_dataset_selected_cols)
    st.subheader("Impact of normalization of references and hypotheses on evaluation metrics")
    st.dataframe(diff_in_metrics, use_container_width=False)

    # Visualize the differences in metrics graphically, with data labels
    fig, axs = plt.subplots(3, 2, figsize=(12, 12))
    fig.subplots_adjust(hspace=0.6, wspace=0.6)
    # remove the unused sixth subplot
    fig.delaxes(axs[2, 1])

    metrics = ['SER', 'WER', 'MER', 'CER', 'Average']
    colors = ['blue', 'orange', 'green', 'red', 'purple']
    for ax, metric, color in zip(axs.flatten(), metrics, colors):
        bars = ax.bar(diff_in_metrics.index, diff_in_metrics[metric], color=color)
        ax.set_title(f'Normalization impact on {metric}')
        if metric == 'Average':
            ax.set_title('Average normalization impact on all metrics')
        ax.set_xlabel('Normalization Type')
        ax.set_ylabel(f'Difference in {metric}')
        ax.grid(True)
        ax.set_xticks(range(len(diff_in_metrics.index)))
        ax.set_xticklabels(diff_in_metrics.index, rotation=45, ha='right')
        min_val = diff_in_metrics[metric].min()
        ax.set_ylim([min_val * 1.1, diff_in_metrics[metric].max() * 1.1])
        for bar in bars:
            height = bar.get_height()
            ax.annotate(f'{height:.2f}',
                        xy=(bar.get_x() + bar.get_width() / 2, height),
                        xytext=(0, -12),  # 12 points downward offset
                        textcoords="offset points",
                        ha='center', va='bottom')

    # Display the plot in Streamlit
    st.pyplot(fig)

    ##################### APPENDIX #########################
    st.header("Appendix - Full evaluation results per subset for all evaluated systems")
    # select only the columns we want to display
    st.dataframe(df_per_dataset_selected_cols, hide_index=True, use_container_width=False)

with lead_bigos_diagnostic:
    # configuration for tab
    dataset = "amu-cai/pl-asr-bigos-v2-diagnostic"
    dataset_short_name = "BIGOS DIAGNOSTIC"
    dataset_version = "V2"
    eval_date = "March 2024"
    split = "test"
    norm_type = "all"
    ref_type = "orig"

    # common, reusable part for all tabs presenting leaderboards for specific datasets
    #### DATA LOADING AND AUGMENTATION ####
    df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
    # keep only the ref_type and norm_type we want to analyze
    df_per_sample = df_per_sample_all[(df_per_sample_all["ref_type"] == ref_type) & (df_per_sample_all["norm_type"] == norm_type)]
    df_per_dataset = df_per_dataset_all[(df_per_dataset_all["ref_type"] == ref_type) & (df_per_dataset_all["norm_type"] == norm_type)]

    #### PARAMETERS CALCULATION ####
    evaluated_systems_list = df_per_sample["system"].unique()
    no_of_evaluated_systems = len(evaluated_systems_list)
    no_of_eval_subsets = len(df_per_dataset["subset"].unique())
    no_of_test_cases = len(df_per_sample)
    no_of_unique_recordings = len(df_per_sample["id"].unique())
    total_audio_duration_hours = get_total_audio_duration(df_per_sample)
    # no_of_unique_speakers = len(df_per_sample["speaker_id"].unique())
    no_of_unique_speakers = "N/A"

    df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")

    ########### EVALUATION PARAMETERS PRESENTATION ################
    st.title("Leaderboard for {} {}".format(dataset_short_name, dataset_version))
    st.markdown(BIGOS_INFO, unsafe_allow_html=True)
    st.markdown("**Evaluation date:** {}".format(eval_date))
    st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
    st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
    st.markdown("**Number of evaluated system-model-subset combinations:** {}".format(len(df_per_dataset)))
    st.markdown("**Number of unique speakers:** {}".format(no_of_unique_speakers))
    st.markdown("**Number of unique recordings used for evaluation:** {}".format(no_of_unique_recordings))
    st.markdown("**Total size of the dataset:** {:.2f} hours".format(total_audio_duration_hours))
    st.markdown("**Total number of test cases (audio-hypothesis pairs):** {}".format(no_of_test_cases))
    st.markdown("**Dataset:** {}".format(dataset))
    st.markdown("**Dataset version:** {}".format(dataset_version))
    st.markdown("**Split:** {}".format(split))
    st.markdown("**Text reference type:** {}".format(ref_type))
    st.markdown("**Normalization steps:** {}".format(norm_type))

    ########### RESULTS ################
    st.header("WER (Word Error Rate) analysis")
    st.subheader("Average WER for the whole dataset")
    df_wer_avg = basic_stats_per_dimension(df_per_dataset, "WER", "dataset")
    st.dataframe(df_wer_avg)

    st.subheader("Comparison of average WER for free and commercial systems")
    df_wer_avg_free_commercial = basic_stats_per_dimension(df_per_dataset_with_asr_systems_meta, "WER", "Type")
    st.dataframe(df_wer_avg_free_commercial)

    ##################### PER SYSTEM ANALYSIS #########################
    analysis_dim = "system"
    metric = "WER"
    st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
    df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
    h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
    st.dataframe(df_wer_per_system_from_per_dataset, height=h_df_per_system_per_dataset)

    st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
    fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + " [%]")
    st.pyplot(fig, clear_figure=True, use_container_width=True)

    ##################### PER SUBSET ANALYSIS #########################
    analysis_dim = "subset"
    metric = "WER"
    st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
    df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
    h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
    st.dataframe(df_wer_per_system_from_per_dataset, height=h_df_per_system_per_dataset)

    st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
    fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + " [%]")
    st.pyplot(fig, clear_figure=True, use_container_width=True)

    ##################### APPENDIX #########################
    st.header("Appendix - Full evaluation results per subset for all evaluated systems")
    # select only the columns we want to display
    df_per_dataset_selected_cols = df_per_dataset_all[cols_to_select_all]
    st.dataframe(df_per_dataset_selected_cols, hide_index=True, use_container_width=False)

with lead_bigos_synth:
    # configuration for tab
    dataset = "amu-cai/pl-asr-bigos-synth"
    dataset_short_name = "BIGOS synthetic"
    dataset_version = "V1"
    eval_date = "March 2024"
    split = "test"
    norm_type = "all"
    ref_type = "orig"

    # common, reusable part for all tabs presenting leaderboards for specific datasets
    #### DATA LOADING AND AUGMENTATION ####
    df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
    # keep only the ref_type and norm_type we want to analyze
    df_per_sample = df_per_sample_all[(df_per_sample_all["ref_type"] == ref_type) & (df_per_sample_all["norm_type"] == norm_type)]
    df_per_dataset = df_per_dataset_all[(df_per_dataset_all["ref_type"] == ref_type) & (df_per_dataset_all["norm_type"] == norm_type)]

    #### PARAMETERS CALCULATION ####
    evaluated_systems_list = df_per_sample["system"].unique()
    no_of_evaluated_systems = len(evaluated_systems_list)
    no_of_eval_subsets = len(df_per_dataset["subset"].unique())
    no_of_test_cases = len(df_per_sample)
    no_of_unique_recordings = len(df_per_sample["id"].unique())
    total_audio_duration_hours = get_total_audio_duration(df_per_sample)
    # no_of_unique_speakers = len(df_per_sample["speaker_id"].unique())
    no_of_unique_speakers = "N/A"

    df_evaluated_systems = retrieve_asr_systems_meta_from_the_catalog(evaluated_systems_list)
    df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")

    ########### EVALUATION PARAMETERS PRESENTATION ################
    st.title("Leaderboard for {} {}".format(dataset_short_name, dataset_version))
    st.markdown(BIGOS_INFO, unsafe_allow_html=True)
    st.markdown("**Evaluation date:** {}".format(eval_date))
    st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
    st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
    st.markdown("**Number of evaluated system-model-subset combinations:** {}".format(len(df_per_dataset)))
    st.markdown("**Number of unique speakers:** {}".format(no_of_unique_speakers))
    st.markdown("**Number of unique recordings used for evaluation:** {}".format(no_of_unique_recordings))
    st.markdown("**Total size of the dataset:** {:.2f} hours".format(total_audio_duration_hours))
    st.markdown("**Total number of test cases (audio-hypothesis pairs):** {}".format(no_of_test_cases))
    st.markdown("**Dataset:** {}".format(dataset))
    st.markdown("**Dataset version:** {}".format(dataset_version))
    st.markdown("**Split:** {}".format(split))
    st.markdown("**Text reference type:** {}".format(ref_type))
    st.markdown("**Normalization steps:** {}".format(norm_type))

    ########### RESULTS ################
    st.header("WER (Word Error Rate) analysis")
    st.subheader("Average WER for the whole dataset")
    df_wer_avg = basic_stats_per_dimension(df_per_dataset, "WER", "dataset")
    st.dataframe(df_wer_avg)

    st.subheader("Comparison of average WER for free and commercial systems")
    df_wer_avg_free_commercial = basic_stats_per_dimension(df_per_dataset_with_asr_systems_meta, "WER", "Type")
    st.dataframe(df_wer_avg_free_commercial)

    ##################### PER SYSTEM ANALYSIS #########################
    analysis_dim = "system"
    metric = "WER"
    st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
    df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
    h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
    st.dataframe(df_wer_per_system_from_per_dataset, height=h_df_per_system_per_dataset)

    st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
    fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + " [%]")
    st.pyplot(fig, clear_figure=True, use_container_width=True)

    ##################### PER SUBSET ANALYSIS #########################
    analysis_dim = "subset"
    metric = "WER"
    st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
    df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
    h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
    st.dataframe(df_wer_per_system_from_per_dataset, height=h_df_per_system_per_dataset)

    st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
    fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + " [%]")
    st.pyplot(fig, clear_figure=True, use_container_width=True)

    ### IMPACT OF NORMALIZATION ON ERROR RATES #####
    # Calculate the average impact of various norm_types for all datasets and systems
    df_per_dataset_selected_cols = df_per_dataset_all[cols_to_select_all]
    diff_in_metrics = check_impact_of_normalization(df_per_dataset_selected_cols)
    st.subheader("Impact of normalization of references and hypotheses on evaluation metrics")
    st.dataframe(diff_in_metrics, use_container_width=False)

    ##################### APPENDIX #########################
    st.header("Appendix - Full evaluation results per subset for all evaluated systems")
    # select only the columns we want to display
    df_per_dataset_selected_cols = df_per_dataset[cols_to_select_all]
    st.dataframe(df_per_dataset_selected_cols, hide_index=True, use_container_width=False)

with lead_pelcra:
    st.title("PELCRA Leaderboard")
    st.markdown(PELCRA_INFO, unsafe_allow_html=True)

    # configuration for tab
    dataset = "pelcra/pl-asr-pelcra-for-bigos-secret"
    dataset_short_name = "PELCRA"
    dataset_version = "V1"
    eval_date = "March 2024"
    split = "test"
    norm_type = "all"
    ref_type = "orig"

    # common, reusable part for all tabs presenting leaderboards for specific datasets
    #### DATA LOADING AND AUGMENTATION ####
    df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
    # keep only the ref_type and norm_type we want to analyze
    df_per_sample = df_per_sample_all[(df_per_sample_all["ref_type"] == ref_type) & (df_per_sample_all["norm_type"] == norm_type)]
    df_per_dataset = df_per_dataset_all[(df_per_dataset_all["ref_type"] == ref_type) & (df_per_dataset_all["norm_type"] == norm_type)]

    #### PARAMETERS CALCULATION ####
    evaluated_systems_list = df_per_sample["system"].unique()
    no_of_evaluated_systems = len(evaluated_systems_list)
    no_of_eval_subsets = len(df_per_dataset["subset"].unique())
    no_of_test_cases = len(df_per_sample)
    no_of_unique_recordings = len(df_per_sample["id"].unique())
    total_audio_duration_hours = get_total_audio_duration(df_per_sample)
    no_of_unique_speakers = len(df_per_sample["speaker_id"].unique())

    df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")

    ########### EVALUATION PARAMETERS PRESENTATION ################
    st.title("Leaderboard for {} {}".format(dataset_short_name, dataset_version))
    st.markdown(PELCRA_INFO, unsafe_allow_html=True)
    st.markdown("**Evaluation date:** {}".format(eval_date))
    st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
    st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
    st.markdown("**Number of evaluated system-model-subset combinations:** {}".format(len(df_per_dataset)))
    st.markdown("**Number of unique speakers:** {}".format(no_of_unique_speakers))
    st.markdown("**Number of unique recordings used for evaluation:** {}".format(no_of_unique_recordings))
    st.markdown("**Total size of the dataset:** {:.2f} hours".format(total_audio_duration_hours))
    st.markdown("**Total number of test cases (audio-hypothesis pairs):** {}".format(no_of_test_cases))
    st.markdown("**Dataset:** {}".format(dataset))
    st.markdown("**Dataset version:** {}".format(dataset_version))
    st.markdown("**Split:** {}".format(split))
    st.markdown("**Text reference type:** {}".format(ref_type))
    st.markdown("**Normalization steps:** {}".format(norm_type))

    ########### RESULTS ################
    st.header("WER (Word Error Rate) analysis")
    st.subheader("Average WER for the whole dataset")
    df_wer_avg = basic_stats_per_dimension(df_per_dataset, "WER", "dataset")
    st.dataframe(df_wer_avg)

    st.subheader("Comparison of average WER for free and commercial systems")
    df_wer_avg_free_commercial = basic_stats_per_dimension(df_per_dataset_with_asr_systems_meta, "WER", "Type")
    st.dataframe(df_wer_avg_free_commercial)

    ##################### PER SYSTEM ANALYSIS #########################
    analysis_dim = "system"
    metric = "WER"
    st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
    df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
    h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
    st.dataframe(df_wer_per_system_from_per_dataset, height=h_df_per_system_per_dataset)

    st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
    fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + " [%]")
    st.pyplot(fig, clear_figure=True, use_container_width=True)

    ##################### PER SUBSET ANALYSIS #########################
    analysis_dim = "subset"
    metric = "WER"
    st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
    df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
    h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
    st.dataframe(df_wer_per_system_from_per_dataset, height=h_df_per_system_per_dataset)

    st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
    fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + " [%]")
    st.pyplot(fig, clear_figure=True, use_container_width=True)

    ### IMPACT OF NORMALIZATION ON ERROR RATES #####
    # Calculate the average impact of various norm_types for all datasets and systems
    df_per_dataset_selected_cols = df_per_dataset_all[cols_to_select_all]
    diff_in_metrics = check_impact_of_normalization(df_per_dataset_selected_cols)
    st.subheader("Impact of normalization of references and hypotheses on evaluation metrics")
    st.dataframe(diff_in_metrics, use_container_width=False)

    # Visualize the differences in metrics graphically, with data labels
    fig, axs = plt.subplots(3, 2, figsize=(12, 12))
    fig.subplots_adjust(hspace=0.6, wspace=0.6)
    # remove the unused sixth subplot
    fig.delaxes(axs[2, 1])

    metrics = ['SER', 'WER', 'MER', 'CER', 'Average']
    colors = ['blue', 'orange', 'green', 'red', 'purple']
    for ax, metric, color in zip(axs.flatten(), metrics, colors):
        bars = ax.bar(diff_in_metrics.index, diff_in_metrics[metric], color=color)
        ax.set_title(f'Normalization impact on {metric}')
        if metric == 'Average':
            ax.set_title('Average normalization impact on all metrics')
        ax.set_xlabel('Normalization Type')
        ax.set_ylabel(f'Difference in {metric}')
        ax.grid(True)
        ax.set_xticks(range(len(diff_in_metrics.index)))
        ax.set_xticklabels(diff_in_metrics.index, rotation=45, ha='right')
        min_val = diff_in_metrics[metric].min()
        ax.set_ylim([min_val * 1.1, diff_in_metrics[metric].max() * 1.1])
        for bar in bars:
            height = bar.get_height()
            ax.annotate(f'{height:.2f}',
                        xy=(bar.get_x() + bar.get_width() / 2, height),
                        xytext=(0, -12),  # 12 points downward offset
                        textcoords="offset points",
                        ha='center', va='bottom')

    # Display the plot in Streamlit
    st.pyplot(fig)

    ##################### APPENDIX #########################
    st.header("Appendix - Full evaluation results per subset for all evaluated systems")
    # select only the columns we want to display
    df_per_dataset_selected_cols = df_per_dataset_all[cols_to_select_all]
    st.dataframe(df_per_dataset_selected_cols, hide_index=True, use_container_width=False)
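
# ---------------------------------------------------------------------------
# The four leaderboard tabs above repeat the same load -> filter -> present
# sequence. A possible refactor (a sketch only, not wired in; it reuses the
# helpers already imported above) could centralize the shared part like this:
def render_leaderboard_header(dataset, dataset_short_name, dataset_version, eval_date, info_md,
                              split="test", norm_type="all", ref_type="orig"):
    # load the latest results and keep only the requested ref_type/norm_type
    df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
    df_per_sample = df_per_sample_all[(df_per_sample_all["ref_type"] == ref_type) & (df_per_sample_all["norm_type"] == norm_type)]
    df_per_dataset = df_per_dataset_all[(df_per_dataset_all["ref_type"] == ref_type) & (df_per_dataset_all["norm_type"] == norm_type)]
    st.title("Leaderboard for {} {}".format(dataset_short_name, dataset_version))
    st.markdown(info_md, unsafe_allow_html=True)
    st.markdown("**Evaluation date:** {}".format(eval_date))
    # ... remaining metadata lines and WER tables as in the tabs above ...
    return df_per_sample, df_per_dataset
# ---------------------------------------------------------------------------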

with analysis:
    datasets = [
        "amu-cai/pl-asr-bigos-v2-secret",
        "pelcra/pl-asr-pelcra-for-bigos-secret",
        "amu-cai/pl-asr-bigos-v2-diagnostic",
        "amu-cai/pl-asr-bigos-v2-med"]

    st.title("Analysis and insights")
    st.markdown(ANALYSIS_INFO, unsafe_allow_html=True)

    st.title("Plots for analyzing ASR systems performance")

    # select the dataset to display results for
    dataset = st.selectbox("Select Dataset", datasets, index=datasets.index('amu-cai/pl-asr-bigos-v2-secret'))

    # read the latest results for the selected dataset
    # (split, ref_type and norm_type are carried over from the leaderboard tabs above)
    print("Reading the latest results for dataset: ", dataset)
    df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)

    # keep only the ref_type and norm_type we want to analyze
    df_per_sample = df_per_sample_all[(df_per_sample_all["ref_type"] == ref_type) & (df_per_sample_all["norm_type"] == norm_type)]
    df_per_dataset = df_per_dataset_all[(df_per_dataset_all["ref_type"] == ref_type) & (df_per_dataset_all["norm_type"] == norm_type)]

    evaluated_systems_list = df_per_sample["system"].unique()
    print(evaluated_systems_list)
    df_evaluated_systems = retrieve_asr_systems_meta_from_the_catalog(evaluated_systems_list)
    print(df_evaluated_systems)

    # read the available options to analyze for the specific dataset
    splits = list(df_per_dataset_all['subset'].unique())  # unique subsets
    norm_types = list(df_per_dataset_all['norm_type'].unique())  # unique normalization types
    ref_types = list(df_per_dataset_all['ref_type'].unique())  # unique reference types
    systems = list(df_per_dataset_all['system'].unique())  # unique systems
    metrics = list(df_per_dataset_all.columns[7:])  # available metric columns

    # Select the systems to display; more than one system can be selected.
    systems_selected = st.multiselect("Select ASR Systems", systems)

    # Select the metric to display
    metric = st.selectbox("Select Metric", metrics, index=metrics.index('WER'))

    # Select the normalization type
    norm_type = st.selectbox("Select Normalization Type", norm_types, index=norm_types.index('all'))

    # Select the reference type
    ref_type = st.selectbox("Select Reference Type", ref_types, index=ref_types.index('orig'))

    enable_labels = st.checkbox("Enable labels on radar plot", value=True)
    enable_bar_chart = st.checkbox("Enable bar chart", value=True)
    enable_polar_plot = st.checkbox("Enable radar plot", value=True)

    orientation = st.selectbox("Select orientation", ["vertical", "horizontal"], index=0)

    if enable_polar_plot and metric and systems_selected:
        create_radar_plot(df_per_dataset_all, enable_labels, systems_selected, metric, norm_type, ref_type)

    if enable_bar_chart and metric and systems_selected:
        create_bar_chart(df_per_dataset_all, systems_selected, metric, norm_type, ref_type, orientation)

    ##### ANALYSIS - COMMERCIAL VS FREE SYSTEMS #####
    # Generate a dataframe with columns: system, Type, subset, average WER
    df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")
    df_wer_avg_per_system_all_subsets_with_type = df_per_dataset_with_asr_systems_meta.groupby(['system', 'Type', 'subset'])['WER'].mean().reset_index()
    print(df_wer_avg_per_system_all_subsets_with_type)

    # Select the best and worst system among the free and commercial systems
    free_systems = df_wer_avg_per_system_all_subsets_with_type[df_wer_avg_per_system_all_subsets_with_type['Type'] == 'free']['system'].unique()
    commercial_systems = df_wer_avg_per_system_all_subsets_with_type[df_wer_avg_per_system_all_subsets_with_type['Type'] == 'commercial']['system'].unique()
    free_system_with_best_wer = df_wer_avg_per_system_all_subsets_with_type[df_wer_avg_per_system_all_subsets_with_type['system'].isin(free_systems)].groupby('system')['WER'].mean().idxmin()
    free_system_with_worst_wer = df_wer_avg_per_system_all_subsets_with_type[df_wer_avg_per_system_all_subsets_with_type['system'].isin(free_systems)].groupby('system')['WER'].mean().idxmax()
    commercial_system_with_best_wer = df_wer_avg_per_system_all_subsets_with_type[df_wer_avg_per_system_all_subsets_with_type['system'].isin(commercial_systems)].groupby('system')['WER'].mean().idxmin()
    commercial_system_with_worst_wer = df_wer_avg_per_system_all_subsets_with_type[df_wer_avg_per_system_all_subsets_with_type['system'].isin(commercial_systems)].groupby('system')['WER'].mean().idxmax()
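    # groupby('system')['WER'].mean() yields a Series indexed by system name, so
    # idxmin()/idxmax() return the name of the system with the lowest/highest mean WER.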
| #print(f"Best free system: {free_system_with_best_wer}") | |
| #print(f"Worst free system: {free_system_with_worst_wer}") | |
| #print(f"Best commercial system: {commercial_system_with_best_wer}") | |
| #print(f"Worst commercial system: {commercial_system_with_worst_wer}") | |
| st.subheader("Comparison of WER for free and commercial systems") | |
| # Best and worst system for free and commercial systems - print table | |
| header = ["Type", "Best System", "Worst System"] | |
| data = [ | |
| ["Free", free_system_with_best_wer, free_system_with_worst_wer], | |
| ["Commercial", commercial_system_with_best_wer, commercial_system_with_worst_wer] | |
| ] | |
| st.subheader("Best and worst systems for dataset {}".format(dataset)) | |
| df_best_worse_systems = pd.DataFrame(data, columns=header) | |
| # do not display index | |
| st.dataframe(df_best_worse_systems) | |
| st.subheader("Comparison of average WER for best systems") | |
| df_per_dataset_best_systems = df_per_dataset_with_asr_systems_meta[df_per_dataset_with_asr_systems_meta['system'].isin([free_system_with_best_wer, commercial_system_with_best_wer])] | |
| df_wer_avg_best_free_commercial = basic_stats_per_dimension(df_per_dataset_best_systems, "WER", "Type") | |
| st.dataframe(df_wer_avg_best_free_commercial) | |
| # Create lookup table to get system type based on its name | |
| #system_type_lookup = dict(zip(df_wer_avg_per_system_all_subsets_with_type['system'], df_wer_avg_per_system_all_subsets_with_type['Type'])) | |
| systems_to_plot_best= [free_system_with_best_wer, commercial_system_with_best_wer] | |
| plot_performance(systems_to_plot_best, df_wer_avg_per_system_all_subsets_with_type) | |
| st.subheader("Comparison of average WER for the worst systems") | |
| df_per_dataset_worst_systems = df_per_dataset_with_asr_systems_meta[df_per_dataset_with_asr_systems_meta['system'].isin([free_system_with_worst_wer, commercial_system_with_worst_wer])] | |
| df_wer_avg_worst_free_commercial = basic_stats_per_dimension(df_per_dataset_worst_systems, "WER", "Type") | |
| st.dataframe(df_wer_avg_worst_free_commercial) | |
| systems_to_plot_worst=[free_system_with_worst_wer, commercial_system_with_worst_wer] | |
| plot_performance(systems_to_plot_worst, df_wer_avg_per_system_all_subsets_with_type) | |
    # WER as a function of model size
    st.subheader("WER as a function of model size for dataset {}".format(dataset))

    # select only free systems for the analysis: systems without a numeric
    # 'Parameters [M]' value (e.g. commercial APIs) are dropped by the groupby
    free_systems_wer_per_subset = df_per_dataset_with_asr_systems_meta.groupby(['system', 'Parameters [M]', 'subset'])['WER'].mean().reset_index()
    # change the Parameters [M] column type to integer and sort by model size
    free_systems_wer_per_subset['Parameters [M]'] = free_systems_wer_per_subset['Parameters [M]'].astype(int)
    free_systems_wer_per_subset = free_systems_wer_per_subset.sort_values(by='Parameters [M]')

    free_systems_wer_average_across_all_subsets = free_systems_wer_per_subset.groupby(['system', 'Parameters [M]'])['WER'].mean().reset_index()
    # change the Parameters [M] column type to integer and sort by model size
    free_systems_wer_average_across_all_subsets['Parameters [M]'] = free_systems_wer_average_across_all_subsets['Parameters [M]'].astype(int)
    free_systems_wer_average_across_all_subsets = free_systems_wer_average_across_all_subsets.sort_values(by='Parameters [M]')
    free_systems_wer = free_systems_wer_average_across_all_subsets

    # use the system name as index, sort by WER and round WER to 2 decimal places
    free_systems_wer_to_show = free_systems_wer.set_index('system')
    free_systems_wer_to_show = free_systems_wer_to_show.sort_values(by='WER').round({'WER': 2})

    # print the dataframe in Streamlit with average WER, system name and model size
    st.dataframe(free_systems_wer_to_show)

    # scatter plot: X axis - model size (Parameters [M]), Y axis - average WER,
    # one color per point, with a legend of system names
    fig, ax = plt.subplots()
    for system in free_systems_wer['system'].unique():
        subset = free_systems_wer[free_systems_wer['system'] == system]
        ax.scatter(subset['Parameters [M]'], subset['WER'], label=system)
        # Add a text annotation for each point
        for i, point in subset.iterrows():
            ax.annotate(point['system'], (point['Parameters [M]'], point['WER']), textcoords="offset points", xytext=(-10, -10), ha='left', rotation=-30, fontsize=5)
    ax.set_xlabel('Model Size [M]')
    ax.set_ylabel('WER (%)')
    ax.set_title('WER as a function of model size')
    # decrease the legend font size and place it outside the plot
    ax.legend(title='System', bbox_to_anchor=(1.05, 1), loc='upper left')
    st.pyplot(fig)
    ##################################################################################################################################################
    # WER per audio duration

    # calculate the average WER per audio duration bucket for the best free and commercial systems
    selected_systems = [free_system_with_best_wer, commercial_system_with_best_wer]

    # filter out results for the selected systems; copy to avoid mutating a view below
    df_per_sample_selected_systems = df_per_sample[df_per_sample['system'].isin(selected_systems)].copy()

    # add a column with the audio duration mapped to the nearest bucket
    audio_duration_buckets = [1, 2, 3, 4, 5, 10, 15, 20, 30, 40, 50, 60]
    df_per_sample_selected_systems['audio_duration_buckets'] = df_per_sample_selected_systems['audio_duration'].apply(lambda x: min(audio_duration_buckets, key=lambda y: abs(x - y)))
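    # e.g. a 2.4 s clip maps to bucket 2, a 7.2 s clip to bucket 5, a 27 s clip to bucket 30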
    # calculate the average WER per audio duration bucket
    df_per_sample_wer_audio = df_per_sample_selected_systems.groupby(['system', 'audio_duration_buckets'])['WER'].mean().reset_index()
    # add a column with the number of samples per bucket
    df_per_sample_wer_audio['number_of_samples'] = df_per_sample_selected_systems.groupby(['system', 'audio_duration_buckets'])['WER'].count().values
    df_per_sample_wer_audio = df_per_sample_wer_audio.sort_values(by='audio_duration_buckets')
    # round the WER column to 2 decimal places
    df_per_sample_wer_audio['WER'] = df_per_sample_wer_audio['WER'].round(2)

    # pivot df_per_sample_wer_audio: systems become columns, audio_duration_buckets the index
    df_per_sample_wer_audio_pivot = df_per_sample_wer_audio.pivot(index='audio_duration_buckets', columns='system', values='WER')
    df_per_sample_wer_audio_pivot = df_per_sample_wer_audio_pivot.round(2)
    df_per_sample_wer_audio_pivot['number_of_samples'] = df_per_sample_wer_audio[df_per_sample_wer_audio['system'] == free_system_with_best_wer].groupby('audio_duration_buckets')['number_of_samples'].sum().values

    # put number_of_samples as the first column after the index
    df_per_sample_wer_audio_pivot = df_per_sample_wer_audio_pivot[['number_of_samples'] + [col for col in df_per_sample_wer_audio_pivot.columns if col != 'number_of_samples']]

    # print the dataframe in Streamlit
    st.dataframe(df_per_sample_wer_audio_pivot)
    # scatter plot of the bucketed WER values:
    # one color per system, point size proportional to the number of samples in the bucket,
    # X axis - audio duration bucket, Y axis - average WER
    fig, ax = plt.subplots()
    for system in selected_systems:
        subset = df_per_sample_wer_audio[df_per_sample_wer_audio['system'] == system]
        ax.scatter(subset['audio_duration_buckets'], subset['WER'], label=system, s=subset['number_of_samples'] * 0.5)
    ax.set_xlabel('Audio Duration [s]')
    ax.set_ylabel('WER (%)')
    ax.set_title('WER as a function of audio duration')
    # place the legend outside the plot, on the right
    ax.legend(title='System', bbox_to_anchor=(1.05, 1), loc='upper left')
    st.pyplot(fig)
    ##################################################################################################################################################
    # WER per speech rate
    audio_feature_to_analyze = 'speech_rate_words'
    audio_feature_unit = ' [words/s]'
    metric = 'WER'
    metric_unit = ' [%]'
    no_of_buckets = 10
    # calculate the average WER per speech rate bucket for the best free and commercial systems
    selected_systems = [free_system_with_best_wer, commercial_system_with_best_wer]
    df_per_sample_wer_feature_pivot, df_per_sample_wer_feature = calculate_wer_per_audio_feature(df_per_sample, selected_systems, audio_feature_to_analyze, metric, no_of_buckets)

    # print the dataframe in Streamlit
    st.dataframe(df_per_sample_wer_feature_pivot)

    # scatter plot of the bucketed WER values:
    # one color per system, point size proportional to the number of samples in the bucket,
    # X axis - speech rate bucket, Y axis - average WER
    fig, ax = plt.subplots()
    for system in selected_systems:
        subset = df_per_sample_wer_feature[df_per_sample_wer_feature['system'] == system]
        ax.scatter(subset[audio_feature_to_analyze], subset[metric], label=system, s=subset['number_of_samples'] * 0.5)
    ax.set_xlabel(audio_feature_to_analyze.replace('_', ' ').capitalize() + audio_feature_unit)
    ax.set_ylabel(metric + metric_unit)
    ax.set_title('{} as a function of speech rate'.format(metric))
    # place the legend where it fits best
    ax.legend(title='System', loc='best')
    st.pyplot(fig)
    ################################################################################################################################################
    # WER PER GENDER
    # selected_systems = [free_system_with_best_wer, commercial_system_with_best_wer, free_system_with_worst_wer, commercial_system_with_worst_wer]
    selected_systems = df_per_sample['system'].unique()
    df_per_sample_wer_gender_pivot, df_available_samples_per_category_per_system, no_samples_per_category = calculate_wer_per_meta_category(df_per_sample, selected_systems, 'WER', 'speaker_gender')
    # print(df_per_sample_wer_gender_pivot)
    # print(no_samples_per_category)

    # print the dataframes in Streamlit
    st.write("Number of samples per category")
    for system in selected_systems:
        st.write(f"System: {system}")
        df_available_samples_per_category = df_available_samples_per_category_per_system[system]
        st.dataframe(df_available_samples_per_category)

    st.write("Number of samples analyzed per category - {}".format(no_samples_per_category))
    st.dataframe(df_per_sample_wer_gender_pivot)

    # scatter plot:
    # X axis - difference in WER across speaker gender, Y axis - ASR systems,
    # one color per system
    # (skip the last 3 rows, which hold the median, average and std aggregate values)
    fig, ax = plt.subplots()
    difference_values = df_per_sample_wer_gender_pivot['Difference'].iloc[:-3]
    selected_systems = df_per_sample_wer_gender_pivot.index[:-3]
    ax.scatter(difference_values, selected_systems, c=range(len(selected_systems)), cmap='viridis')
    ax.set_ylabel('ASR System')
    ax.set_xlabel('Difference in WER across speaker gender')
    ax.set_title('ASR system performance bias across genders')
    # add labels with the difference in WER values
    for i, txt in enumerate(difference_values):
        ax.annotate(txt, (difference_values.iloc[i], selected_systems[i]), fontsize=5, ha='right')
    st.pyplot(fig)
    #####################################################################################################################################################################################
    # WER per age
    df_per_sample_wer_age_pivot, df_available_samples_per_category_per_system, no_samples_per_category = calculate_wer_per_meta_category(df_per_sample, selected_systems, 'WER', 'speaker_age')
    # print(df_per_sample_wer_age_pivot)
    # print(no_samples_per_category)

    # print the dataframes in Streamlit
    st.write("Number of samples per category")
    for system in selected_systems:
        st.write(f"System: {system}")
        df_available_samples_per_category = df_available_samples_per_category_per_system[system]
        st.dataframe(df_available_samples_per_category)

    st.write("Number of samples analyzed per category - {}".format(no_samples_per_category))
    st.write("WER per age")
    st.dataframe(df_per_sample_wer_age_pivot)

    # scatter plot:
    # X axis - standard deviation of WER across speaker age groups, Y axis - ASR systems,
    # one color per system
    # (skip the last 3 rows, which hold the median, average and std aggregate values)
    fig, ax = plt.subplots()
    difference_values = df_per_sample_wer_age_pivot['Std Dev'].iloc[:-3]
    selected_systems = df_per_sample_wer_age_pivot.index[:-3]
    ax.scatter(difference_values, selected_systems, c=range(len(selected_systems)), cmap='viridis')
    ax.set_ylabel('ASR System')
    ax.set_xlabel('Standard Deviation in WER across speaker age')
    ax.set_title('ASR system performance bias across age groups')
    # add labels with the difference in WER values
    for i, txt in enumerate(difference_values):
        ax.annotate(txt, (difference_values.iloc[i], selected_systems[i]), fontsize=5, ha='right')
    st.pyplot(fig)

    # TODO: READ vs CONVERSATIONAL SPEECH - average WER
    # TODO: hallucination rate per system

with inspection:
    st.title("Browse and manually inspect evaluation corpora and ASR results")
    st.markdown(INSPECTION_INFO, unsafe_allow_html=True)
    # TODO - load and display analysis and insights
    # filter dataset by audio id, type, ref/hyp content, ref/hyp length, words/chars per second, etc.
    # playback audio
    # https://docs.streamlit.io/library/api-reference/media/st.audio
    datasets = [
        "amu-cai/pl-asr-bigos-v2-secret",
        "pelcra/pl-asr-pelcra-for-bigos-secret",
        "amu-cai/pl-asr-bigos-v2-diagnostic",
        "amu-cai/pl-asr-bigos-v2-med"]

    st.title("Data for qualitative analysis")

    # select the dataset to display results for
    dataset = st.selectbox("Select Dataset", datasets, key="dataset_inspection")

    # read the latest results for the selected dataset
    df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)

    # read the available options to analyze for the specific dataset
    splits = list(df_per_dataset_all['subset'].unique())  # unique subsets
    norm_types = list(df_per_dataset_all['norm_type'].unique())  # unique normalization types
    ref_types = list(df_per_dataset_all['ref_type'].unique())  # unique reference types
    systems = list(df_per_dataset_all['system'].unique())  # unique systems
    metrics = list(df_per_dataset_all.columns[7:])  # available metric columns

    # Select the systems to display; more than one system can be selected.
    systems_selected = st.multiselect("Select ASR Systems", systems, key="systems_inspection", default=systems[:2])

    # Select the metric to display
    metric = st.selectbox("Select Metric", metrics, index=metrics.index('WER'), key="metric_inspection")

    # Select the normalization type
    norm_type = st.selectbox("Select Normalization Type", norm_types, index=norm_types.index('all'), key="norm_type_inspection")

    # Select the reference type
    ref_type = st.selectbox("Select Reference Type", ref_types, index=ref_types.index('orig'), key="ref_type_inspection")

    num_of_samples = st.slider("Select number of samples to display", 1, 100, 10)

    df_per_sample = df_per_sample_all[(df_per_sample_all["ref_type"] == ref_type) & (df_per_sample_all["norm_type"] == norm_type) & (df_per_sample_all["system"].isin(systems_selected))]
    # drop the dataset column
    # df_per_sample = df_per_sample.drop(columns=['dataset'])

    # show the refs and hyps with the worst WER per sample
    st.subheader("Samples with the worst WER per sample")
    df_per_sample_worst_wer = df_per_sample.sort_values(by='WER', ascending=False).head(num_of_samples)
    # use the full width of the screen to display the dataframe
    st.dataframe(df_per_sample_worst_wer, use_container_width=True)

    # TODO: add an "ALL" option as the concatenation of all datasets -
    # the functions are common, with differences only in the input TSV