// Result charts: a horizontal bar chart of per-model metrics plus a line graph of per-sub-task scores.
/** Parsed result records; stays undefined until the JSON payload has been loaded. */
let rawData;

/** Current bar-chart instance, or null before the first render. */
let chart = null;

/** Current line-graph instance, or null before the first render. */
let lineGraph = null;

/** Sorted rows most recently drawn in the bar chart (used to map a hovered bar to a model). */
let taskScores_save = null;

/** Task category currently displayed; a key of `taskSubtaskMapping` and of each record's `tasks`. */
let currentTask = 'Avg';

/**
 * Available sort metrics. Each value is also the DOM id of the button that
 * selects that metric.
 */
const sortby_options = {
  BY_REWARD_SCORE: "sort-by-reward-score",
  BY_SUCCESS_RATE: "sort-by-success-rate",
  BY_GROUNDING_ACC: "sort-by-grounding-acc",
};

/** Metric currently used for sorting the bars and for the line graph. */
let cur_sortby_option = sortby_options.BY_REWARD_SCORE;

/**
 * Sub-tasks shown on the line graph's x axis for each task category.
 * 'Avg' lists every sub-task, in the same order as the individual categories.
 */
const taskSubtaskMapping = {
  'Avg': ['AlfWorld', 'ScienceWorld', 'BabyAI', 'PDDL', 'Jericho', 'WebShop', 'WebArena', 'Tool-Query', 'Tool-Operation'],
  'Embodied': ['AlfWorld', 'ScienceWorld', 'BabyAI'],
  'Game': ['PDDL', 'Jericho'],
  'Web': ['WebShop', 'WebArena'],
  'Tools': ['Tool-Query', 'Tool-Operation'],
};

/** Abbreviated sub-task names used in the hover label when the 'Avg' view is active. */
const SubtaskNameMapping = {
  'AlfWorld': 'ALF',
  'ScienceWorld': 'SW',
  'BabyAI': 'BA',
  'PDDL': 'PL',
  'Jericho': 'JC',
  'WebShop': 'WS',
  'WebArena': 'WA',
  'Tool-Query': 'T-Q',
  'Tool-Operation': 'T-O',
};

/** Line colour per model name; filled by generateModelColorsAndStyles(). */
const modelColors = {};

/** Line border style (width + dash pattern) per model name; filled by generateModelColorsAndStyles(). */
const borderStyles = {};

/** Colour palette assigned to models in order; reused cyclically when there are more models than entries. */
const colors = [
  'rgba(255, 99, 132, 1)',
  'rgba(54, 162, 235, 1)',
  'rgba(255, 206, 86, 1)',
  'rgba(75, 192, 192, 1)',
  'rgba(153, 102, 255, 1)',
  'rgba(255, 159, 64, 1)',
  'rgba(199, 199, 199, 1)',
  'rgba(83, 102, 255, 1)',
  'rgba(40, 159, 64, 1)',
  'rgba(143, 162, 235, 1)',
  'rgba(255, 99, 75, 1)',
  'rgba(71 ,150 ,87, 1)',
  'rgba(210 ,102 ,95, 1)',
  'rgba(51 ,47 ,180, 1)',
];

/**
 * Line border styles assigned to models in order (reused cyclically).
 * Every style has the same width and differs only in its dash pattern, so the
 * list is generated from the dash patterns.
 */
const borders = [
  [],
  [5, 5],
  [10, 5],
  [2, 2],
  [8, 4],
  [5, 10],
  [15, 5],
  [5, 15],
  [10, 10],
  [11, 4],
  [10, 2],
  [4, 8],
  [3, 5],
  [5, 3],
].map(borderDash => ({
  borderWidth: 2,
  borderDash: borderDash,
}));

/**
 * Assigns each model a colour and a border style by position, wrapping around
 * the `colors` / `borders` palettes when there are more models than entries.
 * Results are written into the shared `modelColors` and `borderStyles` maps.
 *
 * @param {string[]} models - Model names, in the order they should receive palette entries.
 */
function generateModelColorsAndStyles(models) {
  models.forEach((model, position) => {
    modelColors[model] = colors[position % colors.length];
    borderStyles[model] = borders[position % borders.length];
  });
}

/**
 * Extracts the three numeric metrics of one task for every model.
 *
 * Records that have no entry for `task` are skipped (and reported on the
 * console). Metric values are run through parseFloat, so a missing metric
 * yields NaN.
 *
 * @param {Array<{model: string, tasks: Object}>} rawData - Loaded result records.
 * @param {string} task - Key to look up in each record's `tasks`.
 * @returns {Array<{model: string, score: number, accuracy: number, grounding: number}>}
 *   One row per model that has the task; empty when `rawData` is not an array.
 */
function getScoresForTask(rawData, task) {
  console.log("Current task:", task);

  // The data is loaded asynchronously; tolerate calls made before it arrives.
  if (!Array.isArray(rawData)) {
    return [];
  }

  const rows = [];
  for (const record of rawData) {
    const result = record && record.tasks ? record.tasks[task] : undefined;
    if (!result) {
      console.error("Task not found:", task);
      continue;
    }
    rows.push({
      model: record.model,
      score: parseFloat(result.score),
      accuracy: parseFloat(result.accuracy),
      grounding: parseFloat(result.grounding),
    });
  }
  return rows;
}

/** Index of the bar currently under the mouse, or null when no bar is hovered. */
let selectedModelIndexInBar = null;

/** Index of the line-graph dataset highlighted for the hovered bar (as returned by highlightModel). */
let selectedModelIndexInLine = null;

/** Bold 'Noto Sans' font descriptor shared by axis ticks, axis titles and legends. */
function boldNotoSansFont(size) {
  return {
    size: size,
    family: "'Noto Sans', sans-serif",
    weight: 'bold'
  };
}

/**
 * Maps the active sort option to the record field it reads and the y-axis
 * title of the line graph. Returns null for an unrecognised option.
 */
function describeActiveMetric() {
  switch (cur_sortby_option) {
    case sortby_options.BY_REWARD_SCORE:
      return { field: 'score', axisTitle: 'Progress Rate (%)' };
    case sortby_options.BY_SUCCESS_RATE:
      return { field: 'accuracy', axisTitle: 'Success Rate (%)' };
    case sortby_options.BY_GROUNDING_ACC:
      return { field: 'grounding', axisTitle: 'Grounding accuracy (%)' };
    default:
      return null;
  }
}

/** Builds the horizontal bar-chart configuration for already-sorted task rows. */
function buildBarChartConfig(taskScores) {
  return {
    plugins: [ChartDataLabels],
    type: 'bar',
    data: {
      labels: taskScores.map(item => item.model),
      datasets: [
        {
          label: 'Progress Rate',
          data: taskScores.map(item => item.score),
          backgroundColor: '#f398ae',
        },
        {
          label: 'Success Rate',
          data: taskScores.map(item => item.accuracy),
          backgroundColor: '#78b5f1',
        },
        {
          label: 'Grounding Accuracy',
          data: taskScores.map(item => item.grounding),
          backgroundColor: '#f3e276',
        }
      ]
    },
    options: {
      responsive: true,
      maintainAspectRatio: false,
      interaction: {
        mode: 'index',
      },
      indexAxis: 'y',
      scales: {
        x: {
          // NOTE(review): beginAtZero/min/max are kept under `ticks` exactly as
          // before; confirm against the Chart version in use whether they are
          // honoured there or are meant to be scale-level options.
          ticks: {
            beginAtZero: true,
            min: 0,
            max: 100,
            font: boldNotoSansFont(12)
          },
          grace: 10,
          title: {
            display: true,
            text: 'Value (%)',
            font: boldNotoSansFont(14)
          }
        },
        y: {
          ticks: {
            font: boldNotoSansFont(12)
          },
          title: {
            display: true,
            text: 'Model',
            font: boldNotoSansFont(14)
          }
        }
      },
      plugins: {
        legend: {
          display: true,
          labels: {
            usePointStyle: true,
            font: boldNotoSansFont(10)
          },
          align: 'center',
          position: 'bottom'
        },
        tooltip: {
          callbacks: {
            // "<dataset label>: <formatted value>"
            label: function (context) {
              let label = context.dataset.label || '';
              if (label) {
                label += ': ';
              }
              label += context.formattedValue;
              return label;
            }
          }
        },
        datalabels: {
          align: 'end',
          anchor: 'end',
          color: 'black',
          padding: 0,
          formatter: function (value) {
            return value.toFixed(2);
          },
          // Scale the value labels with the chart width, capped at 10.
          font: function (context) {
            const size = Math.min(Math.round(context.chart.width / 48), 10);
            return {
              size: size,
              family: "'Noto Sans', sans-serif",
            };
          }
        }
      }
    }
  };
}

/**
 * Builds one line-graph dataset per model for the given record field.
 * Sub-tasks a model has no entry for become null points.
 */
function buildLineDatasets(subTaskLabels, field) {
  return rawData.map(modelData => ({
    label: modelData.model,
    data: subTaskLabels.map(subtask => modelData.tasks[subtask] ?
      parseFloat(modelData.tasks[subtask][field]) : null),
    borderColor: modelColors[modelData.model] || '#4CAF50',
    fill: false,
    ...borderStyles[modelData.model]
  }));
}

/** Builds the line-graph configuration (sub-tasks on x, metric value 0-100 on y). */
function buildLineChartConfig(subTaskLabels, datasets, yAxisTitle) {
  return {
    type: 'line',
    data: {
      labels: subTaskLabels,
      datasets: datasets
    },
    options: {
      responsive: true,
      maintainAspectRatio: false,
      scales: {
        x: {
          ticks: {
            font: boldNotoSansFont(12)
          },
          title: {
            display: true,
            text: 'Sub-Task',
            font: boldNotoSansFont(14),
          },
        },
        y: {
          ticks: {
            font: boldNotoSansFont(12)
          },
          title: {
            display: true,
            text: yAxisTitle,
            font: boldNotoSansFont(14),
          },
          min: 0,
          max: 100,
        },
      },
      plugins: {
        legend: {
          display: true,
          labels: {
            usePointStyle: true,
            font: boldNotoSansFont(10),
          },
          align: 'center',
          position: 'bottom',
        },
        // Filled in by highlightModel() while a bar is hovered.
        annotation: {
          annotations: []
        }
      }
    }
  };
}

/**
 * mousemove handler for the bar-chart canvas: highlights the hovered model in
 * the line graph and clears the highlight once the mouse leaves every bar.
 * Always reads the current `chart` / `lineGraph`, so a single registration
 * keeps working across re-renders.
 */
function handleBarChartHover(event) {
  if (!chart || !lineGraph || !taskScores_save) {
    return;
  }
  const activePoints = chart.getElementsAtEventForMode(event, 'nearest', {intersect: true}, true);

  if (activePoints.length > 0) {
    const selectedIndex = activePoints[0].index;
    if (selectedIndex !== selectedModelIndexInBar) {
      selectedModelIndexInBar = selectedIndex;
      selectedModelIndexInLine = highlightModel(taskScores_save.map(item => item.model), selectedModelIndexInBar);
    }
  } else if (selectedModelIndexInBar !== null) {
    // Only redraw when there is actually a highlight to remove.
    selectedModelIndexInBar = null;
    removeHighlight();
  }
}

/**
 * (Re)renders both charts for `currentTask` using the metric selected by
 * `cur_sortby_option`:
 *  - the bar chart shows all three metrics per model, sorted by the selected one;
 *  - the line graph shows the selected metric per sub-task, one line per model.
 * Existing chart instances are destroyed first. Does nothing until `rawData`
 * has been loaded.
 */
function createMainResultChart() {
  if (!Array.isArray(rawData)) {
    // Buttons can be clicked before the data has finished loading.
    return;
  }

  const metric = describeActiveMetric();
  const taskScores = getScoresForTask(rawData, currentTask);
  if (metric) {
    taskScores.sort((a, b) => b[metric.field] - a[metric.field]);
  }
  taskScores_save = taskScores;

  if (chart) {
    chart.destroy();
  }
  const ctx = document.getElementById('chart-success-reward-rate');
  chart = new Chart(ctx, buildBarChartConfig(taskScores));

  // Hover state refers to the charts that were just replaced.
  selectedModelIndexInBar = null;
  selectedModelIndexInLine = null;

  if (lineGraph) {
    lineGraph.destroy();
    lineGraph = null;
  }
  const lineCanvas = document.getElementById('line-graph');
  if (!lineCanvas) {
    console.error("Canvas not found:", 'line-graph');
    return;
  }

  const subTaskLabels = taskSubtaskMapping[currentTask] || [];
  const datasets = metric ? buildLineDatasets(subTaskLabels, metric.field) : [];
  const yAxisTitle = metric ? metric.axisTitle : '';
  lineGraph = new Chart(lineCanvas.getContext('2d'), buildLineChartConfig(subTaskLabels, datasets, yAxisTitle));

  if (ctx) {
    // Registering the same function again is a no-op, so re-rendering does not
    // stack up additional listeners on the canvas.
    ctx.addEventListener('mousemove', handleBarChartHover);
  }
}

// Task filter buttons: the button id minus its 'filter-by-' prefix is the task
// key. Clicking one makes it the only active button and re-renders the charts.
document.querySelectorAll('.btn-group.task-filter-selector .btn').forEach(btn => {
  btn.addEventListener('click', () => {
    currentTask = btn.id.replace('filter-by-', '');

    document.querySelectorAll('.btn-group.task-filter-selector .btn.active').forEach(active => {
      active.classList.remove('active');
    });
    btn.classList.add('active');

    createMainResultChart();
  });
});

/**
 * Emphasises one model's line in the line graph (black, width 4), restores
 * every other line to its normal colour and width, attaches the hover label
 * for that model and redraws the graph.
 *
 * @param {string[]} labels - Model names in bar-chart order.
 * @param {number} index - Position in `labels` of the model to highlight.
 * @returns {number} Index of the highlighted dataset in the line graph, or -1
 *   when no dataset carries that model name (or no line graph exists).
 */
function highlightModel(labels, index) {
  let highlightedDatasetIndex = -1;
  if (!lineGraph) {
    return highlightedDatasetIndex;
  }
  const modelName = labels[index];

  lineGraph.data.datasets.forEach((dataset, datasetIndex) => {
    if (dataset.label === modelName) {
      dataset.borderColor = '#000000';
      dataset.borderWidth = 4;
      highlightedDatasetIndex = datasetIndex;
    } else {
      // Same fallback colour the datasets are created with.
      dataset.borderColor = modelColors[dataset.label] || '#4CAF50';
      dataset.borderWidth = 2;
    }
  });
  lineGraph.options.plugins.annotation.annotations = createAnnotations(modelName);
  lineGraph.update();

  return highlightedDatasetIndex;
}

/**
 * Restores every line in the line graph to its normal colour and width,
 * removes the hover label and redraws the graph. Does nothing when no line
 * graph exists.
 */
function removeHighlight() {
  if (!lineGraph) {
    return;
  }
  for (const dataset of lineGraph.data.datasets) {
    // Same fallback colour the datasets are created with.
    dataset.borderColor = modelColors[dataset.label] || '#4CAF50';
    dataset.borderWidth = 2;
  }
  lineGraph.options.plugins.annotation.annotations = [];
  lineGraph.update();
}

/**
 * Builds the hover label for one model: a text box listing the model's value
 * of the currently selected metric for every sub-task of `currentTask`.
 *
 * In the 'Avg' view the label uses abbreviated sub-task names and a
 * "<model>(%):" header; otherwise it uses full names, a header naming the
 * metric, and a '%' suffix on each value. The box is centred horizontally over
 * the sub-tasks and vertically between the model's lowest and highest value.
 *
 * Sub-tasks the model has no numeric value for are left out of the label.
 *
 * @param {string} modelName - Model to describe.
 * @returns {Object[]} An array holding the single label annotation, or an
 *   empty array when the model is unknown or has no values to show.
 */
function createAnnotations(modelName) {
  const subtasks = taskSubtaskMapping[currentTask] || [];
  const modelData = Array.isArray(rawData)
    ? rawData.find(data => data.model === modelName)
    : undefined;
  if (!modelData || !modelData.tasks) {
    return [];
  }

  // Success rate is the default metric, matching the default header text.
  let field = 'accuracy';
  let header = `${modelData.model} (success rate):\n`;
  if (cur_sortby_option === sortby_options.BY_REWARD_SCORE) {
    field = 'score';
    header = `${modelData.model} (progress rate):\n`;
  } else if (cur_sortby_option === sortby_options.BY_GROUNDING_ACC) {
    field = 'grounding';
    header = `${modelData.model} (grounding acc):\n`;
  }

  const isAverageView = currentTask === 'Avg';
  let content = isAverageView ? `${modelData.model}(%):\n` : header;
  let minYValue = 100;
  let maxYValue = 0;
  let listedSubtasks = 0;

  for (const subtask of subtasks) {
    const entry = modelData.tasks[subtask];
    const value = entry ? parseFloat(entry[field]) : NaN;
    if (!Number.isFinite(value)) {
      continue;
    }
    listedSubtasks += 1;

    minYValue = Math.min(minYValue, value);
    maxYValue = Math.max(maxYValue, value);
    if (maxYValue === 0) {
      // Keeps the label's vertical anchor off the very bottom of the axis
      // while only zero values have been seen.
      maxYValue = 0.6;
    }

    if (isAverageView) {
      content += `${SubtaskNameMapping[subtask] || subtask}: ${value.toFixed(1)}\n`;
    } else {
      content += `${subtask}: ${value.toFixed(1)}%\n`;
    }
  }

  if (listedSubtasks === 0) {
    return [];
  }

  return [{
    type: 'label',
    content: content,
    xValue: (subtasks.length - 1) / 2,
    yValue: (maxYValue + minYValue) / 2,
    backgroundColor: 'rgba(255,255,255,0.8)',
    font: {
      size: 9,
      weight: 'bold',
      color: 'black',
      family: "'Noto Sans', sans-serif"
    },
    xPadding: 6,
    yPadding: 6,
    borderColor: 'black',
    borderWidth: 1,
    borderRadius: 6,
    position: 'center',
    adjustScaleRange: true
  }];
}

/**
 * Zooms the line graph's y axis onto the value range of one dataset and
 * redraws the graph. A margin of 0.5 is added when the range exceeds 2,
 * otherwise 1.5; the lower bound never goes below 0.
 *
 * Leaves the axis untouched when the dataset does not exist or holds no
 * numeric values.
 *
 * @param {string[]} labels - Unused; kept for call compatibility.
 * @param {number} selectedIndex - Index of the dataset to zoom onto.
 */
function updateLineGraphScale(labels, selectedIndex) {
  const dataset = lineGraph.data.datasets[selectedIndex];
  if (!dataset) {
    return;
  }

  // Null points mark sub-tasks without data; they must not affect the range.
  const values = dataset.data.filter(value => value !== null && Number.isFinite(value));
  if (values.length === 0) {
    return;
  }

  const maxValue = Math.max(...values);
  const minValue = Math.min(...values);
  const buffer = (maxValue - minValue) > 2 ? 0.5 : 1.5;

  lineGraph.options.scales.y.min = Math.max(0, minValue - buffer);
  lineGraph.options.scales.y.max = maxValue + buffer;
  lineGraph.update();
}

/** Restores the line graph's y axis to the full 0-100 range and redraws the graph. */
function resetLineGraphScale() {
  const yAxis = lineGraph.options.scales.y;
  yAxis.min = 0;
  yAxis.max = 100;
  lineGraph.update();
}

// Sort-metric buttons: each option value is the id of its button. Clicking one
// makes it the only active metric button, selects that metric and re-renders
// the charts.
Object.values(sortby_options).forEach(sortby_option => {
  const btn = document.getElementById(sortby_option);
  if (!btn) {
    // A missing button must not abort the rest of the script.
    console.warn("Sort button not found:", sortby_option);
    return;
  }

  btn.addEventListener('click', () => {
    document.querySelectorAll('.btn-group.metric-filter-selector .btn.active').forEach(active => {
      active.classList.remove('active');
    });
    btn.classList.add('active');

    cur_sortby_option = sortby_option;
    createMainResultChart();
  });
});

// Once the DOM is ready, load the result data, assign each model its line
// colour/style and draw both charts. Load failures are reported on the console
// instead of surfacing as an unhandled promise rejection.
document.addEventListener('DOMContentLoaded', function () {
  fetch('/agentboard/data/To_Release/main_data_new.json')
    .then(response => {
      if (!response.ok) {
        throw new Error(`Failed to load result data: HTTP ${response.status}`);
      }
      return response.json();
    })
    .then((loadedData) => {
      rawData = loadedData;
      generateModelColorsAndStyles(rawData.map(data => data.model));
      createMainResultChart();
    })
    .catch(error => {
      console.error("Unable to render result charts:", error);
    });
});