|
|
<!DOCTYPE html> |
|
|
<html lang="en"> |
|
|
<head> |
|
|
<meta charset="utf-8"> |
|
|
<meta name="description" |
|
|
content="MolPuzzle: a multimodal benchmark for evaluating large language models on molecular structure elucidation"> |
|
|
<meta name="keywords" content="MolPuzzle, molecule puzzle, molecular structure elucidation, multimodal benchmark, LLM"> |
|
|
<meta name="viewport" content="width=device-width, initial-scale=1"> |
|
|
<body style="background-color: #ccd6d4"> |
|
|
<p align="center"> |
|
|
<img src="static/images/Puzzle_logo2.png" alt="logo" style="width:40%;"> |
|
|
</p> |
|
|
</body> |
|
|
<title> Can LLMs Solve Molecule Puzzles? A Multimodal Benchmark for Molecular Structure Elucidation</title> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<link rel="icon" href="static/images/logos/molecule.png"> |
|
|
|
|
|
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet"> |
|
|
|
|
|
<link rel="stylesheet" href="static/css/bulma.min.css"> |
|
|
<link rel="stylesheet" href="static/css/bulma-carousel.min.css"> |
|
|
<link rel="stylesheet" href="static/css/bulma-slider.min.css"> |
|
|
<link rel="stylesheet" href="static/css/fontawesome.all.min.css"> |
|
|
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css"> |
|
|
<link rel="stylesheet" href="static/css/index.css"> |
|
|
<link rel="stylesheet" href="static/css/leaderboard.css"> |
|
|
|
|
|
|
|
|
|
|
|
<script type="text/javascript" src="static/js/sort-table.js" defer></script> |
|
|
|
|
|
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script> |
|
|
<script defer src="static/js/fontawesome.all.min.js"></script> |
|
|
<script src="static/js/bulma-carousel.min.js"></script> |
|
|
<script src="static/js/bulma-slider.min.js"></script> |
|
|
<script src="static/js/explorer-index.js"></script> |
|
|
<script src="static/js/question_card.js"></script> |
|
|
|
|
|
<script src="static/js/leaderboard_testmini.js"></script> |
|
|
<script src="https://kehanguo2.github.io/Molpuzzle.io/data/results/output_folders.js" defer></script> |
|
|
<script src="https://kehanguo2.github.io/Molpuzzle.io/data/results/model_scores.js" defer></script> |
|
|
|
|
|
<script src="visualizer/data/data_public.js" defer></script> |
|
|
<style> |
|
|
table { |
|
|
width: 100%; |
|
|
border-collapse: collapse; |
|
|
margin: auto; |
|
|
} |
|
|
th, td { |
|
|
padding: 4px; |
|
|
border: 1px solid #ddd; |
|
|
font-size: 0.9em; |
|
|
} |
|
|
.highlight { |
|
|
background-color: #f0f0f0; |
|
|
color: #9932CC; |
|
|
} |
|
|
thead { |
|
|
background-color: #f2f2f2; |
|
|
} |
|
|
tr:nth-child(even) { |
|
|
background-color: #f9f9f9; |
|
|
} |
|
|
.checkmark { |
|
|
color: green; |
|
|
} |
|
|
.xmark { |
|
|
color: red; |
|
|
} |
|
|
.xcheck { |
|
|
color: lightskyblue; |
|
|
} |
|
|
caption { |
|
|
caption-side: top; |
|
|
text-align: center; |
|
|
font-weight: bold; |
|
|
margin-bottom: 8px; |
|
|
} |
|
|
.hero { |
|
|
position: relative; |
|
|
overflow: hidden; |
|
|
} |
|
|
|
|
|
.hero .background { |
|
|
position: absolute; |
|
|
top: 0; |
|
|
left: 0; |
|
|
width: 100%; |
|
|
height: 100%; |
|
|
background-image: url('static/images/background5.png'); |
|
|
background-size: cover; |
|
|
background-position: center; |
|
|
filter: blur(5px); |
|
|
z-index: 0; |
|
|
} |
|
|
|
|
|
.hero-body { |
|
|
position: relative; |
|
|
z-index: 1; |
|
|
|
|
|
} |
|
|
|
|
|
.video-container { |
|
|
max-width: 60%; |
|
|
margin: 0 auto; |
|
|
padding: 20px; |
|
|
background-color: #823e3e; |
|
|
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); |
|
|
border-radius: 10px; |
|
|
overflow: hidden; |
|
|
} |
|
|
.video-container iframe { |
|
|
width: 100%; |
|
|
height: 315px; |
|
|
border: none; |
|
|
} |
|
|
.video-title { |
|
|
text-align: center; |
|
|
font-size: 1.5em; |
|
|
margin-bottom: 15px; |
|
|
color: #333; |
|
|
} |
|
|
</style> |
|
|
</head> |
|
|
<body> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<section class="hero"> |
|
|
|
|
|
<div class="hero-body"> |
|
|
<div class="container is-max-desktop"> |
|
|
<div class="columns is-centered"> |
|
|
<div class="column has-text-centered"> |
|
|
|
|
|
|
|
|
|
|
|
<h2 class="subtitle is-3 publication-subtitle" style="color: rgb(255, 255, 255); font-weight: bold; text-shadow: 2px 2px 4px rgba(255,255,255, 0.5);"> |
|
|
<span style="color: rgb(0,0, 0); font-weight: bold;">Can LLMs Solve Molecule Puzzles? A Multimodal Benchmark for Molecular Structure Elucidation</span> |
|
|
</h2> |
|
|
|
|
|
<div class="is-size-5 publication-authors"> |
|
|
<span class="author-block"> |
|
|
<a href="https://kehanguo2.github.io/">Kehan Guo</a><sup style="color:#b53524;">1,*</sup>, |
|
|
</span> |
|
|
<span class="author-block"> |
|
|
<a href="https://scholar.google.com/citations?user=k1nZE2sAAAAJ&amp;hl=en">Bozhao Nan</a><sup style="color:#6fbf73;">2,*</sup>, |
|
|
</span> |
|
|
<span class="author-block"> |
|
|
<a href="https://yujunzhou.github.io/">Yujun Zhou</a><sup style="color:#b53524">1</sup>, |
|
|
</span> |
|
|
<span class="author-block"> |
|
|
<a href="https://taichengguo.github.io/">Taicheng Guo</a><sup style="color:#b53524">1</sup>, |
|
|
</span> |
|
|
<span class="author-block"> |
|
|
<a href="https://zguo.io/">Zhichun Guo</a><sup style="color:#b53524">1</sup>, |
|
|
</span> |
|
|
<span class="author-block"> |
|
|
<a href="https://ccas.nd.edu/people/mihir-surve/">Mihir Surve</a><sup style="color:#6fbf73;">2</sup>, |
|
|
</span> |
|
|
<span class="author-block"> |
|
|
<a href="https://zhenwen-nlp.github.io/">Zhenwen Liang</a><sup style="color:#b53524;;">1</sup>, |
|
|
</span> |
|
|
<span class="author-block"> |
|
|
<a href="https://niteshchawla.nd.edu/">Nitesh V. Chawla</a><sup style="color:#b53524;;">1</sup>, |
|
|
</span> |
|
|
<span class="author-block"> |
|
|
<a href="https://chemistry.nd.edu/people/olaf-wiest/">Olaf Wiest</a><sup style="color:#6fbf73;">2</sup>, |
|
|
</span> |
|
|
<span class="author-block"> |
|
|
<a href="https://engineering.nd.edu/faculty/xiangliang-zhang/">Xiangliang Zhang</a><sup style="color:#b53524;">1,††</sup> |
|
|
</span> |
|
|
</div> |
|
|
|
|
|
<div class="is-size-5 publication-authors"> |
|
|
<span class="author-block"><sup style="color:#b53524;">1</sup>Department of Computer Science and Engineering, University of Notre Dame,</span> |
|
|
<span class="author-block"><sup style="color:#6fbf73;">2</sup>Department of Chemistry and Biochemistry, University of Notre Dame</span> |
|
|
</div> |
|
|
|
|
|
<div> |
|
|
(* Equal contribution, †† Corresponding author) |
|
|
</div> |
|
|
|
|
|
<div class="column has-text-centered"> |
|
|
<div class="publication-links"> |
|
|
|
|
|
|
|
|
<span class="link-block"> |
|
|
|
|
|
<a href="https://kehanguo2.github.io/Molpuzzle.io/paper/NeurIPS24_MolPuzzle.pdf" class="external-link button is-normal is-rounded is-dark"> <span class="icon"> |
|
|
<i class="fas fa-file-powerpoint"></i> |
|
|
</span> |
|
|
<span>Slides</span> |
|
|
</a> |
|
|
</span> |
|
|
|
|
|
<span class="link-block"> |
|
|
<a href="https://github.com/KehanGuo2/MolPuzzle" |
|
|
class="external-link button is-normal is-rounded is-dark"> |
|
|
<span class="icon"> |
|
|
<i class="fab fa-github"></i> |
|
|
</span> |
|
|
<span>Code</span> |
|
|
</a> |
|
|
</span> |
|
|
|
|
|
<span class="link-block"> |
|
|
<a href="https://huggingface.co/datasets/kguo2/MolPuzzle_data" |
|
|
class="external-link button is-normal is-rounded is-dark"> |
|
|
<span class="icon"> |
|
|
<i class="fas fa-database"></i> |
|
|
</span> |
|
|
<span>Data</span> |
|
|
</a> |
|
|
</span> |
|
|
|
|
|
<span class="link-block"> |
|
|
|
|
|
<a href="https://kehanguo2.github.io/Molpuzzle.io/paper/SpectrumLLM__Arxiv_.pdf" class="external-link button is-normal is-rounded is-dark"> <span class="icon"> |
|
|
<i class="fas fa-file-pdf"></i> |
|
|
</span> |
|
|
<span>Paper</span> |
|
|
</a> |
|
|
</span> |
|
|
|
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
</section> |
|
|
|
|
|
<br> |
|
|
|
|
|
<br> |
|
|
<br> |
|
|
<br> |
|
|
<br> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<section class="section"> |
|
|
<div class="container" style="margin-top: -150px; margin-bottom: -100px;"> |
|
|
<div class="columns is-centered m-6"> |
|
|
<div class="column is-full has-text-centered content"> |
|
|
|
|
|
|
|
|
|
|
|
<p align="center"> |
|
|
<img src="static/images/intro_spectrum.png" alt="benchmark overview" style="width:80%;"> |
|
|
</p> |
|
|
|
|
|
<p>Comparison of molecular structure elucidation to solving a crossword puzzle. Just as crossword clues provide hints for fitting words into a grid, spectroscopic data such as NMR, IR, and mass spectrometry offer complementary clues about a molecule’s structure. Integrating these diverse clues leads to a complete and consistent picture of the molecule, similar to how words fit together in a puzzle.</p> |
|
|
|
|
|
|
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
</section> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<section class="section"> |
|
|
<div class="container" style="margin-bottom: 2vh;"> |
|
|
|
|
|
<div class="columns is-centered has-text-centered"> |
|
|
<div class="column is-four-fifths"> |
|
|
<h2 class="title is-3">Introduction</h2> |
|
|
<div class="content has-text-justified"> |
|
|
<p> Artificial intelligence (AI) is revolutionizing chemistry, with significant impacts on industrial chemical engineering, drug discovery, and education. Large language models (LLMs) have successfully addressed predictive tasks such as molecular property prediction, reaction prediction, and experiment automation. Here, we introduce <strong>molecular structure elucidation</strong>, a task that presents a new challenge for AI. <strong>This task requires integrating diverse spectroscopic data, iterative hypothesis testing, and deep chemical reasoning to determine a molecule’s structure</strong>. Much like solving a complex crossword puzzle, it involves piecing together clues to form a coherent solution. The Figure highlights this analogy, illustrating the similarities in strategy and complexity between molecular structure elucidation and solving a crossword puzzle.</p> |
|
|
|
|
|
<p> In this work, we present a novel approach to molecular structure elucidation, adapting the task for Large Language Models (LLMs) to explore their potential in chemical research. <strong>Our primary contribution is the introduction of the MolPuzzle dataset, comprising 234 complex structure elucidation challenges involving multimodal data like IR, MASS, H-NMR, and C-NMR spectra, as well as molecular formulas.</strong> Each instance requires LLMs to navigate three key sub-tasks: molecule understanding, spectrum interpretation, and molecule construction. </p> |
|
|
|
|
|
<p> We tested 11 state-of-the-art LLMs, including GPT-4o and Claude-3-opus, alongside human benchmarks. Key findings include: <strong>(1) GPT-4o outperforms other models but still underperforms compared to humans, with only 1.4% of its answers exactly matching the ground truth;</strong> <strong>(2) LLMs struggle particularly in spectrum interpretation and molecule construction.</strong></p> |
|
|
|
|
|
<p>In summary, our contributions are twofold: <strong>(1) A new reasoning challenge for the AI community focused on complex problem-solving in chemistry; and (2) New AI tools for the chemistry community, showcasing LLMs’ potential to accelerate molecular structure elucidation and inspire interdisciplinary collaboration.</strong></p> </div> |
|
|
</div> |
|
|
|
|
|
</div> |
</div> |
|
|
</section> |
|
|
|
|
|
|
|
|
<section class="section"> |
|
|
<div class="container" style="margin-bottom: 2vh;"> |
|
|
|
|
|
<div class="columns is-centered has-text-centered"> |
|
|
<div class="column is-four-fifths"> |
|
|
<h2 class="title is-3">Overview of the MolPuzzle Benchmark</h2> |
|
|
<div class="content has-text-justified"> |
|
|
<p>The MolPuzzle benchmark is designed to test the reasoning capabilities of Large Language Models (LLMs) in molecular structure elucidation tasks. This dataset contains 200 instances of molecular structure elucidation challenges, simulating real-world chemistry tasks. Each instance in MolPuzzle involves three interlinked sub-tasks:</p> |
|
|
<ul> |
|
|
<li><strong>Molecule Understanding:</strong> This stage evaluates the model’s ability to identify and understand basic molecular structures, starting from the molecular formula derived from mass spectrometry data. The dataset includes questions about the degree of saturation, aromatic rings, and functional groups, helping the model narrow down possible molecular structures.</li> |
|
|
<li><strong> Spectrum Interpretation:</strong> This stage involves analyzing multimodal data, including IR, MASS, 1H-NMR, and 13C-NMR spectra. These spectral images provide critical information about functional groups, molecular mass, and the arrangement of atoms. The dataset challenges models to integrate these clues and refine molecular hypotheses based on the spectral data.</li> |
|
|
<li><strong>Molecule Construction:</strong> In this final stage, the models attempt to assemble the molecule based on the information gathered from previous steps. This involves constructing a valid molecular structure that fits the constraints provided by the NMR data.</li> |
|
|
</ul> |
|
|
|
|
|
<p align="center"> |
|
|
<img src= "static/images/intro_spectrum(1).png" alt="benchmark overview" style="width:100%;"> |
|
|
</p> |
|
|
<p> In total, MolPuzzle includes 23,678 data examples collected across the three stages.</p> |
|
|
<p align="center"> |
|
|
<img src= "static/images/molpuzzle_stats2.png" alt="molpuzzle statistics" style="width:100%;"> |
</p> |
|
|
</div> |
|
|
</div> |
|
|
</div> |
</div> |
|
|
</section> |
|
|
|
|
|
|
|
|
|
|
|
<section class="section"> |
|
|
<div class="container" style="margin-bottom: 2vh;"> |
|
|
|
|
|
<div class="columns is-centered has-text-centered"> |
|
|
<div class="column is-four-fifths"> |
|
|
<h2 class="title is-3 mathvista">Experiment Results</h2> |
|
|
<h3 class="subtitle is-4 left-align">Addressing individual QA tasks in three stages</h3> |
|
|
<div class="content has-text-justified"> |
|
|
<p>We first evaluated a variety of LLMs on the individual tasks in each stage, including GPT-4o, GPT-3.5-turbo, Claude-3-opus, Gemini-pro, Llama-3-8B-Instruct, Vicuna-13B-v1.5, Mistral-7B-Instruct-v0.3, and in particular multimodal LLMs such as Gemini-pro-vision, LLava-Llama-3-8B, Qwen-VL-Chat, and InstructBlip-Vicuna-7B/13B.</p> |
|
|
<table border="1" cellspacing="0" cellpadding="5"> |
|
|
|
|
|
<thead> |
|
|
<tr> |
|
|
<th rowspan="2" style="text-align: center;">Method</th> |
|
|
<th colspan="4" style="text-align: center;">Stage 1 (Molecule Understanding) Tasks</th> |
|
|
</tr> |
|
|
<tr> |
|
|
<th>SI</th> |
|
|
<th>ARI</th> |
|
|
<th>FGI</th> |
|
|
<th>SDC</th> |
|
|
</tr> |
|
|
</thead> |
|
|
<tbody> |
|
|
<tr> |
|
|
<td>GPT-4o</td> |
|
|
<td><strong>1.00±0.000</strong></td> |
|
|
<td>0.943±0.016</td> |
|
|
<td>0.934±0.005</td> |
|
|
<td>0.667±0.003</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>GPT-3.5-turbo</td> |
|
|
<td>0.451±0.025</td> |
|
|
<td>0.816±0.017</td> |
|
|
<td>0.826±0.075</td> |
|
|
<td>0.5±0.099</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Claude-3-opus</td> |
|
|
<td>0.361±0.009</td> |
|
|
<td><strong>0.988±0.015</strong></td> |
|
|
<td><strong>0.934±0.001</strong></td> |
|
|
<td><strong>0.856±0.016</strong></td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Llama3</td> |
|
|
<td>0.228±0.043</td> |
|
|
<td>0.696±0.051</td> |
|
|
<td>0.521±0.003</td> |
|
|
<td>0.000±0.000</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Human</td> |
|
|
<td><strong>1.00±0.000</strong></td> |
|
|
<td><strong>1.000±0.000</strong></td> |
|
|
<td>0.890±0.259</td> |
|
|
<td>0.851±0.342</td> |
|
|
</tr> |
|
|
</tbody> |
|
|
</table> |
|
|
|
|
|
<table border="1" cellspacing="0" cellpadding="5"> |
|
|
<thead> |
|
|
<tr> |
|
|
<th rowspan="2">Method</th> |
|
|
<th colspan="4" style="text-align: center;">Stage 2 (Spectrum Interpretation) Tasks</th> |
|
|
</tr> |
|
|
<tr> |
|
|
<th>IR Interpretation</th> |
|
|
<th>MASS Interpretation</th> |
|
|
<th>H-NMR Interpretation</th> |
|
|
<th>C-NMR Interpretation</th> |
|
|
</tr> |
|
|
</thead> |
|
|
<tbody> |
|
|
<tr> |
|
|
<td>GPT-4o</td> |
|
|
<td><strong>0.656±0.052</strong></td> |
|
|
<td><strong>0.609±0.042</strong></td> |
|
|
<td><strong>0.618±0.026</strong></td> |
|
|
<td><strong>0.639±0.010</strong></td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>LLava</td> |
|
|
<td>0.256±0.026</td> |
|
|
<td>0.101±0.021</td> |
|
|
<td>0.118±0.008</td> |
|
|
<td>0.254±0.015</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Human</td> |
|
|
<td>0.753±0.221</td> |
|
|
<td>0.730±0.110</td> |
|
|
<td>0.764±0.169</td> |
|
|
<td>0.769±0.101</td> |
|
|
</tr> |
|
|
</tbody> |
|
|
</table> |
|
|
|
|
|
<table border="1" cellspacing="0" cellpadding="5"> |
|
|
<thead> |
|
|
<tr> |
|
|
<th rowspan="2" style="text-align: center;">Method</th> |
|
|
<th colspan="2" style="text-align: center;">Stage 3 (Molecule Construction) Tasks</th> |
|
|
</tr> |
|
|
<tr> |
|
|
<th style="text-align: center;">H-NMR Elucidation</th> |
|
|
<th style="text-align: center;">C-NMR Elucidation</th> |
|
|
</tr> |
|
|
</thead> |
|
|
<tbody> |
|
|
<tr> |
|
|
<td style="text-align: center;">GPT-4o</td> |
|
|
<td style="text-align: center;"><strong>0.524±0.021</strong></td> |
|
|
<td style="text-align: center;"><strong>0.506±0.037</strong></td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td style="text-align: center;">Llama3</td> |
|
|
<td style="text-align: center;">0.341±0.015</td> |
|
|
<td style="text-align: center;">0.352±0.017</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td style="text-align: center;">Human</td> |
|
|
<td style="text-align: center;">0.867±0.230</td> |
|
|
<td style="text-align: center;">0.730±0.220</td> |
|
|
</tr> |
|
|
</tbody> |
|
|
</table> |
|
|
<caption> |
|
|
<p><strong>Table 1: F1 scores (↑) of individual QA tasks in three stages. The best LLM results are in bold font.</strong><br> </p> |
|
|
|
|
|
<p>Tasks in stage 1 are SI: Saturation Identification, ARI: Aromatic Ring Identification, FGI: Functional Group Identification, and SDC: Saturation Degree Calculation.</p> |
|
|
</caption> |
|
|
</div> |
|
|
<h3 class="subtitle is-4 left-align">Addressing entire molecule puzzles</h3> |
|
|
<div class="content has-text-justified"> |
|
|
<p> For solving the entire molecule puzzles, the evaluation is limited to the three most advanced multimodal LLMs: GPT-4o, Claude-3-opus, and Gemini-pro, due to the involvement of spectrum image analysis in Stage 2. </p> |
|
|
<table border="1" cellspacing="0" cellpadding="5"> |
|
|
|
|
|
<thead> |
|
|
<tr> |
|
|
<th style="text-align: center;">Method</th> |
|
|
<th style="text-align: center;">Acc. (↑)</th> |
|
|
<th style="text-align: center;">Levenshtein (↓)</th> |
|
|
<th style="text-align: center;">Validity (↑)</th> |
|
|
<th style="text-align: center;">MACCS FTS (↑)</th> |
|
|
<th style="text-align: center;">RDK FTS (↑)</th> |
|
|
<th style="text-align: center;">Morgan FTS (↑)</th> |
|
|
</tr> |
|
|
</thead> |
|
|
<tbody> |
|
|
<tr> |
|
|
<td style="text-align: center;">GPT-4o</td> |
|
|
<td style="text-align: center;"><strong>0.014±0.004</strong></td> |
|
|
<td style="text-align: center;"><strong>11.653±0.013</strong></td> |
|
|
<td style="text-align: center;"><strong>1.000±0.000</strong></td> |
|
|
<td style="text-align: center;"><strong>0.431±0.009</strong></td> |
|
|
<td style="text-align: center;"><strong>0.293±0.013</strong></td> |
|
|
<td style="text-align: center;">0.232±0.007</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td style="text-align: center;">Claude-3-opus</td> |
|
|
<td style="text-align: center;">0.013±0.008</td> |
|
|
<td style="text-align: center;">12.680±0.086</td> |
|
|
<td style="text-align: center;"><strong>1.000±0.000</strong></td> |
|
|
<td style="text-align: center;">0.383±0.050</td> |
|
|
<td style="text-align: center;">0.264±0.040</td> |
|
|
<td style="text-align: center;">0.241±0.037</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td style="text-align: center;">Gemini-pro</td> |
|
|
<td style="text-align: center;">0.000±0.000</td> |
|
|
<td style="text-align: center;">12.711±0.196</td> |
|
|
<td style="text-align: center;"><strong>1.000±0.000</strong></td> |
|
|
<td style="text-align: center;">0.340±0.017</td> |
|
|
<td style="text-align: center;">0.208±0.002</td> |
|
|
<td style="text-align: center;">0.171±0.007</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td style="text-align: center;">Human</td> |
|
|
<td style="text-align: center;">0.667±0.447</td> |
|
|
<td style="text-align: center;">1.332±2.111</td> |
|
|
<td style="text-align: center;"><strong>1.000±0.000</strong></td> |
|
|
<td style="text-align: center;"><strong>0.985±0.022</strong></td> |
|
|
<td style="text-align: center;">0.795±0.317</td> |
|
|
<td style="text-align: center;">0.810±0.135</td> |
|
|
</tr> |
|
|
</tbody> |
|
|
</table> |
|
|
<caption><strong>Table 2: The performance of LLMs and human baseline in solving MolPuzzle. The best LLM results are in bold font. Acc. stands for the Accuracy of Exact Match.</strong></caption> |
|
|
|
|
|
|
|
|
</div> |
|
|
|
|
|
</div> |
</div> |
</div> |
|
|
</section> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<section class="section"> |
|
|
<div class="container" style="margin-bottom: 2vh;"> |
|
|
<div class="columns is-centered has-text-centered"> |
|
|
<div class="column is-four-fifths"> |
|
|
<h2 class="title is-3">Success and Failure Analysis</h2> |
|
|
|
|
|
<p align="center"> |
|
|
<img src= "static/images/molpuzzle_analysis.png" alt="success and failure analysis of GPT-4o on MolPuzzle" style="width:100%;"> |
|
|
</p> |
|
|
<p> Error in solving the molecule puzzle</p> |
|
|
<div class="content has-text-justified"> |
|
|
<p>The Figure presents case studies that illustrate the iterative steps involved in Stage 3, showcasing the most common errors made by GPT-4o: <strong>the accumulation of errors in iterative steps, which can lead to catastrophic failures.</strong> Note that this stage focuses on selecting the correct fragments and assembling them step by step to form the final molecular structure. We find that <strong>GPT-4o can initially succeed in picking the correct fragment when the structure is comparatively simple. However, as the process progresses, it does not select structures that satisfy all the requirements indicated by the NMR data.</strong></p> |
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
|
|
|
</section> |
|
|
|
|
|
|
|
|
<section class="section" id="BibTeX"> |
|
|
<div class="container is-max-desktop content"> |
|
|
<h2 class="title is-3 has-text-centered">BibTeX</h2> |
|
|
<pre><code>@inproceedings{guocan, |
|
|
title={Can LLMs Solve Molecule Puzzles? A Multimodal Benchmark for Molecular Structure Elucidation}, |
|
|
author={Guo, Kehan and Nan, Bozhao and Zhou, Yujun and Guo, Taicheng and Guo, Zhichun and Surve, Mihir and Liang, Zhenwen and Chawla, Nitesh V and Wiest, Olaf and Zhang, Xiangliang}, |
|
|
booktitle={The Thirty-eight Conference on Neural Information Processing Systems Datasets and Benchmarks Track}} |
|
|
</code></pre> |
|
|
</div> |
|
|
</section> |
|
|
|
|
|
<section> |
|
|
<div class="section" id="org-banners" style="display:flex"> |
|
|
<a href="https://www.nd.edu/" target="_blank" rel="noopener noreferrer" class="ext-link"> |
|
|
<img class="center-block org-banner" src="static/images/molpuzzle_fundlogo.png" alt="University of Notre Dame" style="width: 1200px; height: auto;"> |
|
|
</a> |
|
|
|
|
|
</div> |
|
|
</section> |
|
|
|
|
|
|
|
|
<footer class="footer"> |
|
|
|
|
|
<div class="content has-text-centered"> |
|
|
</div> |
|
|
<div class="columns is-centered"> |
|
|
<div class="column is-8"> |
|
|
<div class="content"> |
|
|
<p> |
|
|
This website is adapted from <a href="https://nerfies.github.io/">Nerfies</a>, licensed under a <a rel="license" |
|
|
href="http://creativecommons.org/licenses/by-sa/4.0/">Creative |
|
|
Commons Attribution-ShareAlike 4.0 International License</a>. |
|
|
</p> |
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
|
|
|
</footer> |
|
|
|
|
|
</body> |
|
|
</html> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|