Spaces:
Running
Running
| <html lang="en-US"> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <!-- Begin Jekyll SEO tag v2.8.0 --> | |
| <title>Attention Tracker | Attention Tracker: Detecting Prompt Injection Attacks in LLMs </title> | |
| <meta property="og:title" content="Gradient Cuff" /> | |
| <meta property="og:locale" content="en_US" /> | |
| <meta name="description" content="Detecting Prompt Injection Attacks in LLMs using attention" /> | |
| <meta property="og:description" content="Detecting Prompt Injection Attacks in LLMs using attention" /> | |
| <script type="application/ld+json"> | |
| {"@context":"https://schema.org","@type":"WebSite","description":"Detecting Jailbreak Attacks on Large Language Models by Exploring Refusal Loss Landscapes","headline":"Gradient Cuff","name":"Gradient Cuff","url":"https://huggingface.co/spaces/gregH/Gradient Cuff"}</script> | |
| <!-- End Jekyll SEO tag --> | |
| <!-- <link rel="preconnect" href="https://fonts.gstatic.com"> | |
| <link rel="preload" href="https://fonts.googleapis.com/css?family=Open+Sans:400,700&display=swap" as="style" type="text/css" crossorigin> | |
| <meta name="viewport" content="width=device-width, initial-scale=1"> | |
| <meta name="theme-color" content="#157878"> | |
| <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent"> --> | |
| <link rel="stylesheet" href="assets/css/bootstrap/bootstrap.min.css?v=90447f115a006bc45b738d9592069468b20e2551"> | |
| <link rel="stylesheet" href="assets/css/style.css?v=90447f115a006bc45b738d9592069468b20e2551"> | |
| <!-- start custom head snippets, customize with your own _includes/head-custom.html file --> | |
| <link rel="stylesheet" href="assets/css/custom_style.css?v=90447f115a006bc45b738d9592069468b20e2551"> | |
| <link rel="stylesheet" href="style.css"> | |
| <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script> | |
| <link rel="stylesheet" href="https://ajax.googleapis.com/ajax/libs/jqueryui/1.12.1/themes/smoothness/jquery-ui.css"> | |
| <script src="https://ajax.googleapis.com/ajax/libs/jqueryui/1.12.1/jquery-ui.min.js"></script> | |
| <script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/2.9.4/Chart.js"></script> | |
| <script src="assets/js/calibration.js?v=90447f115a006bc45b738d9592069468b20e2551"></script> | |
| <link rel="stylesheet" href="//code.jquery.com/ui/1.13.2/themes/base/jquery-ui.css"> | |
| <link rel="stylesheet" href="/resources/demos/style.css"> | |
| <script src="https://code.jquery.com/jquery-3.6.0.js"></script> | |
| <script src="https://code.jquery.com/ui/1.13.2/jquery-ui.js"></script> | |
| <!-- for mathjax support --> | |
| <script src="https://cdnjs.cloudflare.com/polyfill/v3/polyfill.min.js?features=es6"></script> | |
| <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script> | |
| <!-- end custom head snippets --> | |
| <!-- Font Awesome for PDF and GitHub icons --> | |
| <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.4/css/all.min.css"> | |
| <!-- AI2 HTML-CSS Icons (for arXiv) --> | |
| <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/academicons/1.9.1/css/academicons.min.css"> | |
| <script> | |
| let normalIndex = 0; | |
| let attackIndex = 0; | |
| function navigateImages(type, direction) { | |
| let images; | |
| let currentIndex; | |
| if (type === 'normal') { | |
| images = document.querySelectorAll('.normal-gallery .image-gallery img'); | |
| currentIndex = normalIndex; | |
| } else if (type === 'attack') { | |
| images = document.querySelectorAll('.attack-gallery .image-gallery img'); | |
| currentIndex = attackIndex; | |
| } | |
| if (images && images.length > 0) { | |
| // Remove the active class from the current image | |
| images[currentIndex].classList.remove('active'); | |
| // Update the current index based on direction and number of images | |
| currentIndex = (currentIndex + direction + images.length) % images.length; | |
| // Add the active class to the new image | |
| images[currentIndex].classList.add('active'); | |
| // Save the updated index | |
| if (type === 'normal') { | |
| normalIndex = currentIndex; | |
| } else if (type === 'attack') { | |
| attackIndex = currentIndex; | |
| } | |
| } else { | |
| console.error("No images found for type:", type); | |
| } | |
| } | |
| // Initialize the galleries by adding the active class to the first image | |
| document.addEventListener("DOMContentLoaded", () => { | |
| const normalImages = document.querySelectorAll('.normal-gallery .image-gallery img'); | |
| const attackImages = document.querySelectorAll('.attack-gallery .image-gallery img'); | |
| if (normalImages.length > 0) { | |
| normalImages[0].classList.add('active'); | |
| } | |
| if (attackImages.length > 0) { | |
| attackImages[0].classList.add('active'); | |
| } | |
| }); | |
| </script> | |
| </head> | |
| <body> | |
| <header class="page-header" role="banner"> | |
| <h1 class="project-name" style="font-weight: 500;">Attention Tracker</h1> | |
| <h2 class="project-tagline">Attention Tracker: Detecting Prompt Injection Attacks in LLMs</h2> | |
| <p /> | |
| <div style="text-align: center; font-size:larger; "> | |
| <div> | |
| <a href="https://khhung-906.github.io/" style="color: white;" target="_blank" rel="noopener noreferrer"> | |
| Kuo-Han Hung<sup>1,2</sup>, | |
| </a> | |
| <a href="https://ireneko.github.io/" style="color: white;" target="_blank" rel="noopener noreferrer"> | |
| Ching-Yun Ko<sup>1</sup>, | |
| </a> | |
| <a href="" style="color: white;" target="_blank" rel="noopener noreferrer"> | |
| Ambrish Rawat<sup>1</sup>, | |
| </a> | |
| </div> | |
| <div> | |
| <a href="" style="color: white;" target="_blank" rel="noopener noreferrer"> | |
| I-Hsin Chung<sup>1</sup>, | |
| </a> | |
| <a href="https://winstonhsu.info/" style="color: white;" target="_blank" rel="noopener noreferrer"> | |
| Winston H. Hsu<sup>2</sup>, | |
| </a> | |
| <a href="https://sites.google.com/site/pinyuchenpage/" style="color: white;" target="_blank" | |
| rel="noopener noreferrer"> | |
| Pin-Yu Chen<sup>1</sup> | |
| </a> | |
| </div> | |
| <div style="color: #f1f0f0"> | |
| <sup>1</sup>IBM Research <sup>2</sup>National Taiwan University | |
| </div> | |
| <div class="publication-links"> | |
| <span class="link-block"> | |
| <a href="https://arxiv.org/pdf/2411.00348.pdf" target="_blank" | |
| class="external-link button is-normal is-rounded is-dark"> | |
| <span class="icon"> | |
| <i class="fas fa-file-pdf"></i> | |
| </span> | |
| <span>Paper</span> | |
| </a> | |
| </span> | |
| <span class="link-block"> | |
| <a href="https://arxiv.org/abs/2411.00348" target="_blank" | |
| class="external-link button is-normal is-rounded is-dark"> | |
| <span class="icon"> | |
| <i class="ai ai-arxiv"></i> | |
| </span> | |
| <span>arXiv</span> | |
| </a> | |
| </span> | |
| <span class="link-block"> | |
| <a href="https://github.com/khhung-906/Attention-Tracker" target="_blank" | |
| class="external-link button is-normal is-rounded is-dark"> | |
| <span class="icon"> | |
| <i class="fab fa-github"></i> | |
| </span> | |
| <span>Code</span> | |
| </a> | |
| </span> | |
| <span class="link-block"> | |
| <a href="https://huggingface.co/spaces/pinyuchen/attention-tracker" target="_blank" | |
| class="external-link button is-normal is-rounded is-dark"> | |
| <span class="icon"> | |
| <i class="fas fa-laptop"></i> | |
| </span> | |
| <span>Demo</span> | |
| </a> | |
| </span> | |
| </div> | |
| </header> | |
| <main id="content" class="main-content" role="main"> | |
| <h2 id="abstract" class="section-title">Abstract</h2> | |
| <p>Large Language Models (LLMs) have revolutionized various domains but remain vulnerable to prompt injection | |
| attacks, where malicious inputs manipulate the model into ignoring original instructions and executing designated | |
| action. In this paper, we investigate | |
| the underlying mechanisms of these attacks by analyzing the attention patterns within LLMs. | |
| We introduce the concept of the <strong>distraction effect</strong>, where specific attention heads, termed | |
| important heads, shift focus from the original instruction to the injected instruction. Building on this | |
| discovery, we propose <strong>Attention | |
| Tracker</strong>, a training-free detection method that tracks attention patterns on instruction to detect | |
| prompt injection attacks without the need for additional LLM inference. Our method generalizes effectively across | |
| diverse models, datasets, | |
| and attack types, showing an AUROC improvement of up to 10.0% over existing methods, and performs well even on | |
| small LLMs. We | |
| demonstrate the robustness of our approach through extensive evaluations and provide insights into safeguarding | |
| LLM-integrated systems from prompt injection vulnerabilities. | |
| </p> | |
| <h2 id="what-is-jailbreak" class="section-title">What is Prompt Injection Attack?</h2> | |
| <p>A Prompt Injection Attack is a technique used to manipulate language models (like GPT-3 or similar AI systems) by | |
| injecting malicious or deceptive prompts into the input data, causing the model to behave in unexpected or | |
| undesired ways. This attack exploits the way language models interpret and respond to instructions, tricking them | |
| into providing information or performing actions that were not originally intended.</p> | |
| <div><img id="attack-intro" src="./figures/attack_intro.png" /></div> | |
| <h2 id="refusal-loss" class="section-title">Distraction Effect</h2> | |
| <p> | |
| In this section, we analyze the reasons behind the success of prompt injection attacks on LLMs. Specifically, we | |
| aim to understand | |
| <strong>what mechanism within LLMs causes them to "ignore" the original instruction and follow the injected | |
| instruction instead</strong>. | |
| To explore this, we examine the attention patterns of the last token in the input prompts, as it has the most | |
| direct influence on the LLMs' output. | |
| </p> | |
| <div class="container"> | |
| <div><img id="attn-map-img" src="./figures/attn_map.png" /></div> | |
| </div> | |
| <p> | |
| In the figure (a), we visualize the attention maps of the last token in the input prompt for normal and attack | |
| data. We observe that the attention maps for normal data are much darker than those for attacked data, | |
| particularly in the middle and earlier layers of the LLM. This indicates that the last token's attention to the | |
| instruction is significantly higher for normal data than for attack data in specific attention heads. When | |
| inputting attacked data, the attention shifts away from the original instruction towards the attack data, which we | |
| refer to as the <strong>distraction effect</strong>. | |
| Additionally, in the figure (b), we find that the attention focus shifts from the original instruction to the | |
| injected instruction in the attack data. This suggests that the separator string helps the attacker shift | |
| attention to the injected instruction, causing the LLM to perform the injected task instead of the target task. | |
| </p> | |
| </div> | |
| <h2 id="proposed-approach-attention-tracker" class="section-title">Proposed Approach: Attention Tracker</h2> | |
| <p> With the discover of distraction effect, we propose <strong>Attention Tracker</strong>, | |
| a prompt injection detection method based on tracking the attention pattern on instruction. Our detection | |
| procedure is shown below: | |
| </p> | |
| <div class="container"><img id="attention-tracker-header" src="./figures/main.png" /></div> | |
| <p></p> | |
| <p> | |
| Attention Tracker can be summarized into two phases: | |
| </p> | |
| <p> | |
| <strong>(Phase 1) Finding Important Heads:</strong> In the first step, we identify specific attention head that | |
| that exhibit the distraction effect, which we termed the important heads. To find the important heads, we use a | |
| set of LLM-generated sentences with the ignore attack as the dataset. | |
| </p> | |
| <p> | |
| <strong>(Phase 2) Prompt Injection Detection with Important Heads:</strong> In the second step, we feed the | |
| testing quries into the target LLM and aggregate the attention directed towards the instruction in the important | |
| heads. With this aggregated score which we call the <strong>focus score</strong>, we can effectively detect prompt | |
| injection attacks. | |
| </p> | |
| <p> | |
| We provide more details about the running flow of Attention Tracker in the paper. | |
| </p> | |
| <h2 id="result-attention-tracker" class="section-title">Experiment Result</h2> | |
| <p> | |
| In this section, we evaluate Attention Tracker against various baselines with the AUROC score on two prompt | |
| injection detection benchmarks: Open-Prompt-Injection and deepset prompt injection dataset: | |
| </p> | |
| <div class="container"><img id="attention-tracker-header" src="./figures/result.png" /></div> | |
| <p /> | |
| <p> | |
| As shown in the table, Attention Tracker consistently outperforms existing baselines, with an AUROC improvement of | |
| up to 3.1% on the Open-Prompt-Injection benchmark and 10.0% on the deepset prompt injection dataset. Among | |
| training-free methods, it achieves even greater gains, with an average AUROC improvement of 31.3% and 20.9% across | |
| the two datasets, respectively. Unlike LLM-based methods that rely on larger models for stability, Attention | |
| Tracker delivers robust and effective performance even with smaller LLMs, underscoring its suitability for | |
| real-world applications. | |
| </p> | |
| <h2 id="demo" class="section-title">Example</h2> | |
| <p> | |
| We evaluated the effectiveness of the Attention Tracker by visualizing the distribution of attention aggregation | |
| for key heads across different data types (normal data vs. attack data) in the Open-Prompt-Injection dataset. | |
| Additionally, we calculated the focus score for these data samples. A higher focus score indicates a lower | |
| likelihood of prompt injection attacks. The tested model is Qwen-2 1.8b. | |
| </p> | |
| <div class="group-title green">Normal Data</div> | |
| <div class="image-gallery-container normal-gallery"> | |
| <span class="arrow left-arrow" onclick="navigateImages('normal', -1)"><</span> | |
| <div class="image-gallery"> | |
| <!-- <img id="normalImage1" src="./demo_results/normal_1.png" alt="Normal Image 1"> --> | |
| <img id="normalImage2" src="./demo_results/normal_2.png" alt="Normal Image 2"> | |
| <img id="normalImage3" src="./demo_results/normal_3.png" alt="Normal Image 3"> | |
| <img id="normalImage4" src="./demo_results/normal_4.png" alt="Normal Image 4"> | |
| <img id="normalImage5" src="./demo_results/normal_5.png" alt="Normal Image 5"> | |
| </div> | |
| <span class="arrow right-arrow" onclick="navigateImages('normal', 1)">></span> | |
| </div> | |
| <div class="group-title red">Attack Data</div> | |
| <div class="image-gallery-container attack-gallery"> | |
| <span class="arrow left-arrow" onclick="navigateImages('attack', -1)"><</span> | |
| <div class="image-gallery"> | |
| <!-- <img id="attackImage1" src="./demo_results/attack_1.png" alt="Attack Image 1" class="active"> --> | |
| <img id="attackImage2" src="./demo_results/attack_2.png" alt="Attack Image 2"> | |
| <img id="attackImage3" src="./demo_results/attack_3.png" alt="Attack Image 3"> | |
| <img id="attackImage4" src="./demo_results/attack_4.png" alt="Attack Image 4"> | |
| <img id="attackImage5" src="./demo_results/attack_5.png" alt="Attack Image 5"> | |
| </div> | |
| <span class="arrow right-arrow" onclick="navigateImages('attack', 1)">></span> | |
| </div> | |
| <!-- <h2 id="inquiries" class="section-title"> Inquiries on Attention Tracker</h2> | |
| <p class="section-title"> Please contact <a href="Mailto:khhung906@gmail.com">Kuo-Han Hung</a> | |
| and <a href="Mailto:pin-yu.chen@ibm.com">Pin-Yu Chen</a> | |
| </p> --> | |
| <h2 id="citations" class="section-title">Citations</h2> | |
| <p>If you find Attention Tracker helpful and useful for your research, please cite our main paper as follows:</p> | |
| <div class="language-plaintext highlighter-rouge"> | |
| <div class="highlight"> | |
| <pre class="highlight"> | |
| <code>@misc{hung2024attentiontrackerdetectingprompt, | |
| title={Attention Tracker: Detecting Prompt Injection Attacks in LLMs}, | |
| author={Kuo-Han Hung and Ching-Yun Ko and Ambrish Rawat and I-Hsin Chung and Winston H. Hsu and Pin-Yu Chen}, | |
| year={2024}, | |
| eprint={2411.00348}, | |
| archivePrefix={arXiv}, | |
| primaryClass={cs.CR}, | |
| url={https://arxiv.org/abs/2411.00348}, | |
| }</code></pre> | |
| </div> | |
| </div> | |
| <footer class="site-footer"> | |
| <span class="site-footer-owner">This website is maintained by <a href="https://khhung-906.github.io/">Kuo-Han | |
| Hung</a></a>.</span> | |
| </footer> | |
| </main> | |
| </body> | |
| </html> |