Spaces:

PierrunoYT
/

audio-flamingo-3

Runtime error

App Files Files Community

PierrunoYT Amp commited on Aug 7

Commit

06a685c

0 Parent(s):

Initial commit

Browse files

Co-authored-by: Amp <amp@ampcode.com>
Amp-Thread-ID: https://ampcode.com/threads/T-702cb6b1-8290-4c2e-8522-1ed38473ca1f

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +57 -0
README.md +147 -0
app.py +306 -0
llava/__init__.py +8 -0
llava/cli/infer_audio.py +88 -0
llava/constants.py +55 -0
llava/conversation.py +197 -0
llava/data/__init__.py +9 -0
llava/data/base.py +95 -0
llava/data/builder.py +193 -0
llava/data/collate.py +166 -0
llava/data/dataset.py +1635 -0
llava/data/datasets_mixture.py +80 -0
llava/data/registry/datasets/audio_test.yaml +97 -0
llava/data/registry/datasets/default.yaml +5 -0
llava/data/registry/mixtures.yaml +78 -0
llava/entry.py +60 -0
llava/eval/__init__.py +15 -0
llava/eval/eval_audio_bench.py +117 -0
llava/eval/mmmu_utils/__pycache__/eval_utils.cpython-311.pyc +0 -0
llava/eval/mmmu_utils/eval_utils.py +61 -0
llava/eval/registry_audio.yaml +93 -0
llava/media.py +47 -0
llava/mm_utils.py +641 -0
llava/model/FloatPointQuantizeTorch.py +85 -0
llava/model/FloatPointQuantizeTriton.py +199 -0
llava/model/__init__.py +35 -0
llava/model/apply_delta.py +77 -0
llava/model/builder.py +161 -0
llava/model/coat/activation/__init__.py +6 -0
llava/model/coat/activation/fake_quantization/FloatPointQuantizeTorch.py +101 -0
llava/model/coat/activation/fake_quantization/FloatPointQuantizeTriton.py +181 -0
llava/model/coat/activation/fake_quantization/quantize_function.py +239 -0
llava/model/coat/activation/fake_quantization/utils.py +115 -0
llava/model/coat/activation/models/_fp8_quantization_config.py +67 -0
llava/model/coat/activation/models/_fp8_weightcache.py +48 -0
llava/model/coat/activation/models/_fp8manager.py +31 -0
llava/model/coat/activation/models/coat_llama.py +1479 -0
llava/model/coat/activation/models/coat_llama_convert_from_hf.py +71 -0
llava/model/coat/activation/models/coat_olmo.py +1942 -0
llava/model/coat/activation/real_quantization/__init__.py +31 -0
llava/model/coat/activation/real_quantization/_dequantize.py +162 -0
llava/model/coat/activation/real_quantization/_division.py +212 -0
llava/model/coat/activation/real_quantization/_division_transpose.py +215 -0
llava/model/coat/activation/real_quantization/_memory_io.py +180 -0
llava/model/coat/activation/real_quantization/_quantize.py +176 -0
llava/model/coat/activation/real_quantization/_quantize_pertensor.py +152 -0
llava/model/coat/activation/real_quantization/_quantize_pertensor_transpose.py +155 -0
llava/model/coat/activation/real_quantization/_transpose.py +121 -0
llava/model/coat/activation/real_quantization/add_bwd.py +205 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,57 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+llava/model/coat/optimizer/kernels/build/lib.linux-x86_64-cpython-310/qoptim_cuda.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+llava/model/coat/optimizer/kernels/build/temp.linux-x86_64-cpython-310/bindings.o filter=lfs diff=lfs merge=lfs -text
+llava/model/coat/optimizer/kernels/build/temp.linux-x86_64-cpython-310/fp8_adamw_cuda.o filter=lfs diff=lfs merge=lfs -text
+llava/model/coat/optimizer/kernels/build/temp.linux-x86_64-cpython-310/fp8_adamw_expand_cuda.o filter=lfs diff=lfs merge=lfs -text
+static/af3_main_diagram-1.png filter=lfs diff=lfs merge=lfs -text
+static/af3_radial-1.png filter=lfs diff=lfs merge=lfs -text
+static/af3_sota.png filter=lfs diff=lfs merge=lfs -text
+static/audio/audio2.wav filter=lfs diff=lfs merge=lfs -text
+static/chat/audio1.mp3 filter=lfs diff=lfs merge=lfs -text
+static/chat/audio2.mp3 filter=lfs diff=lfs merge=lfs -text
+static/emergent/audio1.wav filter=lfs diff=lfs merge=lfs -text
+static/logo-no-bg.png filter=lfs diff=lfs merge=lfs -text
+static/speech/339a1acd-afcb-466b-a7b1-8661e59b1e56.wav filter=lfs diff=lfs merge=lfs -text
+static/speech/audio3.wav filter=lfs diff=lfs merge=lfs -text
+static/speech/bcc6057d-0dda-435d-b956-a96ab27bc9e4.wav filter=lfs diff=lfs merge=lfs -text
+static/speech/be84d293-5e9c-4158-9a1e-b4dd1acb7d70.wav filter=lfs diff=lfs merge=lfs -text
+static/speech/fec3402e-7883-45c0-90d4-38647f615dc3.wav filter=lfs diff=lfs merge=lfs -text
+static/think/audio1.wav filter=lfs diff=lfs merge=lfs -text
+static/think/audio2.wav filter=lfs diff=lfs merge=lfs -text
+static/voice/voice_2.mp3 filter=lfs diff=lfs merge=lfs -text
+static/speech/speaker1.flac filter=lfs diff=lfs merge=lfs -text
+static/speech/videoplayback.wav filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,147 @@

+---
+license: other
+title: Audio Flamingo 3 Demo
+sdk: gradio
+emoji: 🚀
+colorFrom: green
+colorTo: green
+pinned: true
+short_description: Audio Flamingo 3 Demo
+---
+<div align="center" style="display: flex; justify-content: center; align-items: center; text-align: center;">
+  <a href="https://github.com/NVIDIA/audio-flamingo" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
+    <img src="static/logo-no-bg.png" alt="Audio Flamingo 3 🔥🚀🔥" width="120">
+  </a>
+</div>
+<div align="center" style="display: flex; justify-content: center; align-items: center; text-align: center;">
+    <h2>
+    Audio Flamingo 3: Advancing Audio Intelligence with Fully Open Large Audio-Language Models
+    </h2>
+</div>
+<div align="center" style="display: flex; justify-content: center; margin-top: 10px;">
+  <a href="https://arxiv.org/abs/2507.08128"><img src="https://img.shields.io/badge/arXiv-2507.08128-AD1C18" style="margin-right: 5px;"></a>
+  <a href="https://research.nvidia.com/labs/adlr/AF3/"><img src="https://img.shields.io/badge/Demo page-228B22" style="margin-right: 5px;"></a>
+  <a href="https://github.com/NVIDIA/audio-flamingo"><img src='https://img.shields.io/badge/Github-Audio Flamingo 3-9C276A' style="margin-right: 5px;"></a>
+  <a href="https://github.com/NVIDIA/audio-flamingo/stargazers"><img src="https://img.shields.io/github/stars/NVIDIA/audio-flamingo.svg?style=social"></a>
+</div>
+<div align="center" style="display: flex; justify-content: center; margin-top: 10px; flex-wrap: wrap; gap: 5px;">
+  <a href="https://huggingface.co/nvidia/audio-flamingo-3">
+    <img src="https://img.shields.io/badge/🤗-Checkpoints-ED5A22.svg">
+  </a>
+  <a href="https://huggingface.co/nvidia/audio-flamingo-3-chat">
+    <img src="https://img.shields.io/badge/🤗-Checkpoints (Chat)-ED5A22.svg">
+  </a>
+  <a href="https://huggingface.co/datasets/nvidia/AudioSkills">
+    <img src="https://img.shields.io/badge/🤗-Dataset: AudioSkills--XL-ED5A22.svg">
+  </a>
+  <a href="https://huggingface.co/datasets/nvidia/LongAudio">
+    <img src="https://img.shields.io/badge/🤗-Dataset: LongAudio--XL-ED5A22.svg">
+  </a>
+  <a href="https://huggingface.co/datasets/nvidia/AF-Chat">
+    <img src="https://img.shields.io/badge/🤗-Dataset: AF--Chat-ED5A22.svg">
+  </a>
+  <a href="https://huggingface.co/datasets/nvidia/AF-Think">
+    <img src="https://img.shields.io/badge/🤗-Dataset: AF--Think-ED5A22.svg">
+  </a>
+</div>
+<div align="center" style="display: flex; justify-content: center; margin-top: 10px;">
+<a href="https://huggingface.co/spaces/nvidia/audio_flamingo_3"><img src="https://img.shields.io/badge/🤗-Gradio Demo (7B)-5F9EA0.svg" style="margin-right: 5px;"></a>
+</div>
+## Overview
+This repo contains the PyTorch implementation of [Audio Flamingo 3: Advancing Audio Intelligence with Fully Open Large Audio-Language Models](https://arxiv.org/abs/2507.08128). Audio Flamingo 3 (AF3) is a fully open, state-of-the-art Large Audio-Language Model (LALM) that advances reasoning and understanding across speech, sounds, and music. AF3 builds on previous work with innovations in:
+- Unified audio representation learning (speech, sound, music)
+- Flexible, on-demand chain-of-thought reasoning (Thinking in Audio)
+- Long-context audio comprehension (including speech and up to 10 minutes)
+- Multi-turn, multi-audio conversational dialogue (AF3-Chat)
+- Voice-to-voice interaction (AF3-Chat)
+Extensive evaluations confirm AF3’s effectiveness, setting new benchmarks on over 20 public audio understanding and reasoning tasks.
+## Main Results
+Audio Flamingo 3 outperforms prior SOTA models including GAMA, Audio Flamingo, Audio Flamingo 2, Qwen-Audio, Qwen2-Audio, Qwen2.5-Omni, LTU, LTU-AS, SALMONN, AudioGPT, Gemini Flash v2 and Gemini Pro v1.5 on a number of understanding and reasoning benchmarks.
+<div align="center">
+  <img class="img-full" src="static/af3_radial-1.png" width="300">
+</div>
+<div align="center">
+  <img class="img-full" src="static/af3_sota.png" width="400">
+</div>
+## Audio Flamingo 3 Architecture
+Audio Flamingo 3 uses the AF-Whisper unified audio encoder, an MLP-based audio adaptor, a decoder-only LLM backbone (Qwen2.5-7B), and a streaming TTS module (AF3-Chat).
+Audio Flamingo 3 can take up to 10 minutes of audio inputs.
+<div align="center">
+  <img class="img-full" src="static/af3_main_diagram-1.png" width="800">
+</div>
+## Installation
+```bash
+./environment_setup.sh af3
+```
+## Code Structure
+- The folder ```audio_flamingo_3/``` contains the main training and inference code of Audio Flamingo 3.
+- The folder ```audio_flamingo_3/scripts``` contains the inference scripts of Audio Flamingo 3 in case you would like to use our pretrained checkpoints on HuggingFace.
+Each folder is self-contained and we expect no cross dependencies between these folders. This repo does not contain the code for the Streaming-TTS pipeline, which will be released in the near future.
+## Single Line Inference
+To run inference with the stage 3 model directly, run the command below:
+```bash
+python llava/cli/infer_audio.py --model-base /path/to/checkpoint/af3-7b --conv-mode auto --text "Please describe the audio in detail" --media static/audio1.wav
+```
+To run inference with the stage 3.5 model, run the command below:
+```bash
+python llava/cli/infer_audio.py --model-base /path/to/checkpoint/af3-7b --model-path /path/to/checkpoint/af3-7b/stage35 --conv-mode auto --text "Please describe the audio in detail" --media static/audio1.wav --peft-mode
+```
+## References
+The main training and inference code within each folder is modified from [NVILA](https://github.com/NVlabs/VILA/tree/main) [Apache license](incl_licenses/License_1.md).
+## License
+- The code in this repo is under [MIT license](incl_licenses/MIT_license.md).
+- The checkpoints are for non-commercial use only [NVIDIA OneWay Noncommercial License](incl_licenses/NVIDIA_OneWay_Noncommercial_License.docx). They are also subject to the [Qwen Research license](https://huggingface.co/Qwen/Qwen2.5-7B/blob/main/LICENSE), the [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and the original licenses accompanying each training dataset.
+- Notice: Audio Flamingo 3 is built with Qwen-2.5. Qwen is licensed under the Qwen RESEARCH LICENSE AGREEMENT, Copyright (c) Alibaba Cloud. All Rights Reserved.
+## Citation
+- Audio Flamingo 2
+```
+@article{ghosh2025audio,
+  title={Audio Flamingo 2: An Audio-Language Model with Long-Audio Understanding and Expert Reasoning Abilities},
+  author={Ghosh, Sreyan and Kong, Zhifeng and Kumar, Sonal and Sakshi, S and Kim, Jaehyeon and Ping, Wei and Valle, Rafael and Manocha, Dinesh and Catanzaro, Bryan},
+  journal={arXiv preprint arXiv:2503.03983},
+  year={2025}
+}
+```
+- Audio Flamingo
+```
+@inproceedings{kong2024audio,
+  title={Audio Flamingo: A Novel Audio Language Model with Few-Shot Learning and Dialogue Abilities},
+  author={Kong, Zhifeng and Goel, Arushi and Badlani, Rohan and Ping, Wei and Valle, Rafael and Catanzaro, Bryan},
+  booktitle={International Conference on Machine Learning},
+  pages={25125--25148},
+  year={2024},
+  organization={PMLR}
+}
+```

app.py ADDED Viewed

	@@ -0,0 +1,306 @@

+import gradio as gr
+import torch
+import llava
+from peft import PeftModel
+import os
+from huggingface_hub import snapshot_download
+import copy
+# ---------------------------------
+# SINGLE-TURN MODEL SETUP
+# ---------------------------------
+# Fetch the checkpoint snapshot from the Hugging Face Hub; the returned value
+# is used below as a local directory path.
+MODEL_BASE_SINGLE = snapshot_download(repo_id="nvidia/audio-flamingo-3")
+# The adapter for the "think" model is read from the 'stage35' subfolder.
+MODEL_BASE_THINK = os.path.join(MODEL_BASE_SINGLE, 'stage35')
+model_single = llava.load(MODEL_BASE_SINGLE, model_base=None)
+# Independent copy used by single_turn_infer. It is taken before model_single
+# is moved and wrapped by PEFT below, and it is not moved to `device` anywhere
+# in the code visible here.
+model_single_copy = copy.deepcopy(model_single)
+# Move the model to GPU
+# NOTE(review): torch.nn.Module.to_empty() is documented as moving parameters
+# and buffers WITHOUT copying their storage, i.e. the values on the target
+# device are uninitialized. If llava.load returned real weights, this discards
+# them for model_single (and hence model_think's base) -- confirm whether
+# `.to(device)` was intended.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model_single = model_single.to_empty(device=device)
+# Shared by both single_turn_infer and think_infer.
+generation_config_single = model_single.default_generation_config
+# Wrap the (moved) base model with the PEFT adapter stored under stage35.
+model_think = PeftModel.from_pretrained(
+    model_single,
+    MODEL_BASE_THINK,
+    device_map="auto",
+    torch_dtype=torch.float16,
+)
+model_think.to(device)
+# # ---------------------------------
+# # MULTI-TURN MODEL SETUP
+# # ---------------------------------
+# MODEL_BASE_MULTI = snapshot_download(repo_id="nvidia/audio-flamingo-3-chat")
+# model_multi = llava.load(MODEL_BASE_MULTI, model_base=None, devices=[0])
+# generation_config_multi = model_multi.default_generation_config
+# ---------------------------------
+# SINGLE-TURN INFERENCE FUNCTION
+# ---------------------------------
+def single_turn_infer(audio_file, prompt_text):
+    try:
+        sound = llava.Sound(audio_file)
+        full_prompt = f"<sound>\n{prompt_text}"
+        response = model_single_copy.generate_content([sound, full_prompt], generation_config=generation_config_single)
+        return response
+    except Exception as e:
+        return f"❌ Error: {str(e)}"
+# ---------------------------------
+# MULTI-TURN INFERENCE FUNCTION
+# ---------------------------------
+# def multi_turn_chat(user_input, audio_file, history, current_audio):
+#     try:
+#         if audio_file is not None:
+#             current_audio = audio_file  # Update state if a new file is uploaded
+#         if current_audio is None:
+#             return history + [("System", "❌ Please upload an audio file before chatting.")], history, current_audio
+#         sound = llava.Sound(current_audio)
+#         prompt = f"<sound>\n{user_input}"
+#         response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
+#         history.append((user_input, response))
+#         return history, history, current_audio
+#     except Exception as e:
+#         history.append((user_input, f"❌ Error: {str(e)}"))
+#         return history, history, current_audio
+def think_infer(audio_file, prompt_text):
+    try:
+        sound = llava.Sound(audio_file)
+        full_prompt = f"<sound>\n{prompt_text}"
+        response = model_think.generate_content([sound, full_prompt], generation_config=generation_config_single)
+        return response
+    except Exception as e:
+        return f"❌ Error: {str(e)}"
+# ---------------------------------
+# MULTI-TURN INFERENCE FUNCTION
+# ---------------------------------
+# def multi_turn_chat(user_input, audio_file, history, current_audio):
+#     try:
+#         if audio_file is not None:
+#             current_audio = audio_file  # Update state if a new file is uploaded
+#         if current_audio is None:
+#             return history + [("System", "❌ Please upload an audio file before chatting.")], history, current_audio
+#         sound = llava.Sound(current_audio)
+#         prompt = f"<sound>\n{user_input}"
+#         response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
+#         history.append((user_input, response))
+#         return history, history, current_audio
+#     except Exception as e:
+#         history.append((user_input, f"❌ Error: {str(e)}"))
+#         return history, history, current_audio
+# ---------------------------------
+# INTERFACE
+# ---------------------------------
+with gr.Blocks(css="""
+.gradio-container {
+    max-width: 100% !important;
+    width: 100% !important;
+    margin: 0 !important;
+    padding: 0 !important;
+}
+#component-0, .gr-block.gr-box {
+    width: 100% !important;
+}
+.gr-block.gr-box, .gr-column, .gr-row {
+    padding: 0 !important;
+    margin: 0 !important;
+}
+""") as demo:
+    with gr.Column():
+        gr.HTML("""
+<div align="center">
+  <img src="https://raw.githubusercontent.com/NVIDIA/audio-flamingo/audio_flamingo_3/static/logo-no-bg.png" alt="Audio Flamingo 3 Logo" width="120" style="margin-bottom: 10px;">
+  <h2><strong>Audio Flamingo 3</strong></h2>
+  <p><em>Advancing Audio Intelligence with Fully Open Large Audio-Language Models</em></p>
+</div>
+<div align="center" style="margin-top: 10px;">
+  <a href="https://arxiv.org/abs/2507.08128">
+    <img src="https://img.shields.io/badge/arXiv-2503.03983-AD1C18" alt="arXiv" style="display:inline;">
+  </a>
+  <a href="https://research.nvidia.com/labs/adlr/AF3/">
+    <img src="https://img.shields.io/badge/Demo%20page-228B22" alt="Demo Page" style="display:inline;">
+  </a>
+  <a href="https://github.com/NVIDIA/audio-flamingo">
+    <img src="https://img.shields.io/badge/Github-Audio_Flamingo_3-9C276A" alt="GitHub" style="display:inline;">
+  </a>
+  <a href="https://github.com/NVIDIA/audio-flamingo/stargazers">
+    <img src="https://img.shields.io/github/stars/NVIDIA/audio-flamingo.svg?style=social" alt="GitHub Stars" style="display:inline;">
+  </a>
+</div>
+<div align="center" style="display: flex; justify-content: center; margin-top: 10px; flex-wrap: wrap; gap: 5px;">
+  <a href="https://huggingface.co/nvidia/audio-flamingo-3">
+    <img src="https://img.shields.io/badge/🤗-Checkpoints-ED5A22.svg">
+  </a>
+  <a href="https://huggingface.co/nvidia/audio-flamingo-3-chat">
+    <img src="https://img.shields.io/badge/🤗-Checkpoints_(Chat)-ED5A22.svg">
+  </a>
+</div>
+<div align="center" style="display: flex; justify-content: center; margin-top: 10px; flex-wrap: wrap; gap: 5px;">
+  <a href="https://huggingface.co/datasets/nvidia/AudioSkills">
+    <img src="https://img.shields.io/badge/🤗-Dataset:_AudioSkills--XL-ED5A22.svg">
+  </a>
+  <a href="https://huggingface.co/datasets/nvidia/LongAudio">
+    <img src="https://img.shields.io/badge/🤗-Dataset:_LongAudio--XL-ED5A22.svg">
+  </a>
+  <a href="https://huggingface.co/datasets/nvidia/AF-Chat">
+    <img src="https://img.shields.io/badge/🤗-Dataset:_AF--Chat-ED5A22.svg">
+  </a>
+  <a href="https://huggingface.co/datasets/nvidia/AF-Think">
+    <img src="https://img.shields.io/badge/🤗-Dataset:_AF--Think-ED5A22.svg">
+  </a>
+</div>
+""")
+    # gr.Markdown("#### NVIDIA (2025)")
+    with gr.Tabs():
+        # ---------------- SINGLE-TURN ----------------
+        with gr.Tab("🎯 Single-Turn Inference"):
+            with gr.Row():
+                with gr.Column():
+                    audio_input_single = gr.Audio(type="filepath", label="Upload Audio")
+                    prompt_input_single = gr.Textbox(label="Prompt", placeholder="Ask a question about the audio...", lines=8)
+                    btn_single = gr.Button("Generate Answer")
+                    gr.Examples(
+                        examples=[
+                            ["static/emergent/audio1.wav", "What is surprising about the relationship between the barking and the music?"],
+                            ["static/audio/audio2.wav", "Please describe the audio in detail."],
+                            ["static/speech/audio3.wav", "Transcribe any speech you hear."],
+                        ],
+                        inputs=[audio_input_single, prompt_input_single],
+                        label="🧪 Try Examples"
+                    )
+                with gr.Column():
+                    output_single = gr.Textbox(label="Model Response", lines=15)
+            btn_single.click(fn=single_turn_infer, inputs=[audio_input_single, prompt_input_single], outputs=output_single)
+        with gr.Tab("🤔 Think / Long"):
+            with gr.Row():
+                with gr.Column():
+                    audio_input_think = gr.Audio(type="filepath", label="Upload Audio")
+                    prompt_input_think = gr.Textbox(label="Prompt", placeholder="To enable thinking, please add the text: '\nPlease think and reason about the input music before you respond.' to your prompt.", lines=8)
+                    btn_think = gr.Button("Generate Answer")
+                    gr.Examples(
+                        examples=[
+                            ["static/think/audio1.wav", "What are the two people doing in the audio Choose the correct option from the following options:\n(A) One person is demonstrating how to use the equipment\n(B) The two people are discussing how to use the equipment\n(C) The two people are disassembling the equipment\n(D) One person is teaching another person how to use a piece of equipment\n"],
+                            ["static/think/audio2.wav", "Is the boat in the video moving closer or further away? Choose the correct option from the following options:\n(A) Closer\n(B) Further\n"],
+                            ["static/speech/videoplayback.wav", "Generate a detailed caption for the input audio, describing all notable speech, sound, and musical events comprehensively. In the caption, transcribe all spoken content by all speakers in the audio precisely."],
+                            ["static/speech/speaker1.flac", "Transcribe any input speech in the input audio."],
+                        ],
+                        inputs=[audio_input_think, prompt_input_think],
+                        label="🧪 Try Examples"
+                    )
+                with gr.Column():
+                    output_think = gr.Textbox(label="Model Response", lines=30)
+            btn_think.click(fn=think_infer, inputs=[audio_input_think, prompt_input_think], outputs=output_think)
+        # ---------------- MULTI-TURN CHAT ----------------
+        with gr.Tab("💬 Multi-Turn Chat"):
+            # chatbot = gr.Chatbot(label="Audio Chatbot")
+            # audio_input_multi = gr.Audio(type="filepath", label="Upload or Replace Audio Context")
+            # user_input_multi = gr.Textbox(label="Your message", placeholder="Ask a question about the audio...", lines=8)
+            # btn_multi = gr.Button("Send")
+            # history_state = gr.State([])           # Chat history
+            # current_audio_state = gr.State(None)   # Most recent audio file path
+            # btn_multi.click(
+            #     fn=multi_turn_chat,
+            #     inputs=[user_input_multi, audio_input_multi, history_state, current_audio_state],
+            #     outputs=[chatbot, history_state, current_audio_state]
+            # )
+            # gr.Examples(
+            #     examples=[
+            #         ["static/chat/audio1.mp3", "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?"],
+            #         ["static/chat/audio2.mp3", "Switching gears, this one is super energetic and synthetic. If I wanted to remix the calming folk piece into something closer to this, what would you suggest?"],
+            #     ],
+            #     inputs=[audio_input_multi, user_input_multi],
+            #     label="🧪 Try Examples"
+            # )
+            # Add the link to another Gradio demo here
+            gr.Markdown("🔗 [Check out our other Gradio demo here](https://huggingface.co/spaces/nvidia/audio-flamingo-3-chat)")
+        with gr.Tab("🗣️ Speech Prompt"):
+            # gr.Markdown("Use your **voice** to talk to the model.")
+            # with gr.Row():
+            #     with gr.Column():
+            #         speech_input = gr.Audio(type="filepath", label="Speak or Upload Audio")
+            #         btn_speech = gr.Button("Submit")
+            #     gr.Examples(
+            #             examples=[
+            #                 ["static/voice/voice_0.mp3"],
+            #                 ["static/voice/voice_1.mp3"],
+            #                 ["static/voice/voice_2.mp3"],
+            #             ],
+            #             inputs=speech_input,
+            #             label="🧪 Try Examples"
+            #         )
+            #     with gr.Column():
+            #         response_box = gr.Textbox(label="Model Response", lines=15)
+            # btn_speech.click(fn=speech_prompt_infer, inputs=speech_input, outputs=response_box)
+            # Add the link to another Gradio demo here
+            gr.Markdown("🔗 [Check out our other Gradio demo here](https://huggingface.co/spaces/nvidia/audio-flamingo-3-chat)")
+        # ---------------- ABOUT ----------------
+        with gr.Tab("📄 About"):
+            gr.Markdown("""
+### 📚 Overview
+**Audio Flamingo 3** is a fully open state-of-the-art (SOTA) large audio-language model that advances reasoning and understanding across speech, sound, and music. AF3 introduces:
+(i) AF-Whisper, a unified audio encoder trained using a novel strategy for joint representation learning across all 3 modalities of speech, sound, and music;
+(ii) flexible, on-demand thinking, allowing the model to do chain-of-thought reasoning before answering;
+(iii) multi-turn, multi-audio chat;
+(iv) long audio understanding and reasoning (including speech) up to 10 minutes; and
+(v) voice-to-voice interaction.
+To enable these capabilities, we propose several large-scale training datasets curated using novel strategies, including AudioSkills-XL, LongAudio-XL, AF-Think, and AF-Chat, and train AF3 with a novel five-stage curriculum-based training strategy. Trained on only open-source audio data, AF3 achieves new SOTA results on over 20+ (long) audio understanding and reasoning benchmarks, surpassing both open-weight and closed-source models trained on much larger datasets.
+**Key Features:**
+💡 Audio Flamingo 3 has strong audio, music and speech understanding capabilities.
+💡 Audio Flamingo 3 supports on-demand thinking for chain-of-though reasoning.
+💡 Audio Flamingo 3 supports long audio and speech understanding for audios up to 10 minutes.
+💡 Audio Flamingo 3 can have multi-turn, multi-audio chat with users under complex context.
+💡 Audio Flamingo 3 has voice-to-voice conversation abilities.
+""")
+    gr.Markdown("© 2025 NVIDIA | Built with ❤️ using Gradio + PyTorch")
+# -----------------------
+# Launch App
+# -----------------------
+if __name__ == "__main__":
+    demo.launch(share=True)

llava/__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+from .entry import *
+from .media import *

llava/cli/infer_audio.py ADDED Viewed

	@@ -0,0 +1,88 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+import argparse
+import importlib.util
+import json
+import os
+from pydantic import BaseModel
+from termcolor import colored
+import llava
+from llava import conversation as clib
+from llava.media import Image, Video, Sound
+from llava.model.configuration_llava import JsonSchemaResponseFormat, ResponseFormat
+from peft import PeftModel
+import torch
+def get_schema_from_python_path(path: str) -> str:
+    schema_path = os.path.abspath(path)
+    spec = importlib.util.spec_from_file_location("schema_module", schema_path)
+    schema_module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(schema_module)
+    # Get the Main class from the loaded module
+    Main = schema_module.Main
+    assert issubclass(
+        Main, BaseModel
+    ), f"The provided python file {path} does not contain a class Main that describes a JSON schema"
+    return Main.schema_json()
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-base", "-mb", type=str, required=True)
+    parser.add_argument("--model-path", "-mp", type=str, required=True)
+    parser.add_argument("--conv-mode", "-c", type=str, default="auto")
+    parser.add_argument("--text", type=str)
+    parser.add_argument("--media", type=str, nargs="+")
+    parser.add_argument("--json-mode", action="store_true")
+    parser.add_argument("--peft-mode", action="store_true")
+    parser.add_argument("--json-schema", type=str, default=None)
+    args = parser.parse_args()
+    # Convert json mode to response format
+    if not args.json_mode:
+        response_format = None
+    elif args.json_schema is None:
+        response_format = ResponseFormat(type="json_object")
+    else:
+        schema_str = get_schema_from_python_path(args.json_schema)
+        print(schema_str)
+        response_format = ResponseFormat(type="json_schema", json_schema=JsonSchemaResponseFormat(schema=schema_str))
+    # Load model
+    model = llava.load(args.model_base)
+    if args.peft_mode:
+        model = PeftModel.from_pretrained(
+            model,
+            args.model_path,
+            device_map="auto",
+            torch_dtype=torch.float16,
+        )
+    # Set conversation mode
+    clib.default_conversation = clib.conv_templates[args.conv_mode].copy()
+    # Prepare multi-modal prompt
+    prompt = []
+    if args.media is not None:
+        for media in args.media or []:
+            if any(media.endswith(ext) for ext in [".wav",".mp3", ".flac"]):
+                media = Sound(media)
+            else:
+                raise ValueError(f"Unsupported media type: {media}")
+            prompt.append(media)
+    if args.text is not None:
+        prompt.append(args.text)
+    # Generate response
+    response = model.generate_content(prompt, response_format=response_format)
+    print(colored(response, "cyan", attrs=["bold"]))
+if __name__ == "__main__":
+    main()

llava/constants.py ADDED Viewed

	@@ -0,0 +1,55 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+# This file is modified from https://github.com/haotian-liu/LLaVA/
+# Heartbeat settings inherited from the upstream LLaVA serving code.
+# NOTE(review): units are presumably seconds -- confirm against the controller/worker code.
+CONTROLLER_HEART_BEAT_EXPIRATION = 30
+WORKER_HEART_BEAT_INTERVAL = 15
+# Directory used for log output; "." means the current working directory.
+LOGDIR = "."
+# Model Constants
+# Label value marking positions that carry no training target (padding, masked prompt tokens).
+IGNORE_INDEX = -100
+# Placeholder strings that mark where a sound / speech clip sits inside the text.
+DEFAULT_SOUND_TOKEN = "<sound>"
+DEFAULT_SPEECH_TOKEN = "<speech>"
+SENTINEL_TOKEN = "<vila/sentinel>"
+# Maps each media kind to its placeholder token string.
+MEDIA_TOKENS = {
+    "speech": "<speech>",
+    "sound": "<sound>",
+}
+# The bare string below is a reference listing of token ids and their AddedToken
+# definitions; it is not assigned to anything and has no runtime effect.
+"""
+151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+151646: AddedToken("[BOS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+151647: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+151648: AddedToken("<vila/sentinel>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+151649: AddedToken("<image>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+151650: AddedToken("<vila/video>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+151651: AddedToken("<sound>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+151652: AddedToken("<speech>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+"""
+# Matches the number of entries in the listing above (ids 151643 through 151652).
+NUM_EXTRA_TOKENS = 10

llava/conversation.py ADDED Viewed

	@@ -0,0 +1,197 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+# This file is modified from https://github.com/haotian-liu/LLaVA/
+import dataclasses
+from enum import Enum, auto
+from typing import List
+from llava.utils.logging import logger
+class SeparatorStyle(Enum):
+    """Different separator style."""
+    AUTO = auto()
+    TWO = auto()
+    MPT = auto()
+    PLAIN = auto()
+    LLAMA_3 = auto()
+@dataclasses.dataclass
+class Conversation:
+    """A class that keeps all conversation history."""
+    system: str
+    roles: List[str]
+    messages: List[List[str]]
+    sep_style: SeparatorStyle = SeparatorStyle.AUTO
+    sep: str = "###"
+    sep2: str = None
+    version: str = "Unknown"
+    def get_prompt(self):
+        messages = self.messages
+        if len(messages) > 0 and type(messages[0][1]) is tuple:
+            messages = self.messages.copy()
+            init_role, init_msg = messages[0].copy()
+            init_msg = init_msg[0].replace("<image>", "").strip()
+            messages[0] = (init_role, "<image>\n" + init_msg)
+        if self.sep_style == SeparatorStyle.TWO:
+            seps = [self.sep, self.sep2]
+            ret = self.system + seps[0]
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+        elif self.sep_style == SeparatorStyle.LLAMA_3:
+            ret = self.system + self.sep
+            for rid, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message = message[0]
+                    sep = self.sep if rid < len(messages) - 1 else self.sep2
+                    ret += role + message + sep
+                else:
+                    ret += role
+        elif self.sep_style == SeparatorStyle.MPT:
+            ret = self.system + self.sep
+            for role, message in messages:
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + message + self.sep
+                else:
+                    ret += role
+        elif self.sep_style == SeparatorStyle.PLAIN:
+            seps = [self.sep, self.sep2]
+            ret = self.system
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += message + seps[i % 2]
+                else:
+                    ret += ""
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+        return ret
+    def append_message(self, role, message):
+        self.messages.append([role, message])
+    def copy(self):
+        return Conversation(
+            system=self.system,
+            roles=self.roles,
+            messages=[[x, y] for x, y in self.messages],
+            sep_style=self.sep_style,
+            sep=self.sep,
+            sep2=self.sep2,
+            version=self.version,
+        )
+# Template with empty system text and roles. Its AUTO style is not one of the
+# styles get_prompt() renders, so the prompt is presumably built elsewhere
+# (e.g. by the tokenizer) -- TODO confirm.
+conv_auto = Conversation(
+    system="",
+    roles=("", ""),
+    messages=(),
+    sep_style=SeparatorStyle.AUTO,
+    sep="\n",
+)
+# Vicuna v1 template: "USER"/"ASSISTANT" roles, alternating " " and "</s>" separators.
+conv_vicuna_v1 = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+# Plain template: no system text and no role names.
+# NOTE(review): sep2 is left at its default (None) although the PLAIN branch of
+# get_prompt() appends sep2 after every second filled turn -- confirm this
+# template is never rendered with more than one filled turn.
+conv_llava_plain = Conversation(
+    system="",
+    roles=("", ""),
+    messages=(),
+    sep_style=SeparatorStyle.PLAIN,
+    sep="\n",
+)
+# ChatML-style template using <|im_start|> / <|im_end|> markers.
+hermes_2 = Conversation(
+    system="<|im_start|>system\nAnswer the questions.",
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+    messages=(),
+    version="hermes-2",
+)
+# Template added by Yukang. Note (kentang-mit@): sep is <|eot_id|> for official template.
+llama_3_chat = Conversation(
+    system="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. "
+    "You are able to understand the visual content that the user provides, "
+    "and assist the user with a variety of tasks using natural language.",
+    roles=("<|start_header_id|>user<|end_header_id|>\n\n", "<|start_header_id|>assistant<|end_header_id|>\n\n"),
+    version="llama_v3",
+    messages=(),
+    sep_style=SeparatorStyle.LLAMA_3,
+    sep="<|eot_id|>",
+    sep2="<|end_of_text|>",
+)
+# Module-wide active template; auto_set_conversation_mode() rebinds it.
+default_conversation = conv_auto
+# Lookup from conversation-mode name to template ("v1" and "vicuna_v1" are aliases).
+conv_templates = {
+    "auto": conv_auto,
+    "hermes-2": hermes_2,
+    "llama_3": llama_3_chat,
+    "v1": conv_vicuna_v1,
+    "vicuna_v1": conv_vicuna_v1,
+    "plain": conv_llava_plain,
+}
+# Substrings searched for in a lower-cased model name/path, mapped to a key of
+# conv_templates. Entries are tried in this order and the first match wins.
+CONVERSATION_MODE_MAPPING = {
+    "vila1.5-3b": "vicuna_v1",
+    "vila1.5-8b": "llama_3",
+    "vila1.5-13b": "vicuna_v1",
+    "vila1.5-40b": "hermes-2",
+    "llama-3": "llama_3",
+    "llama3": "llama_3",
+}
+def auto_set_conversation_mode(model_name_or_path: str) -> str:
+    global default_conversation
+    for k, v in CONVERSATION_MODE_MAPPING.items():
+        if k in model_name_or_path.lower():
+            logger.info(f"Setting conversation mode to `{v}` based on model name/path `{model_name_or_path}`.")
+            default_conversation = conv_templates[v]
+            return

llava/data/__init__.py ADDED Viewed

	@@ -0,0 +1,9 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+from .builder import *
+from .dataset import *
+from .datasets_mixture import *

llava/data/base.py ADDED Viewed

	@@ -0,0 +1,95 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+import random
+from typing import Any, Dict, List
+import torch
+from torch.utils.data import Dataset
+from transformers import PreTrainedTokenizer
+from llava.mm_utils import dynamic_process_images_and_prompt, dynamic_s2_process_images_and_prompt, process_images
+from llava.train.args import DataArguments
+from llava.utils.logging import logger
+from llava.utils.media import extract_media
+from llava.utils.tokenizer import preprocess_conversation
+__all__ = ["BaseDataset"]
+def _process_speech(speech: List[Any], data_args: DataArguments) -> torch.Tensor:
+    """Convert the extracted speech entries into a single tensor via `torch.tensor`.
+    `data_args` is accepted for signature parity with the other processors but is not used.
+    """
+    return torch.tensor(speech)
+def _process_sound(sound: List[Any], data_args: DataArguments) -> torch.Tensor:
+    """Convert the extracted sound entries into a single tensor via `torch.tensor`.
+    `data_args` is accepted for signature parity with the other processors but is not used.
+    """
+    return torch.tensor(sound)
+def _process_sound_masks(sound_masks: List[Any], data_args: DataArguments) -> torch.Tensor:
+    """Convert the extracted sound masks into a single tensor via `torch.tensor`.
+    `data_args` is accepted for signature parity with the other processors but is not used.
+    """
+    return torch.tensor(sound_masks)
+class BaseDataset(Dataset):
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        data_args: DataArguments,
+        no_system_prompt: bool = False,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__()
+        self.tokenizer = tokenizer
+        self.data_args = data_args
+        self.no_system_prompt = no_system_prompt
+        self.instances = []
+        self.enable_dynamic_res = False
+        self.enable_dynamic_res_s2 = False
+        # global_batch_size: int,
+        self.global_batch_size = kwargs.get("global_batch_size", 1)
+        # by default, dataset cls will resample on failure
+        self.resample_on_failure = kwargs.get("resample_on_failure", True)
+        # by default, dataset cls will resample on failure
+        self.resample_on_failure = kwargs.get("resample_on_failure", True)
+    def process(self, instance: Dict[str, Any]) -> List[Dict[str, Any]]:
+        raise NotImplementedError
+    def __getitem__(self, index: int) -> Dict[str, Any]:
+        instance = self.instances[index]
+        try:
+            # Process instance to conversation
+            conversation = self.process(instance)
+            # Extract media from conversation
+            media, media_meta = extract_media(conversation, self.data_args)
+            if "speech" in media:
+                processed_speech = _process_speech(media["speech"], self.data_args)
+            if "sound" in media:
+                processed_sound = _process_sound(media["sound"], self.data_args)
+                processed_sound_feature_masks = _process_sound_masks(media_meta["sound_feature_masks"], self.data_args)
+                processed_sound_embed_masks = _process_sound_masks(media_meta["sound_embed_masks"], self.data_args)
+            # Prepare "input_ids" and "labels" for training
+            data = preprocess_conversation(conversation, self.tokenizer, no_system_prompt=self.no_system_prompt)
+            if "speech" in media:
+                data["speech"] = processed_speech
+            if "sound" in media:
+                data["sound"] = processed_sound
+                data["sound_feature_masks"] = processed_sound_feature_masks
+                data["sound_embed_masks"] = processed_sound_embed_masks
+        except Exception as e:
+            if not self.resample_on_failure:
+                raise e
+            else:
+                logger.exception(f"Error processing instance '{instance}': '{e}'. Resampling.")
+                return self.__getitem__(random.randint(0, len(self.instances) - 1))
+        return data
+    def __len__(self) -> int:
+        return len(self.instances)

llava/data/builder.py ADDED Viewed

	@@ -0,0 +1,193 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+import os
+import os.path as osp
+from itertools import chain
+from typing import Any, List, Optional
+import torch
+import torch.distributed as dist
+from hydra.utils import instantiate
+from torch.utils.data import ConcatDataset, Dataset
+from transformers import PreTrainedTokenizer
+from llava.data.datasets_mixture import DATASETS_LEGACY
+from llava.train.args import DataArguments, TrainingArguments
+from llava.utils import io
+from llava.utils.logging import logger
+import time
+import numpy as np
+__all__ = ["DATASETS", "MIXTURES", "register_datasets", "register_mixtures", "parse_mixture", "build_dataset"]
+def load_dataset_yaml(name):
+    fname = f"{name}.yaml" if not name.endswith(".yaml") else name
+    # yaml under llava/data/registry/datasets
+    repo_path = osp.join(osp.dirname(__file__), "registry", "datasets", fname)
+    if osp.exists(repo_path):
+        return repo_path
+    # # yaml under <fs yaml path>
+    abs_path = osp.expanduser(fname)
+    if osp.exists(abs_path):
+        return abs_path
+    raise FileNotFoundError(f"Dataset '{name}' is not found in the {repo_path} or {abs_path}.")
+def register_datasets(name: Optional[str] = None):
+    if name is None:
+        name = os.environ.get("VILA_DATASETS", "default")
+        logger.info(f"Registering datasets from environment: '{name}'.")
+    # return io.load(osp.join(osp.dirname(__file__), "registry", "datasets", f"{name}.yaml"))
+    dataset_meta = {}
+    for _name in name.split(","):
+        yamlpath = load_dataset_yaml(_name)
+        logger.info(f"Registering datasets from: '{yamlpath}'.")
+        meta = io.load(yamlpath)
+        dataset_meta.update(meta)
+    return dataset_meta
+def register_mixtures():
+    return io.load(os.path.join(os.path.dirname(__file__), "registry", "mixtures.yaml"))
+# Registries are loaded once, at import time. DATASETS honours the
+# VILA_DATASETS environment variable (see register_datasets).
+DATASETS = register_datasets()
+MIXTURES = register_mixtures()
+def parse_mixture(mixture: str) -> List[str]:
+    names = mixture.split("+") if "+" in mixture else [mixture]
+    while any(name in MIXTURES for name in names):
+        names = list(chain(*[MIXTURES.get(name, [name]) for name in names]))
+    return sorted(names)
+class SubsetDataset(Dataset):
+    def __init__(self, dataset: Dataset, limit: int) -> None:
+        super().__init__()
+        self.dataset = dataset
+        self.limit = limit
+    def __len__(self) -> int:
+        return int(len(self.dataset) * self.limit)
+    def __getitem__(self, index: int) -> Any:
+        return self.dataset[index % len(self.dataset)]
+class RepeatedDataset(Dataset):
+    def __init__(self, dataset: Dataset, times: int) -> None:
+        super().__init__()
+        self.dataset = dataset
+        self.times = times
+    def __len__(self) -> int:
+        return len(self.dataset) * self.times
+    def __getitem__(self, index: int) -> Any:
+        return self.dataset[index % len(self.dataset)]
+def get_world_size():
+    if torch.distributed.is_initialized():
+        return torch.distributed.get_world_size()
+    else:
+        return 1
+def build_dataset(
+    mixture: str,
+    data_args: DataArguments,
+    training_args: TrainingArguments,
+    tokenizer: PreTrainedTokenizer,
+) -> Dataset:
+    logger.warning(f"Training VILA with mixture '{mixture}'.")
+    datasets = []
+    dataset_rng = np.random.default_rng(1234)
+    for name in parse_mixture(mixture):
+        if "*" in name:
+            name, times = name.split("*")
+            times = int(times)
+        else:
+            times = 1
+        limit_dataset = False
+        if "#" in name:
+            # we limit the max length of this dataset
+            name, max_length_percent = name.split("#")
+            limit_dataset = True
+        if DATASETS is not None and name in DATASETS:
+            if name in DATASETS_LEGACY:
+                logger.warning(f"Dataset '{name}' exists in both new and legacy registries. Using the new one.")
+            dataset = instantiate(DATASETS[name], _partial_=True)(
+                tokenizer=tokenizer,
+                data_args=data_args,
+                global_batch_size=(
+                    training_args.per_device_train_batch_size
+                    # * torch.distributed.get_world_size()
+                    * get_world_size()
+                    * training_args.gradient_accumulation_steps
+                ),
+            )
+        elif name in DATASETS_LEGACY:
+            logger.warning(f"Dataset '{name}' is from the legacy registry. Please consider migrating it.")
+            dataset = build_dataset_legacy(
+                name,
+                data_args=data_args,
+                training_args=training_args,
+                tokenizer=tokenizer,
+            )
+        else:
+            raise ValueError(f"Dataset '{name}' is not found in the registries.")
+        if limit_dataset:
+            # we limit the max length of this dataset
+            max_length = int(float(int(max_length_percent) / 100.) * len(dataset))
+            dataset = SubsetDataset(dataset, float(int(max_length_percent) / 100.))
+        if times > 1:
+            dataset = RepeatedDataset(dataset, times)
+        datasets.append(dataset)
+    return ConcatDataset(datasets)
+def build_dataset_legacy(
+    name: str,
+    data_args: DataArguments,
+    training_args: TrainingArguments,
+    tokenizer: PreTrainedTokenizer,
+) -> Dataset:
+    from llava.data.dataset import (
+        LazySupervisedDataset,
+        LazyWDSDataset,
+    )
+    dataset = DATASETS_LEGACY[name]
+    dataset_type = dataset.dataset_type
+    if dataset_type == "torch":
+        dataset_cls = LazySupervisedDataset
+    elif dataset_type == "wds":
+        dataset_cls = LazyWDSDataset
+    else:
+        raise NotImplementedError(f"{dataset_type} is not supported.")
+    data_args.meta_path = getattr(dataset, "meta_path", None)
+    data_args.caption_choice = getattr(dataset, "caption_choice", None)
+    data_args.caption_choice_2 = getattr(dataset, "caption_choice_2", None)
+    data_args.start_idx = getattr(dataset, "start_idx", None)
+    data_args.end_idx = getattr(dataset, "end_idx", None)
+    return dataset_cls(
+        tokenizer=tokenizer,
+        data_path=dataset.data_path,
+        image_folder=getattr(dataset, "image_path"),
+        data_args=data_args,
+        training_args=training_args,
+    )

llava/data/collate.py ADDED Viewed

	@@ -0,0 +1,166 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+from dataclasses import dataclass
+from typing import Any, Dict, Sequence
+import torch
+from transformers import PreTrainedTokenizer
+from llava.constants import IGNORE_INDEX
+from llava.utils.logging import logger
+__all__ = ["DataCollator"]
+@dataclass
+class DataCollator:
+    tokenizer: PreTrainedTokenizer
+    def __init__(self, tokenizer: PreTrainedTokenizer):
+        super().__init__()
+        self.tokenizer = tokenizer
+    def __call__(self, instances: Sequence[Dict[str, Any]]) -> Dict[str, Any]:
+        # Gather everything from the batch
+        input_ids, labels, media, block_sizes = [], [], {name: [] for name in self.tokenizer.media_tokens}, []
+        media_meta = {}
+        media_meta["sound_feature_masks"] = []
+        media_meta["sound_embed_masks"] = []
+        media_meta["frame_times"] = []
+        for instance in instances:
+            if isinstance(instance["input_ids"], torch.Tensor):
+                input_ids.append(instance["input_ids"])
+                labels.append(instance["labels"])
+                for name in media:
+                    objs = instance.get(name)
+                    objs = objs if objs is not None else []
+                    media[name].append([obj for obj in objs])
+                if instance.get("sound") is not None:
+                    for name_k in media_meta:
+                        if "sound" in name_k:
+                            objs = instance.get(name_k)
+                            media_meta[name_k].append([obj for obj in objs])
+                if instance.get("video") is not None or instance.get("image") is not None:
+                    for name_k in media_meta:
+                        if "frame" in name_k:
+                            objs = instance.get(name_k)
+                            media_meta[name_k].append([obj for obj in objs])
+                if "block_sizes" in instance:
+                    block_sizes.append(instance["block_sizes"])
+                else:
+                    block_sizes.append(
+                        [None for _ in range(len(instance.get("image")))] if instance.get("image") is not None else []
+                    )
+            else:
+                input_ids.extend(instance["input_ids"])
+                labels.extend(instance["labels"])
+                for name in media:
+                    objs = instance.get(name)
+                    objs = objs if objs is not None else [[] for _ in range(len(instance["input_ids"]))]
+                    media[name].extend(objs)
+                if instance.get("sound") is not None:
+                    for name_k in media_meta:
+                        if "sound" in name_k:
+                            objs = instance.get(name_k)
+                            media_meta[name_k].extend(objs)
+                if instance.get("video") is not None or instance.get("image") is not None:
+                    for name_k in media_meta:
+                        if "frame" in name_k:
+                            objs = instance.get(name_k)
+                            media_meta[name_k].append([obj for obj in objs])
+                if "block_sizes" in instance:
+                    block_sizes.extend(instance["block_sizes"])
+                else:
+                    block_sizes.extend(
+                        [[None for _ in range(len(objs))] for objs in instance.get("image")]
+                        if instance.get("image") is not None
+                        else [[] for _ in range(len(instance["input_ids"]))]
+                    )
+        batch_size = len(input_ids)
+        # Check if the number of media objects (or the number of block sizes) matches the number of media tokens
+        for name in media:
+            for k in range(batch_size):
+                if name == "image" and not all([_ is None for _ in block_sizes[k]]):
+                    actual = len(block_sizes[k])
+                else:
+                    actual = len(media[name][k])
+                expected = (input_ids[k] == self.tokenizer.media_token_ids[name]).sum().item()
+                if actual != expected:
+                    raise ValueError(
+                        f"Number mismatch between {name} objects and {name} tokens. "
+                        f"There are {expected} {name} tokens but {actual} {name} objects."
+                    )
+        # Batchify the inputs
+        input_ids = torch.nn.utils.rnn.pad_sequence(
+            input_ids,
+            batch_first=True,
+            padding_value=self.tokenizer.pad_token_id,
+        )
+        labels = torch.nn.utils.rnn.pad_sequence(
+            labels,
+            batch_first=True,
+            padding_value=IGNORE_INDEX,
+        )
+        input_ids = input_ids[:, : self.tokenizer.model_max_length]
+        labels = labels[:, : self.tokenizer.model_max_length]
+        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)
+        # Truncate media objects if necessary
+        for name in media:
+            objects = []
+            for k in range(batch_size):
+                if name == "image" and not all([_ is None for _ in block_sizes[k]]):
+                    actual = len(media[name][k])
+                    num_large_scale_blocks = sum([x * y for x, y in block_sizes[k]])
+                    num_small_scale_blocks = actual - num_large_scale_blocks
+                    num_small_scale_blocks_each_img = num_small_scale_blocks // len(block_sizes[k])
+                    expected_full_image = (input_ids[k] == self.tokenizer.media_token_ids[name]).sum().item()
+                    expected = (
+                        sum([x * y for x, y in block_sizes[k][:expected_full_image]])
+                        + num_small_scale_blocks_each_img * expected_full_image
+                    )
+                    if actual > expected:
+                        logger.warning(f"Truncating the number of {name} objects from {actual} to {expected}")
+                        media[name][k] = media[name][k][:expected]
+                    objects.extend(media[name][k])
+                    block_sizes[k] = block_sizes[k][:expected_full_image]
+                else:
+                    actual = len(media[name][k])
+                    expected = (input_ids[k] == self.tokenizer.media_token_ids[name]).sum().item()
+                    if actual > expected:
+                        logger.warning(f"Truncating the number of {name} objects from {actual} to {expected}")
+                        media[name][k] = media[name][k][:expected]
+                    objects.extend(media[name][k])
+                    if name == "image":
+                        block_sizes[k] = block_sizes[k][:expected]
+            media[name] = objects
+        for name in media_meta:
+            objects = []
+            for k in range(batch_size):
+                try:
+                    objects.extend(media_meta[name][k])
+                except:
+                    continue
+            media_meta[name] = objects
+        # Flatten block sizes from [[bls_im1_instance1, bls_im2_instance1], [bls_im1_instance2, bls_im2_instance2], ...] to [bls_im1_instance1, bls_im2_instance1, bls_im1_instance2, bls_im2_instance2, ...]
+        block_sizes = sum(block_sizes, [])
+        return {
+            "input_ids": input_ids,
+            "media": media,
+            "media_config": {"image": {"block_sizes": block_sizes}, "video": {}, "speech": {}, "sound": {}},
+            "labels": labels,
+            "attention_mask": attention_mask,
+            "media_meta": media_meta,
+        }

llava/data/dataset.py ADDED Viewed

	@@ -0,0 +1,1635 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
+#    Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+import base64
+import copy
+import io
+import json
+import os
+import os.path as osp
+import random
+import time
+import warnings
+from dataclasses import dataclass
+from typing import Dict, Sequence
+import math
+import numpy as np
+import PIL
+import torch
+import transformers
+from PIL import Image, ImageFile
+from torch.utils.data import Dataset, default_collate
+from transformers import PreTrainedTokenizer
+from transformers import AutoFeatureExtractor
+import kaldiio
+import llava.data.datasets_mixture as datasets_mixture
+from llava import conversation as conversation_lib
+from llava.constants import DEFAULT_SOUND_TOKEN,DEFAULT_SPEECH_TOKEN, IGNORE_INDEX
+from llava.data.collate import DataCollator
+from llava.mm_utils import (
+    load_audio,
+    get_num_windows,
+    tokenizer_image_token,
+)
+from torchvision import transforms
+from llava.train.args import DataArguments, TrainingArguments
+from llava.train.sequence_parallel import (
+    extract_local_from_list,
+    extract_local_input_ids,
+    extract_local_position_ids,
+    get_pg_manager,
+)
+from llava.utils.tokenizer import preprocess_conversation
+# import torchaudio
+from pytorchvideo.data.clip_sampling import ConstantClipsPerVideoSampler, UniformClipSampler
+import soundfile as sf
+from librosa import resample as librosa_resample
+import whisper
+# Process-wide PIL settings, applied at import time:
+# tolerate truncated image files instead of raising while loading them,
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+# and raise the pixel-count threshold PIL uses for its oversized-image check.
+PIL.Image.MAX_IMAGE_PIXELS = 1000000000
+def int16_to_float32(x):
+    return (x / 32767.0).astype(np.float32)
+def float32_to_int16(x):
+    x = np.clip(x, a_min=-1., a_max=1.)
+    return (x * 32767.).astype(np.int16)
+def preprocess_multimodal(sources: Sequence[str], data_args: DataArguments) -> Dict:
+    is_multimodal = data_args.is_multimodal
+    if not is_multimodal:
+        return sources
+    for source in sources:
+        concat_values = "".join([sentence["value"] for sentence in source])
+        for sid, sentence in enumerate(source):
+            # In multimodal conversations, we automatically prepend '<image>' at the start of the first sentence if it doesn't already contain one.
+            if DEFAULT_SOUND_TOKEN in sentence["value"]:
+                sentence["value"] = sentence["value"].replace(DEFAULT_SOUND_TOKEN, f"{DEFAULT_SOUND_TOKEN}\n")
+                sentence["value"] = sentence["value"].replace(f"{DEFAULT_SOUND_TOKEN}\n\n", f"{DEFAULT_SOUND_TOKEN}\n")
+            if DEFAULT_SPEECH_TOKEN in sentence["value"]:
+                sentence["value"] = sentence["value"].replace(DEFAULT_SPEECH_TOKEN, f"{DEFAULT_SPEECH_TOKEN}\n")
+                sentence["value"] = sentence["value"].replace(f"{DEFAULT_SPEECH_TOKEN}\n\n", f"{DEFAULT_SPEECH_TOKEN}\n")
+    return sources
+def preprocess_plain(
+    sources: Sequence[str],
+    tokenizer: transformers.PreTrainedTokenizer,
+) -> Dict:
+    """Tokenize two-turn (prompt, answer) sources for the PLAIN conversation style.
+    The first turn is replaced by the bare media placeholder, concatenated with
+    the second turn and the default conversation's separator. In the labels,
+    the tokens of the first turn are set to IGNORE_INDEX.
+    NOTE(review): DEFAULT_IMAGE_TOKEN is not among the names this file's visible
+    imports bring in from llava.constants. Unless it is defined elsewhere in
+    this module, calling this function raises NameError -- TODO confirm and
+    decide which placeholder token is intended.
+    Returns:
+        dict with "input_ids" and "labels", each a list with one entry per source.
+    """
+    # add end signal and concatenate together
+    conversations = []
+    for source in sources:
+        assert len(source) == 2
+        assert DEFAULT_IMAGE_TOKEN in source[0]["value"]
+        # Mutates the caller's data: the prompt text is discarded here.
+        source[0]["value"] = DEFAULT_IMAGE_TOKEN
+        conversation = source[0]["value"] + source[1]["value"] + conversation_lib.default_conversation.sep
+        conversations.append(conversation)
+    # tokenize conversations
+    input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors="pt") for prompt in conversations]
+    targets = copy.deepcopy(input_ids)
+    for target, source in zip(targets, sources):
+        # Mask the prompt part so only the answer contributes targets.
+        tokenized_len = len(tokenizer_image_token(source[0]["value"], tokenizer))
+        target[:tokenized_len] = IGNORE_INDEX
+    return dict(input_ids=input_ids, labels=targets)
+def preprocess(
+    sources: Sequence[str],
+    tokenizer: transformers.PreTrainedTokenizer,
+    has_image: bool = False,
+    no_system_prompt: bool = False,
+) -> Dict:
+    if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.PLAIN:
+        return preprocess_plain(sources, tokenizer)
+    return default_collate(
+        [
+            preprocess_conversation(conversation, tokenizer, no_system_prompt=no_system_prompt)
+            for conversation in sources
+        ]
+    )
+class LazySupervisedDataset(Dataset):
+    """Dataset for supervised fine-tuning.
+    This class is originally implemented by the LLaVA team and modified by
+    Ji Lin and Haotian Tang.
+    """
+    def __init__(
+        self,
+        data_path: str,
+        image_folder: str,
+        tokenizer: transformers.PreTrainedTokenizer,
+        data_args: DataArguments,
+        training_args: TrainingArguments,
+    ):
+        super().__init__()
+        try:
+            with open(data_path) as fp:
+                list_data_dict = json.load(fp)
+        except:
+            with open(data_path) as fp:
+                list_data_dict = [json.loads(q) for q in fp]
+        # rank0_print("Formatting inputs...Skip in lazy mode")
+        print("Formatting inputs...Skip in lazy mode")
+        self.tokenizer = tokenizer
+        self.list_data_dict = list_data_dict
+        self.data_args = data_args
+        self.image_folder = image_folder
+        self.wav_processor = AutoFeatureExtractor.from_pretrained('Qwen/Qwen2-Audio-7B')
+    def __len__(self):
+        return len(self.list_data_dict)
+    @property
+    def lengths(self):
+        length_list = []
+        for sample in self.list_data_dict:
+            img_tokens = 128 if "image" in sample else 0
+            length_list.append(sum(len(conv["value"].split()) for conv in sample["conversations"]) + img_tokens)
+        return length_list
+    @property
+    def modality_lengths(self):
+        length_list = []
+        for sample in self.list_data_dict:
+            if 'duration' in sample.keys():
+                duration = sample["duration"]
+            else:
+                duration = 10.
+            try:
+                cur_len = sum(len(conv["value"].split()) for conv in sample["conversations"]) + int(math.ceil(duration * 25))
+                cur_len = cur_len if "sound" in sample else -cur_len
+                length_list.append(cur_len)
+            except:
+                try:
+                    cur_len = 0 + int(math.ceil(duration * 25))
+                    cur_len = cur_len if "sound" in sample else -cur_len
+                    length_list.append(cur_len)
+                except:
+                    cur_len = 0 + int(math.ceil(10. * 25))
+                    cur_len = cur_len if "sound" in sample else -cur_len
+                    length_list.append(cur_len)
+        return length_list
+    @staticmethod
+    def _load_sound(sound_file, wav_processor, sample_rate=16000, window_length=30.0, window_overlap=0.0, max_num_window=3, audio_start = 0.0):
+        if sound_file is None:
+            return None
+        window_length  = int(window_length * sample_rate)
+        window_overlap = int(window_overlap * sample_rate)
+        max_num_window = int(max_num_window)
+        duration = max_num_window * (window_length - window_overlap) + window_overlap
+        sound_outputs = []
+        audio_feature_masks = []
+        audio_embed_masks = []
+        try:
+            sound_filename = str.split(sound_file, '/')[-1]
+            if '.ark' in sound_filename:
+                sound = kaldiio.load_mat(sound_file)
+                audio_data = sound[1]
+                audio_data=audio_data.astype(np.float16)
+            else:
+                audio_data = load_audio(sound_file, sample_rate, duration, audio_start) # already cuts to max duration
+            T = len(audio_data)
+            audio_data = audio_data.reshape(1, -1)
+            num_windows, full_length = get_num_windows(T, sample_rate, max_num_window)
+            audio_data_tensor = torch.from_numpy(int16_to_float32(float32_to_int16(audio_data))).float()
+            for i in range(num_windows):
+                audio_embed_mask = torch.zeros(750)
+                start = i * (window_length - window_overlap)
+                audio_data_tensor_this = audio_data_tensor[:, start:start+window_length]
+                orig_length = audio_data_tensor_this.shape[1]
+                audio_data_tensor_this = wav_processor(audio_data_tensor_this.cpu().numpy(), sampling_rate=sample_rate, return_tensors="pt") #.squeeze(0) text="dummy", audios=audio_data_tensor_this, return_tensors="pt") #
+                sound_outputs.append(audio_data_tensor_this["input_features"])
+                # calculate the mask for the input melspec to Whisper
+                melspec_frames_this_window = int(math.ceil(orig_length / 160))
+                feature_attention_mask = torch.zeros(3000, dtype=torch.int32)
+                feature_attention_mask[:melspec_frames_this_window] = 1
+                audio_feature_masks.append(feature_attention_mask.unsqueeze(0))
+                # calculate the mask for the output embedding for use in AF2
+                conv_lengths = (melspec_frames_this_window - 1) // 2 + 1
+                output_embedding_lengths = (conv_lengths - 2) // 2 + 1
+                audio_embed_mask[:output_embedding_lengths] = 1
+                audio_embed_masks.append(audio_embed_mask)
+        except:
+            print('error loading file', sound_file)
+            sound_outputs.append(torch.zeros(1,128,3000))
+            audio_feature_masks.append(torch.zeros(1,3000, dtype=torch.int32))
+            audio_embed_masks.append(torch.zeros(750))
+        return torch.stack(sound_outputs, dim=0), torch.stack(audio_feature_masks, dim=0), torch.stack(audio_embed_masks, dim=0)
+    @staticmethod
+    def _load_speech(speech_path,sample_rate=16000):
+        if speech_path is None:
+            return None
+        speech_outputs = []
+        try:
+            speech = whisper.load_audio(speech_path)
+            speech = whisper.pad_or_trim(speech)
+            mel = whisper.log_mel_spectrogram(speech)
+            speech_outputs.append(mel.unsqueeze(0))
+        except:
+            speech_outputs.append(torch.zeros(1,80,3000))
+        return torch.stack(speech_outputs, dim=0)
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        sources = self.list_data_dict[i]
+        if isinstance(i, int):
+            sources = [sources]
+        assert len(sources) == 1, "Don't know why it is wrapped to a list"  # FIXME
+        import re
+        if "sound" in self.list_data_dict[i]:
+            # chat data loading
+            if isinstance(self.list_data_dict[i]["sound"],list):
+                sound_files = self.list_data_dict[i]["sound"]
+                conversations_raw = self.list_data_dict[i]["conversations"]
+                # Step 1: Extract <sound-X> tags in order of appearance
+                sound_tag_pattern = re.compile(r"<sound-(\d+)>")
+                ordered_sound_tags = []
+                for turn in conversations_raw:
+                    tags = sound_tag_pattern.findall(turn["value"])
+                    ordered_sound_tags.extend([f"<sound-{tag}>" for tag in tags])
+                # Step 2: Load sound tensors in the order of tags
+                sound_tensor = []
+                audio_feature_masks = []
+                audio_embed_masks = []
+                sound_token_map = {}
+                for tag in ordered_sound_tags:
+                    idx = int(tag.split('-')[1][:-1])
+                    if tag not in sound_token_map:
+                        this_sound_tensor, af_mask, ae_mask = self._load_sound(sound_file, self.wav_processor, max_num_window=self.data_args.audio_frames)
+                        this_sound_tensor = this_sound_tensor.squeeze(1)  # (windows x 750 x 2048)
+                        sound_token_map[tag] = ("<sound>\n" * this_sound_tensor.shape[0]).rstrip()
+                        sound_tensor.append(this_sound_tensor)
+                        audio_feature_masks.append(af_mask)
+                        audio_embed_masks.append(ae_mask)
+                    else:
+                        # If already loaded, still append to match sequence
+                        this_sound_tensor, af_mask, ae_mask = self._load_sound(sound_file, self.wav_processor, max_num_window=self.data_args.audio_frames)
+                        this_sound_tensor = this_sound_tensor.squeeze(1)
+                        sound_tensor.append(this_sound_tensor)
+                        audio_feature_masks.append(af_mask)
+                        audio_embed_masks.append(ae_mask)
+                # Process conversations and inject sound markers
+                conversation = []
+                for turn in conversations_raw:
+                    role = turn["from"]
+                    value = turn["value"]
+                    # Replace any <sound-X> tag with corresponding repeated <sound>\n
+                    for tag, sound_token in sound_token_map.items():
+                        value = value.replace(tag, sound_token)
+                    conversation.append({
+                        "from": role,
+                        "value": value.rstrip()
+                    })
+                sources = [conversation]
+                sound_tensor = torch.cat(sound_tensor, dim=0)
+                audio_feature_masks = torch.cat(audio_feature_masks, dim=0)
+                audio_embed_masks = torch.cat(audio_embed_masks, dim=0)
+            else:
+                sound_file = self.list_data_dict[i]["sound"]
+                question = str(self.list_data_dict[i]["conversations"][0]["value"].rstrip())
+                answer = str(self.list_data_dict[i]["conversations"][1]["value"]).rstrip()
+                question = question.replace("<speech>\n", "").replace("\n<speech>", "").replace("<speech>", "")
+                question = question.replace("<sound>\n", "").replace("\n<sound>", "").replace("<sound>", "")
+                question = question.replace("<en><asr>\n", "").replace("\n<en><asr>", "").replace("<en><asr>", "")
+                question = question.replace("<eng><asr>\n", "").replace("\n<eng><asr>", "").replace("<eng><asr>", "")
+                sound_tensor, audio_feature_masks, audio_embed_masks = self._load_sound(sound_file, self.wav_processor, max_num_window=self.data_args.audio_frames)
+                sound_tensor=sound_tensor.squeeze(1) # squeeze the irrelevant dimension which was caused due to processor getting 1 batch for processing --> (windows x 750 x 2048)
+                question = "<sound>\n" * sound_tensor.shape[0] + question
+                conversation = [
+                    {"from": "human", "value": question},
+                    {"from": "gpt", "value": answer},
+                ]
+                sources = [conversation]
+        data_dict = preprocess(
+            sources,
+            self.tokenizer,
+            has_image=(
+                "sound" in self.list_data_dict[i]
+            ),
+        )
+        if isinstance(i, int):
+            data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0])
+        if "sound" in self.list_data_dict[i]:
+            data_dict["sound"] = sound_tensor
+            data_dict["sound_feature_masks"] = audio_feature_masks
+            data_dict["sound_embed_masks"] = audio_embed_masks
+        if "speech" in self.list_data_dict[i]:
+            data_dict["speech"] = speech_tensor
+        return data_dict
+class LazyMMC4Dataset(Dataset):
+    """Dataset for supervised fine-tuning.
+    This class is implemented by Ji Lin and Haotian Tang."""
+    def __init__(
+        self,
+        data_path: str,
+        image_folder: str,
+        tokenizer: transformers.PreTrainedTokenizer,
+        data_args: DataArguments,
+        training_args: TrainingArguments,
+        image_following_text_only=False,
+        text_only=False,
+    ):
+        super().__init__()
+        import pickle
+        n_samples = []
+        # actually shards and stats info
+        n_shards = len(os.listdir(data_path)) // 2
+        # n_shards = 100
+        count_info_list = sorted([f for f in os.listdir(data_path) if f.endswith(".count")])[:n_shards]
+        n_samples = [int(open(os.path.join(data_path, f)).read().strip()) for f in count_info_list]
+        print("total MMC4 samples", sum(n_samples))  # 10,881,869
+        PROCESS_GROUP_MANAGER = get_pg_manager()
+        if PROCESS_GROUP_MANAGER is not None:
+            import torch.distributed as dist
+            sequence_parallel_size = training_args.seq_parallel_size
+        else:
+            sequence_parallel_size = 1
+        print("sequence_parallel_size", sequence_parallel_size)
+        rank = training_args.process_index // sequence_parallel_size  # int(os.environ["RANK"])
+        world_size = training_args.world_size // sequence_parallel_size  # int(os.environ["WORLD_SIZE"])
+        shared_size = n_shards // world_size
+        gpu_samples = [sum(n_samples[i * shared_size : (i + 1) * shared_size]) for i in range(world_size)]
+        self.n_samples = min(gpu_samples) * world_size  # total size
+        self.idx_offset = rank * min(gpu_samples)
+        shard_start, shard_end = rank * shared_size, (rank + 1) * shared_size
+        print(f" * loading data from shard {shard_start}-{shard_end}")
+        shard_names = [d.replace(".count", ".pkl") for d in count_info_list]
+        shard_names = shard_names[shard_start:shard_end]
+        full_data_list = []
+        # now load data
+        for shard_name in shard_names:
+            # load shard
+            with open(os.path.join(data_path, shard_name), "rb") as f:
+                data_list = pickle.load(f)
+            full_data_list.extend(data_list)
+        print(f"* loaded totally {len(full_data_list)} samples")
+        self.data_list = full_data_list
+        self.tokenizer = tokenizer
+        self.data_args = data_args
+        self.image_folder = image_folder
+        self.image_following_text_only = image_following_text_only
+        self.text_only = text_only
+    def __len__(self):
+        # return len(self.data_list)
+        return self.n_samples
+    @property
+    def modality_lengths(self):
+        # Estimate the number of tokens after tokenization, used for length-grouped sampling
+        length_list = []
+        for info in self.data_list:
+            num_images = min(6, len(info["image_info"]))
+            sentences = [info["text_list"][x["matched_text_index"]] for x in info["image_info"][:num_images]]
+            # The unit of cur_len is "words". We assume 1 word = 2 tokens.
+            cur_len = num_images * self.num_image_tokens // 2 + sum([len(x) for x in sentences])
+            length_list.append(cur_len)
+        return length_list
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        info = self.data_list[i - self.idx_offset]
+        sentences = info["text_list"]
+        # kentang-mit@: remove existing <image> tokens in the sentences
+        for ix in range(len(sentences)):
+            # if this is an html tag, we still preserve its semantic meaning
+            sentences[ix] = sentences[ix].replace("<image>", "<IMAGE>")
+        sim_matrix = info["similarity_matrix"]  # we do not use this...
+        # convert images from base64 to PIL and filter based on image-text similarity
+        images, sentence_ixs = [], []
+        if not self.text_only:
+            for sample_image, sim_vec in zip(info["image_info"], sim_matrix):
+                image_base64 = sample_image["image_base64"]
+                rawbytes = base64.b64decode(image_base64)
+                sim_ix = sample_image["matched_text_index"]
+                # sim_ix = np.argmax(sim_vec)
+                # sim_score = sim_vec[sim_ix]
+                # filter to images >= 5KB
+                # if len(rawbytes) // 1000 <= 5:
+                #     continue
+                # if sim_score < 0.24:
+                #     continue
+                image = Image.open(io.BytesIO(rawbytes)).convert("RGB")
+                images.append(image)
+                sentence_ixs.append(sim_ix)
+        # constrain max num 6 images
+        max_num_images = 6
+        if len(images) > max_num_images:
+            images = images[:max_num_images]
+            sentence_ixs = sentence_ixs[:max_num_images]
+        # reorder images according to text insertion
+        images = [images[iii] for iii in np.argsort(sentence_ixs)]
+        # preprocess and tokenize text
+        for ix in sentence_ixs:
+            sentences[ix] = f"<image>\n{sentences[ix]}"
+        if self.image_following_text_only:
+            # use pad tokens to divide sentence pieces
+            text = self.tokenizer.pad_token.join(sentences)
+        else:
+            text = " ".join(sentences)
+        # whitespace cleanup
+        text = text.replace("<image> ", "<image>").replace(" <image>", "<image>")
+        text = f"{text}{self.tokenizer.eos_token}"  # add eos token
+        if len(images) > 0:
+            if self.data_args.image_aspect_ratio == "dynamic_s2":
+                images, block_sizes = dynamic_s2_process_images_and_prompt(
+                    images, text, self.data_args, self.image_folder
+                )
+            elif self.data_args.image_aspect_ratio == "dynamic":
+                images, text = dynamic_process_images_and_prompt(
+                    images, text, self.data_args, self.image_folder, max_tiles=6
+                )
+            else:
+                images = torch.stack([process_image(image, self.data_args, self.image_folder) for image in images])
+            # the same size for all images, so we concat
+            # cur_token_len = (
+            #     images[0].shape[-2] // self.multimodal_cfg["patch_size"]
+            # ) * (images[0].shape[-1] // self.multimodal_cfg["patch_size"])
+            # cur_token_len += self.multimodal_cfg["n_extra_patch"]
+        else:
+            images = None
+            # cur_token_len = 0
+        input_ids = tokenizer_image_token(
+            text,
+            self.tokenizer,
+            return_tensors="pt",
+        )
+        image_token_id = self.tokenizer.media_token_ids["image"]
+        # now check the case where the last token is image patch token
+        if input_ids[-1] == image_token_id:  # need to remove one last image
+            last_non_im_patch_indices = torch.where(input_ids != image_token_id)[0][-1] + 1
+            input_ids = input_ids[:last_non_im_patch_indices]
+        n_im_patch = (input_ids == image_token_id).sum().item()
+        if self.data_args.image_aspect_ratio != "dynamic_s2":
+            images = images[:n_im_patch]
+            assert len(images) == n_im_patch, print(text, input_ids)
+        assert len(input_ids.shape) == 1, "Unexpected shape of 'input_ids' from MMC4."
+        input_ids = (
+            torch.concat([torch.tensor([self.tokenizer.bos_token_id]), input_ids])
+            if self.tokenizer.bos_token_id is not None and input_ids[0] != self.tokenizer.bos_token_id
+            else input_ids
+        )
+        targets = input_ids.clone()
+        if self.image_following_text_only:  # keep only text after leading image token
+            # remove loss for any token before the first <image> token
+            label_idx = 0
+            while label_idx < targets.shape[-1] and targets[label_idx] != image_token_id:
+                targets[label_idx] = IGNORE_INDEX
+                label_idx += 1
+            pad_token = self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0]
+            pad_token_idxs = torch.where(targets == pad_token)[0]
+            for pad_token_idx in pad_token_idxs:
+                token_idx = pad_token_idx + 1
+                while token_idx < targets.shape[-1] and targets[token_idx] != image_token_id:
+                    targets[token_idx] = IGNORE_INDEX
+                    token_idx += 1
+            # do not train on padding tokens
+            targets[targets == pad_token] = IGNORE_INDEX
+        # mask image tokens is unnecessary for llava-1.5
+        # targets[targets == IMAGE_TOKEN_INDEX] = IGNORE_INDEX
+        # print(input_ids.shape)
+        data_dict = dict(input_ids=input_ids, labels=targets, image=images)
+        if self.data_args.image_aspect_ratio == "dynamic_s2":
+            data_dict["block_sizes"] = block_sizes
+        return data_dict
+class LazyCoyoDataset(Dataset):
+    """Dataset for supervised fine-tuning.
+    This class is implemented by Ji Lin and Haotian Tang."""
+    num_image_tokens = 576
+    def __init__(
+        self,
+        data_path: str,
+        image_folder: str,
+        tokenizer: transformers.PreTrainedTokenizer,
+        data_args: DataArguments,
+        training_args: TrainingArguments,
+        # kentang-mit@: balance the total number of tokens for Coyo and MMC4.
+        n_samples_per_idx=4,
+    ):
+        super().__init__()
+        import pickle
+        n_samples = []
+        # actually shards and stats info
+        n_shards = len(os.listdir(data_path)) // 2
+        # n_shards = 100
+        count_info_list = sorted([f for f in os.listdir(data_path) if f.endswith(".count")])[:n_shards]
+        n_samples = [int(open(os.path.join(data_path, f)).read().strip()) for f in count_info_list]
+        print("total COYO samples", sum(n_samples))
+        PROCESS_GROUP_MANAGER = get_pg_manager()
+        if PROCESS_GROUP_MANAGER is not None:
+            import torch.distributed as dist
+            sequence_parallel_size = training_args.seq_parallel_size
+        else:
+            sequence_parallel_size = 1
+        print("sequence_parallel_size", sequence_parallel_size)
+        rank = training_args.process_index // sequence_parallel_size  # int(os.environ["RANK"])
+        world_size = training_args.world_size // sequence_parallel_size  # int(os.environ["WORLD_SIZE"])
+        shared_size = n_shards // world_size
+        gpu_samples = [
+            sum(n_samples[i * shared_size : (i + 1) * shared_size]) // n_samples_per_idx for i in range(world_size)
+        ]
+        self.n_samples = min(gpu_samples) * world_size  # total size
+        self.idx_offset = rank * min(gpu_samples)
+        shard_start, shard_end = rank * shared_size, (rank + 1) * shared_size
+        print(f" * loading data from shard {shard_start}-{shard_end}")
+        shard_names = [d.replace(".count", ".pkl") for d in count_info_list]
+        shard_names = shard_names[shard_start:shard_end]
+        full_data_list = []
+        # now load data
+        for shard_name in shard_names:
+            # load shard
+            with open(os.path.join(data_path, shard_name), "rb") as f:
+                shard_data = pickle.load(f)
+                random.seed(42)
+                if "mmc4" in data_path:
+                    random.shuffle(shard_data)  # shuffle for MMC4cap only
+                full_data_list.extend(shard_data)
+        print(f"* loaded totally {len(full_data_list)} samples")
+        # now pack the samples into groups
+        n_groups = len(full_data_list) // n_samples_per_idx
+        full_data_list = [
+            full_data_list[i : i + n_samples_per_idx] for i in range(0, len(full_data_list), n_samples_per_idx)
+        ]
+        if len(full_data_list[-1]) < n_samples_per_idx:
+            full_data_list = full_data_list[:-1]
+        assert len(full_data_list) == n_groups
+        print(f"split into {n_groups} groups")
+        self.data_list = full_data_list
+        self.tokenizer = tokenizer
+        self.data_args = data_args
+        self.image_folder = image_folder
+    def __len__(self):
+        # return len(self.data_list)
+        return self.n_samples
+    @property
+    def modality_lengths(self):
+        # Estimate the number of tokens after tokenization, used for length-grouped sampling
+        length_list = []
+        for samples in self.data_list:
+            cur_len = sum([len(conv["text" if "text" in conv else "caption"].split()) for conv in samples])
+            # The unit of cur_len is "words". We assume 1 word = 2 tokens.
+            cur_len = cur_len + len(samples) * self.num_image_tokens // 2
+            length_list.append(cur_len)
+        return length_list
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        CONCAT_SAMPLES = False
+        info_list = self.data_list[i - self.idx_offset]
+        text_list = []
+        image_list = []
+        for sample in info_list:
+            caption_key = (
+                "text" if "text" in sample else "caption"
+            )  # kentang-mit@: remove existing <image> tokens in the sentences
+            # kentang-mit@: remove existing <image> token.
+            # if this is an html tag, we still preserve its semantic meaning
+            sample[caption_key] = sample[caption_key].replace("<image>", "<IMAGE>")
+            text_list.append(DEFAULT_IMAGE_TOKEN + "\n" + sample[caption_key] + self.tokenizer.eos_token)
+            if "image" in sample:
+                image_base64 = sample["image"]
+                rawbytes = base64.b64decode(image_base64)
+            else:
+                rawbytes = sample["rawbytes"]
+            image = Image.open(io.BytesIO(rawbytes)).convert("RGB")
+            image_list.append(image)
+        image_list = torch.stack([process_image(image, self.data_args, self.image_folder) for image in image_list])
+        if CONCAT_SAMPLES:
+            # into <image>cap<eos><image>cap<eos>...
+            text_list = "".join(text_list)
+            input_ids = self.tokenizer(
+                text_list,
+                return_tensors="pt",
+                padding="longest",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+            ).input_ids  # 4, seq_len
+            input_ids = input_ids[0]
+        else:
+            input_ids = [
+                tokenizer_image_token(
+                    prompt,
+                    self.tokenizer,
+                    return_tensors="pt",
+                )
+                for prompt in text_list
+            ]
+            # print([x.shape[0] for x in input_ids], [len(x.split()) for x in text_list], [len(re.findall(r"<image[^>]*>", x)) for x in text_list])
+            # input_ids = torch.nn.utils.rnn.pad_sequence(
+            #     input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
+            # )
+        targets = copy.deepcopy(input_ids)
+        for i in range(len(targets)):
+            targets[i][targets[i] == self.tokenizer.pad_token_id] = IGNORE_INDEX
+        return dict(input_ids=input_ids, labels=targets, image=image_list)
+class LazyWDSDataset(Dataset):
+    """Dataset for supervised fine-tuning.
+    This class is implemented by Ji Lin and Ligeng Zhu."""
+    def __init__(
+        self,
+        data_path: str,
+        tokenizer: transformers.PreTrainedTokenizer,
+        data_args: DataArguments,
+        image_folder: str,
+        training_args: TrainingArguments,
+    ):
+        super().__init__()
+        n_samples = []
+        n_shards = len(os.listdir(data_path)) // 3
+        for shard in range(n_shards):
+            with open(os.path.join(data_path, f"{shard:05d}_stats.json")) as f:
+                info = json.load(f)
+                n_samples.append(info["successes"])
+        # print(f"[DEBUG] {data_path} total samples", sum(n_samples))  # 10,881,869
+        PROCESS_GROUP_MANAGER = get_pg_manager()
+        if PROCESS_GROUP_MANAGER is not None:
+            import torch.distributed as dist
+            sequence_parallel_size = training_args.seq_parallel_size
+        else:
+            sequence_parallel_size = 1
+        print("sequence_parallel_size", sequence_parallel_size)
+        rank = training_args.process_index // sequence_parallel_size  # int(os.environ["RANK"])
+        world_size = training_args.world_size // sequence_parallel_size  # int(os.environ["WORLD_SIZE"])
+        shared_size = n_shards // world_size
+        print("rank", rank, "world_size", world_size, "shared_size", shared_size)
+        gpu_samples = [sum(n_samples[i * shared_size : (i + 1) * shared_size]) for i in range(world_size)]
+        self.n_samples = min(gpu_samples) * world_size  # total size
+        self.idx_offset = rank * min(gpu_samples)
+        shard_start, shard_end = rank * shared_size, (rank + 1) * shared_size
+        print(f" * loading data from shard {shard_start}-{shard_end}")
+        tar_list = [f"{shard_idx:05d}.tar" for shard_idx in range(shard_start, shard_end)]
+        self.data_list = []
+        t1 = time.time()
+        for tar in tar_list:
+            tmp_path = f"/tmp/ccs{tar}"
+            tar_path = os.path.join(data_path, tar)
+            if PROCESS_GROUP_MANAGER is not None:
+                dist.barrier()
+                if PROCESS_GROUP_MANAGER.sp_rank == 0:
+                    os.makedirs(tmp_path, exist_ok=True)
+                    os.system(f"tar -xkf {tar_path} -C {tmp_path}")
+                dist.barrier()
+            else:
+                os.makedirs(tmp_path, exist_ok=True)
+                os.system(f"tar -xkf {tar_path} -C {tmp_path}")
+            txt_list = [f for f in os.listdir(tmp_path) if f.endswith(".txt")]
+            for txt in txt_list:
+                caption = open(os.path.join(tmp_path, txt)).read().strip()
+                image_path = os.path.join(tmp_path, txt.split(".")[0] + ".jpg")
+                self.data_list.append({"caption": caption, "image": image_path})
+        t2 = time.time()
+        print(f"Loading done. Total time: {t2 - t1:.2f} seconds")
+        self.tokenizer = tokenizer
+        self.data_args = data_args
+        self.image_folder = image_folder
+    def __len__(self):
+        return self.n_samples
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        # print("i", i, "idx_offset", self.idx_offset, "len", len(self.data_list))
+        info = self.data_list[i - self.idx_offset]
+        caption, image_path = info["caption"], info["image"]
+        rand_prompt = "<image>\n"
+        sources = [
+            {
+                "image": image_path,
+                "conversations": [
+                    {"from": "human", "value": rand_prompt},
+                    {"from": "gpt", "value": caption},
+                ],
+            }
+        ]
+        # one example of sources
+        # [{'id': 'GCC_train_001738742', 'image': 'GCC_train_001738742.jpg', 'conversations': [{'from': 'human', 'value': 'Provide a brief description of the given image.\n<image>'}, {'from': 'gpt', 'value': 'a sketch of an ostrich'}]}]
+        if "image" in sources[0]:
+            image = process_image(sources[0]["image"], self.data_args, self.image_folder)
+            image = torch.unsqueeze(image, dim=0)
+            # now random pick some context samples for training
+            if hasattr(self.data_args, "num_shots"):
+                if self.data_args.num_shots > 0:
+                    raise NotImplementedError
+        else:
+            raise NotImplementedError
+        data_dict = preprocess([sources[0]["conversations"]], self.tokenizer, has_image=True)
+        if isinstance(i, int):
+            data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0])
+        # image exist in the data
+        if image is not None:
+            data_dict["image"] = image
+        else:
+            raise NotImplementedError
+        return data_dict
+class LazyCCSWebDataset(Dataset):
+    """Dataset for supervised fine-tuning.
+    This class is implemented by Ligeng Zhu."""
+    def __init__(
+        self,
+        data_path: str,
+        image_folder: str,
+        tokenizer: transformers.PreTrainedTokenizer,
+        data_args: DataArguments,
+        training_args: TrainingArguments,
+    ):
+        super().__init__()
+        t1 = time.time()
+        from llava.data.simple_vila_webdataset import VILAWebDataset
+        print("[DEBUG] ", osp.abspath(data_path))
+        self.dataset = VILAWebDataset(data_path=osp.abspath(data_path))
+        t2 = time.time()
+        print(f"Loading done. Total time: {t2 - t1:.2f} seconds")
+        self.tokenizer = tokenizer
+        self.data_args = data_args
+    def __len__(self):
+        return len(self.dataset)
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        # info = self.data_list[i - self.idx_offset]
+        # caption, image_path = info["caption"], info["image"]
+        info = self.dataset[i]
+        if ".jpg" in info:
+            caption, image_path = info[".txt"], info[".jpg"]
+        elif ".png" in info:
+            caption, image_path = info[".txt"], info[".png"]
+        elif ".webp" in info:
+            caption, image_path = info[".txt"], info[".webp"]
+        elif ".bmp" in info:
+            caption, image_path = info[".txt"], info[".bmp"]
+        elif ".tiff" in info:
+            caption, image_path = info[".txt"], info[".tiff"]
+        else:
+            print(info.keys())
+            print(info)
+            raise KeyError
+        caption = caption.replace("<image>", "<IMAGE>")
+        if isinstance(image_path, io.BytesIO):
+            image_path = Image.open(image_path).convert("RGB")
+        if not isinstance(image_path, PIL.Image.Image):
+            print(image_path)
+            print(info.keys())
+            print(type(image_path))
+            raise NotImplementedError
+        rand_prompt = "<image>\n"
+        sources = [
+            {
+                "image": image_path,
+                "conversations": [
+                    {"from": "human", "value": rand_prompt},
+                    {"from": "gpt", "value": caption},
+                ],
+            }
+        ]
+        # one example of sources
+        # [{'id': 'GCC_train_001738742', 'image': 'GCC_train_001738742.jpg', 'conversations': [{'from': 'human', 'value': 'Provide a brief description of the given image.\n<image>'}, {'from': 'gpt', 'value': 'a sketch of an ostrich'}]}]
+        if "image" in sources[0]:
+            image = process_image(sources[0]["image"], self.data_args, image_folder=None)
+            image = torch.unsqueeze(image, dim=0)
+            # now random pick some context samples for training
+            if hasattr(self.data_args, "num_shots"):
+                if self.data_args.num_shots > 0:
+                    raise NotImplementedError
+        else:
+            raise NotImplementedError
+        data_dict = preprocess([sources[0]["conversations"]], self.tokenizer, has_image=True)
+        if isinstance(i, int):
+            data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0])
+        # image exist in the data
+        if image is not None:
+            data_dict["image"] = image
+        else:
+            raise NotImplementedError
+        return data_dict
+from functools import lru_cache
+@lru_cache(maxsize=16)
+def lru_json_load(fpath):
+    with open(fpath) as fp:
+        return json.load(fp)
+class LazyCoyoWebDataset(Dataset):
+    """Dataset for supervised fine-tuning.
+    This class is implemented by Ligeng Zhu."""
+    num_image_tokens = 576
+    def __init__(
+        self,
+        data_path: str,
+        image_folder: str,
+        tokenizer: transformers.PreTrainedTokenizer,
+        data_args: DataArguments,
+        training_args: TrainingArguments,
+        # kentang-mit@: balance the total number of tokens for Coyo and MMC4.
+        n_samples_per_idx=4,
+    ):
+        super().__init__()
+        from llava.data.simple_vila_webdataset import VILAWebDataset
+        print("[DEBUG] ", osp.abspath(data_path))
+        self.dataset = VILAWebDataset(data_path=osp.abspath(data_path), meta_path=data_args.meta_path)
+        if data_args.start_idx >= 0 and data_args.end_idx >= 0:
+            # Ligeng: support slicing for ablate different subsets.
+            total = len(self.dataset)
+            start_idx = int(total * data_args.start_idx)
+            end_idx = int(total * data_args.end_idx)
+            print(f"loading subset from {start_idx} to {end_idx}, total {total}")
+            self.dataset = torch.utils.data.Subset(self.dataset, range(start_idx, end_idx))
+        # For caption choice,
+        #   if None: use original caption
+        #   if a folder path: use specified caption to override original one (choice1)
+        #   if a folder path: use specified caption and concat with original one (choice2)
+        self.caption_choice = None
+        self.caption_choice_2 = None
+        self.data_path = data_path
+        if data_args.caption_choice is not None:
+            self.caption_choice = data_args.caption_choice
+            print("[recap] Override coyo caption using ", self.caption_choice)
+        if data_args.caption_choice_2 is not None:
+            self.caption_choice_2 = data_args.caption_choice_2
+            print("[recapv2] Override coyo caption using ", self.caption_choice_2)
+        print("total samples", len(self.dataset))
+        PROCESS_GROUP_MANAGER = get_pg_manager()
+        if PROCESS_GROUP_MANAGER is not None:
+            import torch.distributed as dist
+            sequence_parallel_size = training_args.seq_parallel_size
+            sequence_parallel_rank = PROCESS_GROUP_MANAGER.sp_rank
+        else:
+            sequence_parallel_size = 1
+        print("sequence_parallel_size", sequence_parallel_size)
+        rank = (
+            training_args.process_index // sequence_parallel_size if "RANK" in os.environ else 2
+        )  # int(os.environ["RANK"])
+        world_size = (
+            training_args.world_size // sequence_parallel_size if "WORLD_SIZE" in os.environ else 32
+        )  # int(os.environ["WORLD_SIZE"])
+        print(
+            "rank",
+            rank,
+            "world_size",
+            world_size,
+        )
+        self.n_samples_per_idx = n_samples_per_idx
+        # self.n_samples = len(self.dataset) // n_samples_per_idx
+        self.tokenizer = tokenizer
+        self.data_args = data_args
+    def __len__(self):
+        return len(self.dataset) // self.n_samples_per_idx
+    @property
+    def modality_lengths(self):
+        # Estimate the number of tokens after tokenization, used for length-grouped sampling
+        length_list = []
+        for samples in self.data_list:
+            cur_len = sum([len(conv["text" if "text" in conv else "caption"].split()) for conv in samples])
+            # The unit of cur_len is "words". We assume 1 word = 2 tokens.
+            cur_len = cur_len + len(samples) * self.num_image_tokens // 2
+            length_list.append(cur_len)
+        return length_list
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        CONCAT_SAMPLES = False
+        # info_list = self.dataset[i - self.idx_offset]
+        begin_idx, end_idx = (
+            i * self.n_samples_per_idx,
+            (i + 1) * self.n_samples_per_idx,
+        )
+        end_idx = min(end_idx, len(self.dataset))
+        text_list = []
+        image_list = []
+        for idx in range(begin_idx, end_idx):
+            info = self.dataset[idx]
+            if ".jpg" in info:
+                caption, image_path = info[".txt"], info[".jpg"]
+            elif ".png" in info:
+                caption, image_path = info[".txt"], info[".png"]
+            elif ".webp" in info:
+                caption, image_path = info[".txt"], info[".webp"]
+            elif ".bmp" in info:
+                caption, image_path = info[".txt"], info[".bmp"]
+            elif ".tiff" in info:
+                caption, image_path = info[".txt"], info[".tiff"]
+            else:
+                print(info.keys())
+                print(info)
+                raise KeyError
+            if self.caption_choice is not None:
+                # load new captions
+                shard = info["__shard__"]
+                url = info[".json"]["url"]
+                tar_name = osp.relpath(osp.realpath(shard), osp.realpath(self.data_path))
+                # tar_name = osp.dirname(shard)
+                shard_json_path = osp.join(self.caption_choice, tar_name + ".json")
+                try:
+                    shard_json = lru_json_load(shard_json_path)
+                    try:
+                        caption = shard_json[url]["output"]
+                    except KeyError:
+                        print(f"{url} not in caption. fallback to original caption temporarially")
+                except:
+                    print(f"shard_json_path {shard_json_path} not found. fallback to original caption temporarially")
+            caption = caption.replace("<image>", "<IMAGE>")
+            text_list.append(DEFAULT_IMAGE_TOKEN + caption + self.tokenizer.eos_token)
+            if isinstance(image_path, io.BytesIO):
+                image_path = Image.open(image_path).convert("RGB")
+            if not isinstance(image_path, PIL.Image.Image):
+                print(image_path)
+                print(info.keys())
+                print(type(image_path))
+                raise NotImplementedError
+            image_list.append(image_path)
+        # image_list = torch.stack([process_image(image, self.data_args, image_folder=None) for image in image_list])
+        # NOTE(fix by ligeng)
+        #  now image_list should return a list of image tensor where each has a dimension of (1, c, h, w)
+        image_list = [process_image(image, self.data_args, image_folder=None).unsqueeze(0) for image in image_list]
+        if CONCAT_SAMPLES:
+            # into <image>cap<eos><image>cap<eos>...
+            text_list = "".join(text_list)
+            input_ids = self.tokenizer(
+                text_list,
+                return_tensors="pt",
+                padding="longest",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+            ).input_ids  # 4, seq_len
+            input_ids = input_ids[0]
+        else:
+            input_ids = [
+                tokenizer_image_token(
+                    prompt,
+                    self.tokenizer,
+                    return_tensors="pt",
+                )
+                for prompt in text_list
+            ]
+            input_ids = [
+                (
+                    torch.concat([torch.tensor([self.tokenizer.bos_token_id]), input_ids_i])
+                    if input_ids_i[0] != self.tokenizer.bos_token_id
+                    else input_ids_i
+                )
+                for input_ids_i in input_ids
+            ]
+        targets = copy.deepcopy(input_ids)
+        for i in range(len(targets)):
+            targets[i][targets[i] == self.tokenizer.pad_token_id] = IGNORE_INDEX
+        return dict(input_ids=input_ids, labels=targets, image=image_list)
+class LazyVideoWebDataset(Dataset):
+    """Dataset for supervised fine-tuning."""
+    def __init__(
+        self,
+        data_path: str,
+        image_folder: str,
+        tokenizer: transformers.PreTrainedTokenizer,
+        data_args: DataArguments,
+        training_args: TrainingArguments,
+        # cache_path: str,
+        # n_samples_per_idx=4,
+    ):
+        super().__init__()
+        # from llava.data.simple_video_dataset import SimpleVideoDataset
+        from llava.data.simple_vila_webdataset import VILAWebDataset
+        print("[DEBUG] ", osp.abspath(data_path))
+        self.dataset = VILAWebDataset(
+            data_path=osp.abspath(data_path),
+            meta_path=f"{osp.abspath(data_path)}/wids-meta.json",
+            # cache_dir=cache_path,
+        )
+        # None: use original caption
+        # Folder path: use original caption
+        self.caption_choice = None
+        self.data_path = data_path
+        if data_args.caption_choice is not None:
+            self.caption_choice = data_args.caption_choice
+            print("[recap] Override LazyVideo caption using ", self.caption_choice)
+        print("total samples", len(self.dataset))
+        # InternVid: TODO
+        PROCESS_GROUP_MANAGER = get_pg_manager()
+        if PROCESS_GROUP_MANAGER is not None:
+            import torch.distributed as dist
+            sequence_parallel_size = training_args.seq_parallel_size
+            sequence_parallel_rank = PROCESS_GROUP_MANAGER.sp_rank
+        else:
+            sequence_parallel_size = 1
+        print("sequence_parallel_size", sequence_parallel_size)
+        rank = (
+            training_args.process_index // sequence_parallel_size if "RANK" in os.environ else 2
+        )  # int(os.environ["RANK"])
+        world_size = (
+            training_args.world_size // sequence_parallel_size if "WORLD_SIZE" in os.environ else 32
+        )  # int(os.environ["WORLD_SIZE"])
+        print(
+            "rank",
+            rank,
+            "world_size",
+            world_size,
+        )
+        self.rank = rank
+        # rank = int(os.environ["RANK"]) if "RANK" in os.environ else 2
+        # world_size = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 32
+        self.tokenizer = tokenizer
+        self.data_args = data_args
+        self.missing_uids = set()
+    def __len__(self):
+        return len(self.dataset)
+    @property
+    def modality_lengths(self):
+        # Estimate the number of tokens after tokenization, used for length-grouped sampling
+        length_list = []
+        for samples in self.data_list:
+            cur_len = sum([len(conv["text" if "text" in conv else "caption"].split()) for conv in samples])
+            # The unit of cur_len is "words". We assume 1 word = 2 tokens.
+            cur_len = cur_len + len(samples) * self.num_image_tokens // 2
+            length_list.append(cur_len)
+        return length_list
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        ADD_TEXT_PROMPT = False
+        num_video_frames = self.data_args.num_video_frames if hasattr(self.data_args, "num_video_frames") else 8
+        loader_fps = self.data_args.fps if hasattr(self.data_args, "fps") else 0.0
+        info = self.dataset[i]
+        caption = ""
+        # print(info)
+        if ".mp4" in info:
+            caption, video_path = info[".txt"], info[".mp4"]
+        else:
+            video_path = None
+            caption = "Empty video."
+        images, frames_loaded, _ = LazySupervisedDataset._load_video(
+            video_path, num_video_frames, loader_fps, self.data_args
+        )
+        if frames_loaded == 0:
+            caption = "Empty video."
+        if self.caption_choice is not None:
+            shard = info["__shard__"]
+            uuid = osp.join(info["__shard__"], info["__key__"])
+            url = info["__key__"]
+            tar_name = osp.basename(info["__shard__"])
+            try:
+                shard_json_path = osp.join(self.caption_choice, tar_name.replace(".tar", ".json"))
+                shard_json = lru_json_load(shard_json_path)
+                caption = shard_json[url]["summary"]["output"]
+            except (KeyError, FileNotFoundError, json.decoder.JSONDecodeError):
+                if uuid not in self.missing_uids:
+                    print("override caption not found for ", uuid)
+                    self.missing_uids.add(uuid)
+            # print(f"[DEBUG {uuid}]", caption)
+        frames_loaded_successfully = len(images)
+        if caption is None:
+            caption = ""
+        prompt = "<image>\n" * frames_loaded_successfully + caption
+        image_tensor = torch.stack([process_image(image, self.data_args, None) for image in images])
+        input_ids = tokenizer_image_token(
+            prompt,
+            self.tokenizer,
+            return_tensors="pt",
+        )
+        targets = copy.deepcopy(input_ids)
+        data_dict = dict(input_ids=input_ids, labels=targets, image=image_tensor)
+        return data_dict
+class DataCollatorForSupervisedDatasetSeqParallel:
+    """Collate examples for supervised fine-tuning.
+    This class is originally implemented by the LLaVA team and
+    modified by Haotian Tang."""
+    def __init__(
+        self,
+        tokenizer: transformers.PreTrainedTokenizer,
+        data_args: DataArguments,
+        training_args: TrainingArguments,
+        sp_degree: int,
+        sp_rank: int,
+        ring_degree: int,
+        ring_type: str,
+    ):
+        self.tokenizer = tokenizer
+        self.data_args = data_args
+        self.training_args = training_args
+        self.sp_degree = sp_degree
+        self.sp_rank = sp_rank
+        self.ring_degree = ring_degree
+        self.ring_type = ring_type
+    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
+        input_ids, labels, images = [], [], []
+        image_token_id = self.tokenizer.media_token_ids["image"]
+        video_token_id = self.tokenizer.media_token_ids["video"]
+        for instance in instances:
+            if not isinstance(instance["input_ids"], list):
+                input_ids.append(instance["input_ids"])
+            else:
+                input_ids += instance["input_ids"]
+            if not isinstance(instance["labels"], list):
+                labels.append(instance["labels"])
+            else:
+                labels += instance["labels"]
+            # Note (kentang-mit@: we do not directly push tensors to
+            # images, but list of tensors.
+            if "video" in instance:
+                instance["image"] = torch.cat(instance["video"])
+                video_id_pos = torch.where(input_ids[-1] == video_token_id)[0][0]
+                replace_ids = torch.Tensor(
+                    ([image_token_id] + self.tokenizer.encode("\n")) * instance["image"].shape[0],
+                    device=input_ids[-1].device,
+                )
+                input_ids[-1] = torch.cat(
+                    [input_ids[-1][:video_id_pos], replace_ids, input_ids[-1][video_id_pos + 1 :]]
+                ).to(input_ids[-1].dtype)
+                labels[-1] = torch.cat(
+                    [
+                        labels[-1][:video_id_pos],
+                        torch.Tensor([IGNORE_INDEX] * instance["image"].shape[0] * 2),
+                        labels[-1][video_id_pos + 1 :],
+                    ]
+                ).to(labels[-1].dtype)
+                instance.pop("video")
+            if "image" in instance:
+                cur_image = instance["image"]
+                assert len(cur_image.shape) == 4
+                # n_images, 3, size, size
+                if cur_image.shape[0] == 0:
+                    warnings.warn("loaded one sample without images.")
+                if not isinstance(instance["input_ids"], list):
+                    # datasets other than coyo, not packing >1 samples together
+                    images.append(cur_image)
+                else:
+                    # coyo-like datasets
+                    images.extend(cur_image.chunk(cur_image.size(0), 0))
+            else:
+                warnings.warn("loaded one sample without images.")
+                images.append([])
+        # kentang-mit@: we need to make sure these two lists have
+        # the same length. We will use input_ids to filter out images corresponding
+        # to truncated <image> tokens later.
+        max_num_images = max([len(_images) for _images in images])
+        for _images, _input_ids in zip(images, input_ids):
+            assert (
+                len(_images) == (_input_ids == image_token_id).sum().item()
+            ), f"Number mismatch between images and placeholder image tokens in 'len(_images) == (_input_ids == image_token_id).sum().item()'.\
+                Expect to have {len(_images)} images but only found {(_input_ids == image_token_id).sum().item()} images in tokens. \
+                Error input_ids: {_input_ids} {self.tokenizer.decode([x if x != -200 else 200 for x in _input_ids])}"
+        NUM_TOKENS_PER_IMAGE = self.data_args.num_image_tokens
+        if hasattr(self.data_args.image_processor, "crop_size"):
+            crop_size = self.data_args.image_processor.crop_size
+        else:
+            crop_size = self.data_args.image_processor.size
+        # Init the padding sample
+        seq_id = 0
+        while seq_id < len(input_ids):
+            # Skip the samples without images
+            dummy_image = torch.ones((1, 3, crop_size["height"], crop_size["width"]), device=input_ids[seq_id].device)
+            # dummy input_ids include one bos, one image token, and one eos
+            dummy_input_ids = torch.zeros_like(input_ids[seq_id][:3])
+            dummy_input_ids[0] = self.tokenizer.bos_token_id
+            dummy_input_ids[1] = image_token_id
+            dummy_input_ids[2] = self.tokenizer.eos_token_id
+            dummy_labels = copy.deepcopy(dummy_input_ids)
+            dummy_labels[:2] = IGNORE_INDEX
+            dummy_seqlen = NUM_TOKENS_PER_IMAGE + 2  # TODO: Check the hard coding of 2
+            dummy_position_ids = torch.arange(start=0, end=dummy_seqlen, dtype=torch.int32)
+            break
+        # Sort with the real length of the sequence
+        combined = sorted(
+            zip(input_ids, labels, images),
+            key=lambda x: len(x[2]) * (NUM_TOKENS_PER_IMAGE - 1) + x[0].size(-1),
+            reverse=True,  # Start Packing from the sequence with most images.
+        )
+        sorted_ids, sorted_labels, sorted_images = zip(*combined)
+        sorted_ids, sorted_labels, sorted_images = list(sorted_ids), list(sorted_labels), list(sorted_images)
+        max_seq_length = self.tokenizer.model_max_length  # len(sorted_ids[0])
+        max_sample_len = 0
+        batches = []
+        label_batches = []
+        position_ids = []
+        batch_images = []
+        seqlens_in_batch = []
+        i = 0
+        while i < len(sorted_ids):
+            current_batch = torch.tensor([], dtype=torch.int32)
+            current_label_batch = torch.tensor([], dtype=torch.int32)
+            current_position_ids = torch.tensor([], dtype=torch.int32)
+            current_batch_images = []
+            current_num_images = 0
+            current_len = 0
+            current_num_samples = 0
+            # Pack a few samples into one sample
+            while i < len(sorted_ids):
+                num_images = (sorted_ids[i] == image_token_id).sum().item()
+                num_image_tokens_added = num_images * (NUM_TOKENS_PER_IMAGE - 1)
+                num_incoming_tokens = sorted_ids[i].size(-1) + num_image_tokens_added
+                # Handle RingAttn_Varlen which requires `seqlens_in_batch` should be divisible by `ring_degree`
+                if self.ring_degree > 1:
+                    RING_PAD_TOKEN_INDEX = 2
+                    if self.ring_type == "ring_varlen":
+                        if num_incoming_tokens % self.sp_degree != 0:
+                            pad_len = self.sp_degree - num_incoming_tokens % self.sp_degree
+                            num_incoming_tokens += pad_len
+                            # pad `input_ids`
+                            pad_tensor = torch.full(
+                                (pad_len,), RING_PAD_TOKEN_INDEX, dtype=sorted_ids[i].dtype, device=sorted_ids[i].device
+                            )
+                            sorted_ids[i] = torch.cat([sorted_ids[i], pad_tensor])
+                            # pad `label`
+                            pad_label_tensor = torch.full(
+                                (pad_len,), IGNORE_INDEX, dtype=sorted_labels[i].dtype, device=sorted_labels[i].device
+                            )
+                            sorted_labels[i] = torch.cat([sorted_labels[i], pad_label_tensor])
+                    elif self.ring_type == "zigzag_ring_varlen":
+                        self.zigzag_sp_degree = self.sp_degree * 2
+                        if num_incoming_tokens % self.zigzag_sp_degree != 0:
+                            pad_len = self.zigzag_sp_degree - num_incoming_tokens % self.zigzag_sp_degree
+                            num_incoming_tokens += pad_len
+                            # pad `input_ids`
+                            pad_tensor = torch.full(
+                                (pad_len,), RING_PAD_TOKEN_INDEX, dtype=sorted_ids[i].dtype, device=sorted_ids[i].device
+                            )
+                            sorted_ids[i] = torch.cat([sorted_ids[i], pad_tensor])
+                            # pad `label`
+                            pad_label_tensor = torch.full(
+                                (pad_len,), IGNORE_INDEX, dtype=sorted_labels[i].dtype, device=sorted_labels[i].device
+                            )
+                            sorted_labels[i] = torch.cat([sorted_labels[i], pad_label_tensor])
+                    else:
+                        raise ValueError(f"Invalid ring_type: {self.ring_type}")
+                if num_incoming_tokens > max_seq_length:
+                    print(
+                        f"Warning: Skipping one packed sample with {num_incoming_tokens} tokens,\
+                        please consider increase max seq len {max_seq_length}."
+                    )
+                    i += 1
+                    continue
+                if (
+                    (current_num_images == 0)
+                    or (current_num_images < self.sp_degree)
+                    or (
+                        (current_num_images + num_images <= max_num_images)
+                        and (current_len + num_incoming_tokens <= max_sample_len)
+                    )
+                ) and (current_len + num_incoming_tokens <= max_seq_length):
+                    current_num_images += num_images
+                    current_len += num_incoming_tokens
+                    current_num_samples += 1
+                    current_position_ids = torch.cat(
+                        (current_position_ids, torch.arange(start=0, end=num_incoming_tokens)), dim=0
+                    )
+                    current_batch = torch.cat((current_batch, sorted_ids[i]), dim=0)
+                    sorted_labels[i][0] = IGNORE_INDEX
+                    current_label_batch = torch.cat((current_label_batch, sorted_labels[i]), dim=0)
+                    seqlens_in_batch.append(num_incoming_tokens)
+                    current_batch_images.extend(sorted_images[i])
+                    i += 1
+                    assert current_num_images == len(current_batch_images)
+                else:
+                    break
+            # Padding the sample with the dummy image sample, if there are no enough images
+            MAX_RETRY = self.sp_degree
+            num_retry = 0
+            while current_num_images < self.sp_degree and current_len < max_seq_length and num_retry <= MAX_RETRY:
+                current_num_images += dummy_image.size(0)
+                current_len += dummy_seqlen
+                current_num_samples += 1
+                current_position_ids = torch.cat((current_position_ids, dummy_position_ids), dim=0)
+                current_batch = torch.cat((current_batch, dummy_input_ids), dim=0)
+                current_label_batch = torch.cat((current_label_batch, dummy_labels), dim=0)
+                seqlens_in_batch.append(dummy_seqlen)
+                current_batch_images.extend(dummy_image)
+                # We pad from left side to ensure correct grad flow
+                # current_batch = torch.cat((dummy_input_ids, current_batch), dim=0)
+                # current_label_batch = torch.cat((dummy_labels, current_label_batch), dim=0)
+                # seqlens_in_batch.insert(0, dummy_seqlen)
+                # current_batch_images = torch.cat((dummy_image, current_batch_images), dim=0)
+                num_retry += 1
+            # Drop the samples that do not have enough images
+            if current_num_images < self.sp_degree:
+                print(f"Warning: Skipping one packed sample with {current_num_images} images")
+                seqlens_in_batch = seqlens_in_batch[:-current_num_samples]
+                continue
+            max_sample_len = max(max_sample_len, current_len)
+            batches.append(current_batch)
+            label_batches.append(current_label_batch)
+            position_ids.append(current_position_ids)
+            batch_images.append(current_batch_images)
+            try:
+                assert current_num_images == len(torch.where(current_batch == image_token_id)[0].tolist())
+            except AssertionError:
+                print(f"Error num_images on {self.sp_rank}", current_num_images)
+                print("current_batch", current_batch)
+                print(
+                    f"Error len(torch.where(batches[i] == image_token_id)[0].tolist() on {self.sp_rank}:",
+                    len(torch.where(current_batch == image_token_id)[0].tolist()),
+                )
+                print(f"Error len(current_batch_images) on {self.sp_rank}:", len(current_batch_images))
+                raise AssertionError
+        # Split for sequence parallelism
+        for i in range(len(batches)):
+            image_token_indices = torch.where(batches[i] == image_token_id)[0].tolist()
+            image_ids = torch.arange(0, len(image_token_indices), dtype=torch.int32)
+            batches[i] = extract_local_input_ids(
+                batches[i], image_token_indices, self.sp_rank, self.sp_degree, self.tokenizer.bos_token_id
+            )
+            label_batches[i] = extract_local_input_ids(
+                label_batches[i], image_token_indices, self.sp_rank, self.sp_degree, self.tokenizer.bos_token_id
+            )
+            batch_images[i] = torch.concat(
+                extract_local_from_list(batch_images[i], self.sp_rank, self.sp_degree), dim=0
+            )
+            H, W = batch_images[i].size(-2), batch_images[i].size(-1)
+            batch_images[i] = batch_images[i].reshape(-1, 3, W, H)
+            num_images = len(batch_images[i])
+            try:
+                assert num_images == len(torch.where(batches[i] == image_token_id)[0].tolist())
+            except AssertionError:
+                print(f"Error num_images on {self.sp_rank}", num_images)
+                print("batches[i]", batches[i])
+                print(
+                    f"Error len(torch.where(batches[i] == image_token_id)[0].tolist() on {self.sp_rank}:",
+                    len(torch.where(batches[i] == image_token_id)[0].tolist()),
+                )
+                print(f"Error batch_images[i] on {self.sp_rank}:", batch_images[i].shape)
+                raise AssertionError
+            position_ids[i] = extract_local_position_ids(
+                position_ids[i], image_token_indices, image_ids, self.sp_rank, self.sp_degree, NUM_TOKENS_PER_IMAGE - 1
+            )
+        input_ids = torch.nn.utils.rnn.pad_sequence(
+            batches, batch_first=True, padding_value=self.tokenizer.pad_token_id
+        )
+        labels = torch.nn.utils.rnn.pad_sequence(label_batches, batch_first=True, padding_value=IGNORE_INDEX)
+        seqlens_in_batch = [torch.tensor(x) for x in seqlens_in_batch]
+        seqlens_in_batch = torch.stack(seqlens_in_batch, axis=0)
+        seqlens_in_batch = seqlens_in_batch.flatten()
+        position_ids = torch.nn.utils.rnn.pad_sequence(position_ids, batch_first=True, padding_value=-1)
+        if batch_images:
+            batch_images = [torch.unbind(images) for images in batch_images]
+            flat_batch_images = [item for sublist in batch_images for item in sublist]
+        else:
+            flat_batch_images = None
+        batch = dict(
+            input_ids=input_ids,
+            labels=labels,
+            # notice that we inject attention mask here
+            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
+            seqlens_in_batch=seqlens_in_batch,
+            media={"image": flat_batch_images},
+            media_config={"image": {}},
+            position_ids=position_ids,
+        )
+        return batch
+def make_supervised_data_module(
+    tokenizer: PreTrainedTokenizer,
+    data_args: DataArguments,
+    training_args: TrainingArguments,
+) -> Dict:
+    """Make dataset and collator for supervised fine-tuning.
+    This function is originally implemented by the LLaVA team and
+    modified by Jason Lu, Haotian Tang and Ligeng Zhu."""
+    datasets_mixture.register_datasets_mixtures()
+    from .builder import build_dataset
+    train_dataset = build_dataset(data_args.data_mixture, data_args, training_args, tokenizer)
+    training_args.sample_lens = [len(d) for d in train_dataset.datasets]
+    PROCESS_GROUP_MANAGER = get_pg_manager()
+    if PROCESS_GROUP_MANAGER is None:
+        data_collator = DataCollator(tokenizer=tokenizer)
+    else:
+        sp_degree = training_args.seq_parallel_size
+        sp_rank = PROCESS_GROUP_MANAGER.sp_rank
+        ring_degree = PROCESS_GROUP_MANAGER.ring_degree
+        ring_type = PROCESS_GROUP_MANAGER.ring_type
+        data_collator = DataCollatorForSupervisedDatasetSeqParallel(
+            tokenizer=tokenizer,
+            data_args=data_args,
+            training_args=training_args,
+            sp_degree=sp_degree,
+            sp_rank=sp_rank,
+            ring_degree=ring_degree,
+            ring_type=ring_type,
+        )
+    return dict(
+        train_dataset=train_dataset,
+        data_collator=data_collator,
+    )

llava/data/datasets_mixture.py ADDED Viewed

	@@ -0,0 +1,80 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+from dataclasses import dataclass, field
+@dataclass
+class Dataset:
+    dataset_name: str
+    dataset_type: str = field(default="torch")
+    data_path: str = field(default=None, metadata={"help": "Path to the training data."})
+    meta_path: str = field(default=None, metadata={"help": "Path to the meta data for webdataset."})
+    image_path: str = field(default=None, metadata={"help": "Path to the training image data."})
+    speech_path: str = field(default=None, metadata={"help": "Path to the training speech data."})
+    caption_choice: str = field(default=None, metadata={"help": "Path to the caption directory for recaption."})
+    description: str = field(
+        default=None,
+        metadata={
+            "help": "Detailed desciption of where the data is from, how it is labelled, intended use case and the size of the dataset."
+        },
+    )
+    test_script: str = (None,)
+    maintainer: str = (None,)
+    ############## ############## ############## ############## ############## ##############
+    caption_choice: str = field(default=None, metadata={"help": "Path to the captions for webdataset."})
+    caption_choice_2: str = field(default=None, metadata={"help": "Path to the captions for webdataset."})
+    start_idx: float = field(default=-1, metadata={"help": "Start index of the dataset."})
+    end_idx: float = field(default=-1, metadata={"help": "Start index of the dataset."})
+DATASETS_LEGACY = {}
+def add_dataset(dataset):
+    if dataset.dataset_name in DATASETS_LEGACY:
+        # make sure the data_name is unique
+        warnings.warn(f"{dataset.dataset_name} already existed in DATASETS. Make sure the name is unique.")
+    assert "+" not in dataset.dataset_name, "Dataset name cannot include symbol '+'."
+    DATASETS_LEGACY.update({dataset.dataset_name: dataset})
+def register_datasets_mixtures():
+    ############## ############## ############## ############## ############## ##############
+    # Audio Datasets
+    ############## ############## ############## ############## ############## ##############
+    data_mixture_1 = Dataset(
+        dataset_name="data_mixture_1",
+        dataset_type="torch",
+        data_path="/path/to/your/data_mixture_1/train.json",
+    )
+    add_dataset(data_mixture_1)
+    data_mixture_2 = Dataset(
+        dataset_name="data_mixture_2",
+        dataset_type="torch",
+        data_path="/path/to/your/data_mixture_2/train.json",
+    )
+    add_dataset(data_mixture_2)
+    # Add more data mixtures below

llava/data/registry/datasets/audio_test.yaml ADDED Viewed

	@@ -0,0 +1,97 @@

+---
+Clotho-AQA-AQA:
+  _target_: llava.data.LLaVADataset
+  data_path: Clotho-AQA-AQA/test.json
+Music-AVQA-AQA_All:
+  _target_: llava.data.LLaVADataset
+  data_path: Music-AVQA-AQA_All/test.json
+CochlScene-SceneClassification:
+  _target_: llava.data.LLaVADataset
+  data_path: CochlScene-SceneClassification/test.json
+NSynth-Source:
+  _target_: llava.data.LLaVADataset
+  data_path: NSynth-Source/test.json
+NSynth-Instrument:
+  _target_: llava.data.LLaVADataset
+  data_path: NSynth-Instrument/test.json
+FSD50k-EventClassification:
+  _target_: llava.data.LLaVADataset
+  data_path: FSD50k-EventClassification/test.json
+Clotho-v2-AudioCaptioning:
+  _target_: llava.data.LLaVADataset
+  data_path: Clotho-v2-AudioCaptioning/test.json
+audiocaps-AudioCaptioning:
+  _target_: llava.data.LLaVADataset
+  data_path: audiocaps-AudioCaptioning/test.json
+ravdess-EmotionClassification:
+  _target_: llava.data.LLaVADataset
+  data_path: ravdess-EmotionClassification/val.json
+GTZAN-GenreClassification:
+  _target_: llava.data.LLaVADataset
+  data_path: GTZAN-GenreClassification/test.json
+UrbanSound8K-EventClassification:
+  _target_: llava.data.LLaVADataset
+  data_path: UrbanSound8K-EventClassification/train.json
+Medley-solos-DB-InstrClassification:
+  _target_: llava.data.LLaVADataset
+  data_path: Medley-solos-DB-InstrClassification/test.json
+ESC50-EventClassification:
+  _target_: llava.data.LLaVADataset
+  data_path: ESC50-EventClassification/train.json
+CREMA-D-EmotionClassification:
+  _target_: llava.data.LLaVADataset
+  data_path: CREMA-D-EmotionClassification/test.json
+IEMOCAP-EmotionClassification:
+  _target_: llava.data.LLaVADataset
+  data_path: IEMOCAP-EmotionClassification/test.json
+MELD-EmotionClassification:
+  _target_: llava.data.LLaVADataset
+  data_path: MELD-EmotionClassification/test.json
+MELD-SentimentClassification:
+  _target_: llava.data.LLaVADataset
+  data_path: MELD-SentimentClassification/test.json
+MMAU:
+  _target_: llava.data.LLaVADataset
+  data_path: MMAU/test.json
+MMAU-mini:
+  _target_: llava.data.LLaVADataset
+  data_path: MMAU/test-mini.json
+AudioEntailmentQA:
+  _target_: llava.data.LLaVADataset
+  data_path: AudioEntailmentQA/test.json
+SPGI-ASR:
+  _target_: llava.data.LLaVADataset
+  data_path: SPGI-ASR/val.json
+SWBD-ASR:
+  _target_: llava.data.LLaVADataset
+  data_path: SWBD-ASR/val.json
+LibriSpeech-ASR-clean:
+  _target_: llava.data.LLaVADataset
+  data_path: LibriSpeech-ASR/test_clean.json
+LibriSpeech-ASR-other:
+  _target_: llava.data.LLaVADataset
+  data_path: LibriSpeech-ASR/test_other.json
+VoxPopuli-ASR:
+  _target_: llava.data.LLaVADataset
+  data_path: VoxPopuli-ASR/test.json
+Europarl-ASR:
+  _target_: llava.data.LLaVADataset
+  data_path: Europarl-ASR/test.json
+CV-ASR:
+  _target_: llava.data.LLaVADataset
+  data_path: CV-ASR/test.json
+GigaSpeech-ASR:
+  _target_: llava.data.LLaVADataset
+  data_path: GigaSpeech-ASR/test.json
+CompA-R-AQA:
+  _target_: llava.data.LLaVADataset
+  data_path: CompA-R-AQA/test.json
+MuschoMusicQA:
+  _target_: llava.data.LLaVADataset
+  data_path: MuschoMusicQA/test.json
+CMM:
+  _target_: llava.data.LLaVADataset
+  data_path: CMM/test.json
+AIR-Bench:
+  _target_: llava.data.LLaVADataset
+  data_path: AIR-Bench/test.json

llava/data/registry/datasets/default.yaml ADDED Viewed

	@@ -0,0 +1,5 @@

+---
+dummy:
+    _target_: llava.data.DummyDataset
+    num_instances: 10000
+    comments: dummy dataset for testing

llava/data/registry/mixtures.yaml ADDED Viewed

	@@ -0,0 +1,78 @@

+---
+audio_speech_all:
+    # NOTE(review): entries containing '+' (e.g. "NSynth-Source+Clotho-v2-AudioCaptioning")
+    # look like leftovers of a '+'-joined mixture string; confirm the intended dataset names.
+    - CV-ASR_1
+    - MELD-EmotionClassification+
+    - BBCSoundEffects-AudioDescription
+    - SWBD-ASR_1
+    - WavCaps-SoundBible-AudioCaptioning
+    - AudioSet-Speech-Audio-QA
+    - SONYC-UST-EventClassification
+    - VoxPopuli-ASR_1
+    - FSD50k-EventClassification
+    - SalmonnQA
+    - emov-db-EmotionClassification
+    - LLARK_MagnaTagATune-mir+tess-EmotionClassification
+    - Europarl-ASR_1
+    - jl-corpus-EmotionClassification
+    - Ego-10-AudioCaptioning
+    - SPGI-ASR_1
+    - CREMA-D-EmotionClassification
+    - MusicBenchQA
+    - WavCaps-BBC_Sound_Effects-AudioCaptioning
+    - NSynth-Instrument
+    - SpokenSquadQA
+    - NSynth-MIR
+    - AudioEntailmentQA
+    - GigaSpeech-ASR_1
+    - WavCaps-AudioSet_SL-AudioCaptioning
+    - NonSpeech7k-EventClassification
+    - chime-home-EventClassification
+    - MusicCaps-AudioCaptioning
+    - LP-MusicCaps-MSD-AudioCaptioning
+    - Ego-30-AudioCaptioning
+    - NSynth-Source+Clotho-v2-AudioCaptioning
+    - LP-MusicCaps-MC-AudioCaptioning
+    - Clotho-AQA-EventClassification
+    - WavCaps-FreeSound-AudioCaptioning
+    - LLARK_MagnaTagATune-reasoning
+    - AudioSet-Temporal-Speech-Audio-QA
+    - TUT-EventClassification
+    - ESC50-EventClassification
+    - WavText5K-Tagging
+    - MELD-SentimentClassification
+    - Music-AVQA-AQA_All
+    - Music-AVQA-AVQA_All
+    - MACS-AudioCaptioning
+    - Medley-solos-DB-InstrClassification
+    - AudioSet-EventClassification
+    - OMGEmotion-EmotionClassification
+    - FMA-GenreClassification
+    - Epidemic_sound-AudioCaptioning
+    - CochlScene-SceneClassification
+    - LLARK_FMA-reasoning
+    - ravdess-EmotionClassification
+    - CompA-R-AQA
+    - MU-LLAMA-AQA
+    - musdbhq-InstrClassification
+    - UrbanSound8K-EventClassification
+    - audiocaps-AudioCaptioning
+    - VocalSound-VocalClassification
+    - CLAP_freesound-AudioCaptioning
+    - MMAUQA
+    - SongDescriber-AudioCaptioning
+    - HeySQuADQA
+    - Mira-AudioCaptioning
+    - Clotho-AQA-AQA
+    - LibriSpeech-ASR_1
+    - IEMOCAP-EmotionClassification
+    - AudioSetFullwoAudioMusicCaps-EventClassification
+    - MSP-PODCAST-Publish-1.9-EmotionClassification
+    - OpenAQA-AQA
+    - SoundDescs-AudioDescription
+    - LibriSQA
+    - LLARK_FMA-mir
+    - LP-MusicCaps-MTT-AudioCaptioning
+    - GTZAN-GenreClassification
+    - musdbhq-captioning
+    - YesNoQA

llava/entry.py ADDED Viewed

	@@ -0,0 +1,60 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+import os
+import typing
+from typing import List, Optional
+if typing.TYPE_CHECKING:
+    from transformers import PreTrainedModel
+else:
+    PreTrainedModel = None
+__all__ = ["load"]
+def load(
+    model_path: str,
+    model_base: Optional[str] = None,
+    devices: Optional[List[int]] = None,
+    **kwargs,
+) -> PreTrainedModel:
+    import torch
+    from llava.conversation import auto_set_conversation_mode
+    from llava.mm_utils import get_model_name_from_path
+    from llava.model.builder import load_pretrained_model
+    auto_set_conversation_mode(model_path)
+    model_name = get_model_name_from_path(model_path)
+    model_path = os.path.expanduser(model_path)
+    if os.path.exists(os.path.join(model_path, "model")):
+        model_path = os.path.join(model_path, "model")
+    # Set `max_memory` to constrain which GPUs to use
+    if devices is not None:
+        assert "max_memory" not in kwargs, "`max_memory` should not be set when `devices` is set"
+        kwargs.update(max_memory={device: torch.cuda.get_device_properties(device).total_memory for device in devices})
+    model = load_pretrained_model(model_path, model_name, model_base, **kwargs)[1]
+    return model

llava/eval/__init__.py ADDED Viewed

	@@ -0,0 +1,15 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+import os
+from llava.utils import io
+# Public names exported by the package.
+__all__ = ["EVAL_ROOT", "TASKS"]
+# Relative path string for the evaluation scripts directory.
+EVAL_ROOT = "scripts/eval"
+# Task registry loaded at import time from registry_audio.yaml, which sits next to this module.
+TASKS = io.load(os.path.join(os.path.dirname(__file__), "registry_audio.yaml"))

llava/eval/eval_audio_bench.py ADDED Viewed

	@@ -0,0 +1,117 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+import argparse
+import csv
+import itertools
+import json
+import os
+import torch
+from datasets import load_dataset
+from tqdm import tqdm
+import llava
+from llava import conversation as conversation_lib
+from llava.data.builder import DATASETS
+from llava.eval.mmmu_utils.eval_utils import parse_choice
+from llava.utils import distributed as dist
+from llava.utils import io
+from llava.utils.logging import logger
+def load_existing_ids(output_file):
+    if not os.path.exists(output_file):
+        return set(), []
+    try:
+        with open(output_file, "r") as f:
+            lines = f.readlines()
+            outputs = [json.loads(line) for line in lines]
+            processed_ids = {item["id"] for item in outputs}
+            return processed_ids, outputs
+    except Exception as e:
+        print(f"Error loading existing outputs: {e}")
+        return set(), []
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-path", type=str, default=None)
+    parser.add_argument("--model-base", type=str, default=None)
+    parser.add_argument("--task", type=str, default=None)
+    parser.add_argument("--conv-mode", type=str, default="auto")
+    parser.add_argument("--generation-config", type=json.loads)
+    parser.add_argument("--output-dir", type=str, default=None)
+    args = parser.parse_args()
+    # Set up distributed environment
+    dist.init()
+    devices = range(dist.local_rank(), torch.cuda.device_count(), dist.local_size())
+    torch.cuda.set_device(devices[0])
+    # Load stage 3 model with line 56
+    model = llava.load(args.model_base, model_base=None, devices=devices)
+    # Uncomment line 58-63 to load stage 3.5 model on top of stage 3 for thinking mode and long audio mode
+    # model = PeftModel.from_pretrained(
+    #         model,
+    #         args.model_path,
+    #         device_map="auto",
+    #         torch_dtype=torch.float16,
+    #     )
+    # Set up generation config
+    generation_config = model.default_generation_config
+    if args.generation_config is not None:
+        generation_config.update(**args.generation_config)
+    # Load data and chunk it
+    json_file = DATASETS[args.task]["data_path"]
+    instances = io.load(json_file)
+    instances = instances[dist.rank() :: dist.size()]
+    output_path = os.path.join(args.output_dir, f"outputs_{args.task}.jsonl")
+    processed_ids, outputs = load_existing_ids(output_path)
+    count = len(outputs)
+    # Run inference
+    new_outputs = []
+    for instance in tqdm(instances, disable=not dist.is_main()):
+        uuid = instance["id"]
+        sound_path = instance["sound"]
+        if sound_path in processed_ids:
+            continue  # Skip if already processed
+        sound = llava.Sound(sound_path)
+        conversations = instance["conversations"]
+        question = conversations[0]["value"]
+        response = model.generate_content([sound, question], generation_config=generation_config)
+        print("response", response)
+        output = {"id": sound_path, "question": question, "gt_answer": conversations[1]["value"],  "pred": response}
+        new_outputs.append(output)
+        count = count +1
+        if count % 20 == 0:
+            # Gather and save outputs
+            if dist.size() > 1:
+                outputs_new = dist.gather(new_outputs, dst=0)
+                if dist.is_main():
+                    outputs_new = list(itertools.chain(*outputs_new))
+                    final_outputs = outputs + outputs_new
+                    io.save(os.path.join(args.output_dir, f"outputs_{args.task}.jsonl"), final_outputs)
+            else:
+                final_outputs = outputs + new_outputs
+                io.save(os.path.join(args.output_dir, f"outputs_{args.task}.jsonl"), final_outputs)
+    if dist.size() > 1:
+        new_outputs = dist.gather(new_outputs, dst=0)
+        if not dist.is_main():
+            return
+        new_outputs = list(itertools.chain(*new_outputs))
+        final_outputs = outputs + new_outputs
+    io.save(os.path.join(args.output_dir, "outputs_"+str(args.task)+".jsonl"), final_outputs)
+if __name__ == "__main__":
+    main()

llava/eval/mmmu_utils/__pycache__/eval_utils.cpython-311.pyc ADDED Viewed

Binary file (2.58 kB). View file

llava/eval/mmmu_utils/eval_utils.py ADDED Viewed

	@@ -0,0 +1,61 @@

+# This file is originated from the official MMMU codebase:
+# https://github.com/MMMU-Benchmark/MMMU
+import random
+import numpy as np
+def parse_choice(response, all_choices, index2ans=None):
+    """
+    Parse the prediction from the generated response.
+    Return the predicted index e.g., A, B, C, D.
+    """
+    for char in [",", ".", "!", "?", ";", ":", "'"]:
+        response = response.strip(char)
+    response = " " + response + " "  # add space to avoid partial match
+    index_ans = True
+    ans_with_brack = False
+    candidates = []
+    for choice in all_choices:  # e.g., (A) (B) (C) (D)
+        if f"({choice})" in response:
+            candidates.append(choice)
+            ans_with_brack = True
+    if len(candidates) == 0:
+        for choice in all_choices:  # e.g., A B C D
+            if f" {choice} " in response:
+                candidates.append(choice)
+    # if all above doesn't get candidates, check if the content is larger than 5 tokens and try to parse the example
+    if len(candidates) == 0 and len(response.split()) > 5 and index2ans is not None:
+        for index, ans in index2ans.items():
+            if ans.lower() in response.lower():
+                candidates.append(index)
+                index_ans = False  # it's content ans.
+    if len(candidates) == 0:  # still not get answer, randomly choose one.
+        pred_index = random.choice(all_choices)
+    elif len(candidates) > 1:
+        start_indexes = []
+        if index_ans:
+            if ans_with_brack:
+                for can in candidates:
+                    index = response.rfind(f"({can})")
+                    start_indexes.append(index)  # -1 will be ignored anyway
+                # start_indexes = [generated_response.index(f'({can})') for can in candidates]
+            else:
+                for can in candidates:
+                    index = response.rfind(f" {can} ")
+                    start_indexes.append(index)
+        else:
+            for can in candidates:
+                index = response.lower().rfind(index2ans[can].lower())
+                start_indexes.append(index)
+        # get the last one
+        pred_index = candidates[np.argmax(start_indexes)]
+    else:  # if only one candidate, use it.
+        pred_index = candidates[0]
+    return pred_index

llava/eval/registry_audio.yaml ADDED Viewed

	@@ -0,0 +1,93 @@

+Clotho-AQA-AQA:
+  tags:
+  - local
+Music-AVQA-AQA_All:
+  tags:
+  - local
+CochlScene-SceneClassification:
+  tags:
+  - local
+NSynth-Source:
+  tags:
+  - local
+NSynth-Instrument:
+  tags:
+  - local
+FSD50k-EventClassification:
+  tags:
+  - local
+Clotho-v2-AudioCaptioning:
+  tags:
+  - local
+audiocaps-AudioCaptioning:
+  tags:
+  - local
+ravdess-EmotionClassification:
+  tags:
+  - local
+GTZAN-GenreClassification:
+  tags:
+  - local
+UrbanSound8K-EventClassification:
+  tags:
+  - local
+Medley-solos-DB-InstrClassification:
+  tags:
+  - local
+ESC50-EventClassification:
+  tags:
+  - local
+CREMA-D-EmotionClassification:
+  tags:
+  - local
+IEMOCAP-EmotionClassification:
+  tags:
+  - local
+MELD-EmotionClassification:
+  tags:
+  - local
+MELD-SentimentClassification:
+  tags:
+  - local
+MMAU:
+  tags:
+  - local
+AudioEntailmentQA:
+  tags:
+  - local
+SPGI-ASR:
+  tags:
+  - local
+SWBD-ASR:
+  tags:
+  - local
+LibriSpeech-ASR-clean:
+  tags:
+  - local
+LibriSpeech-ASR-other:
+  tags:
+  - local
+VoxPopuli-ASR:
+  tags:
+  - local
+Europarl-ASR:
+  tags:
+  - local
+CV-ASR:
+  tags:
+  - local
+GigaSpeech-ASR:
+  tags:
+  - local
+CompA-R-AQA:
+  tags:
+  - local
+MuschoMusicQA:
+  tags:
+  - local
+CMM:
+  tags:
+  - local
+AIR-Bench:
+  tags:
+  - local

llava/media.py ADDED Viewed

	@@ -0,0 +1,47 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+__all__ = ["Media", "File", "Image", "Video", "Speech",  "Sound"]
+class Media:
+    pass
+class File(Media):
+    def __init__(self, path: str) -> None:
+        self.path = path
+class Image(File):
+    pass
+class Video(File):
+    pass
+class Speech(File):
+    pass
+class Sound(File):
+    pass

llava/mm_utils.py ADDED Viewed

	@@ -0,0 +1,641 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+# dynamic_preprocess and find_closest_aspect_ratio are referenced from https://github.com/OpenGVLab/InternVL
+import base64
+import os
+import tempfile
+from io import BytesIO
+import numpy as np
+import torch
+from PIL import Image
+from transformers import StoppingCriteria
+from pydub import AudioSegment
+from torchvision import transforms
+import soundfile as sf
+from librosa import resample as librosa_resample
+import whisper
+import random
+from pytorchvideo.data.clip_sampling import ConstantClipsPerVideoSampler, UniformClipSampler
+DEFAULT_AUDIO_FRAME_SHIFT_MS = 10  # in milliseconds
+def get_frame_from_vcap(vidcap, num_frames=10, max_fps=0.0, fps=None, frame_count=None, video_file_name=None):
+    """Sample up to ``num_frames`` RGB frames from an already-opened capture.
+
+    Args:
+        vidcap: Capture object providing ``get`` and ``read``
+            (presumably a ``cv2.VideoCapture`` - confirm against callers).
+        num_frames: Number of evenly spaced frames to pick.
+        max_fps: Accepted for signature parity; not used in this function.
+        fps: Video frame rate. If ``fps`` or ``frame_count`` is ``None``,
+            both are re-read from ``vidcap``.
+        frame_count: Total number of frames in the video (see ``fps``).
+        video_file_name: Only used in the printed diagnostics.
+
+    Returns:
+        ``(images, count, frame_times)``: a list of PIL images, the number
+        reported for them, and the timestamps (``index / fps``) of the
+        ``num_frames`` evenly spaced indices. When the video is unreadable
+        (``fps`` or ``frame_count`` is 0, or it has at most one frame) the
+        result is ``num_frames`` references to one blank 720x720 image,
+        ``0`` and ``[0.]``.
+
+    Raises:
+        ValueError: If the read loop ends without a single decoded frame.
+    """
+    import cv2
+    if fps == None or frame_count == None:
+        # if one of fps or frame_count is None, still recompute
+        fps = vidcap.get(cv2.CAP_PROP_FPS)
+        frame_count = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+    if fps == 0 or frame_count == 0:
+        print(f"Video file not found. return empty images. {video_file_name}")
+        # The list holds num_frames references to the same Image object.
+        return [
+            Image.new("RGB", (720, 720)),
+        ] * num_frames, 0, [0.]
+    # NOTE(review): duration is computed but never used below.
+    duration = frame_count / fps
+    frame_interval = frame_count // num_frames
+    if frame_interval == 0 and frame_count <= 1:
+        print(f"frame_interval is equal to 0. return empty image. {video_file_name}")
+        return [
+            Image.new("RGB", (720, 720)),
+        ] * num_frames, 0, [0.]
+    # print("duration:", duration, "frames:", frame_count, "intervals:", frame_interval)
+    images = []
+    count = 0
+    success = True
+    # Evenly spaced frame indices over the whole video and their timestamps.
+    frame_indices = np.linspace(0, frame_count - 1, num_frames, dtype=int)
+    frame_times = [frame / fps for frame in frame_indices]
+    while success:
+        # print("frame_count:", frame_count, "count:", count, "num_frames:", num_frames, "frame_interval:", frame_interval)
+        if frame_count >= num_frames:
+            # Enough frames: decode sequentially and keep only the selected indices.
+            success, frame = vidcap.read()
+            if count in frame_indices:
+                try:
+                    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    im_pil = Image.fromarray(img)
+                    images.append(im_pil)
+                except BaseException:
+                    # Conversion failed: retry the loop without advancing count
+                    # (the loop stops if the read itself reported failure).
+                    continue
+                if len(images) >= num_frames:
+                    return images, num_frames, frame_times
+            count += 1
+        else:
+            # Left padding frames if the video is not long enough
+            # NOTE(review): no padding happens here; every readable frame is kept,
+            # so len(images) can differ from len(frame_times) - confirm intent.
+            success, frame = vidcap.read()
+            if success:
+                try:
+                    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    im_pil = Image.fromarray(img)
+                    images.append(im_pil)
+                except BaseException:
+                    continue
+                count += 1
+            else:
+                break
+    if len(images) == 0:
+        raise ValueError("Did not find enough frames in the video. return empty image.")
+    return images, len(images), frame_times
+def get_frame_from_vcap_with_fps(vidcap, num_frames=10, max_fps=0.0, fps=None, frame_count=None, video_file_name=None):
+    """
+    Sample frames from an opened capture, bounded by both a frame budget and a maximum sampling rate.
+
+    num_frames is the max number of frames the model can support.
+    frame_count is the number of frames in the input video.
+    max_fps is the max FPS of the model can support.
+    fps is the fps of the input video.
+
+    If the video lasts at least ``num_frames / max_fps`` seconds, ``num_frames``
+    evenly spaced frames are taken; otherwise ``int(duration * max_fps)`` evenly
+    spaced frames are taken.
+
+    Returns ``(images, count, frame_times)``. Whenever no usable frame can be
+    produced, the result is a list of ``int(random.uniform(2, 8 * max_fps))``
+    references to one blank 720x720 image, together with ``0`` and ``[0.]``.
+
+    NOTE: ``max_fps`` must be greater than 0 for readable videos; the default
+    0.0 raises ZeroDivisionError at the duration comparison below.
+    """
+    import random
+    import cv2
+    if fps == None or frame_count == None:
+        # if one of fps or frame_count is None, still recompute
+        fps = vidcap.get(cv2.CAP_PROP_FPS)
+        frame_count = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+    if fps == 0 or frame_count == 0:
+        print(f"Video file not found. return empty images. {video_file_name}")
+        # The placeholder length is drawn at random, so it varies between calls.
+        empty_video_frames = int(random.uniform(2, 8 * max_fps))
+        return [
+            Image.new("RGB", (720, 720)),
+        ] * empty_video_frames, 0, [0.]
+    duration = frame_count / fps
+    # print("duration:", duration, "frames:", frame_count, "fps:", fps, "num_frames:", num_frames, "max_fps:", max_fps)
+    # If the video is too long (longer than max_fps and num_frames can support),
+    # we will use lower fps to sample frames.
+    if duration >= num_frames / max_fps:
+        frame_interval = frame_count // num_frames
+        # If the video is too short, we will skip the video if there is only one frame.
+        if frame_interval == 0 and frame_count <= 1:
+            print(f"frame_interval is equal to 0. return empty image. {video_file_name}")
+            empty_video_frames = int(random.uniform(2, 8 * max_fps))
+            return [
+                Image.new("RGB", (720, 720)),
+            ] * empty_video_frames, 0, [0.]
+        images = []
+        count = 0
+        success = True
+        frame_indices = np.linspace(0, frame_count - 1, num_frames, dtype=int)
+        frame_times = [frame / fps for frame in frame_indices]
+        while success:
+            if frame_count >= num_frames:
+                # success, frame = vidcap.read()
+                if count in frame_indices:
+                    # Selected index: decode the frame.
+                    success, frame = vidcap.read()
+                    try:
+                        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                        im_pil = Image.fromarray(img)
+                        images.append(im_pil)
+                    except:
+                        # print("Failed to read frame:", count)
+                        # count is not advanced on failure; the loop ends if the read failed.
+                        continue
+                    if len(images) >= num_frames:
+                        return images, num_frames, frame_times
+                else:
+                    # Unselected index: advance the capture without decoding.
+                    success = vidcap.grab()
+                count += 1
+            else:
+                # Left padding frames if the video is not long enough
+                # NOTE(review): no padding happens here; every readable frame is kept.
+                success, frame = vidcap.read()
+                if success:
+                    try:
+                        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                        im_pil = Image.fromarray(img)
+                        images.append(im_pil)
+                    except:
+                        # print("Failed to read frame:", count)
+                        continue
+                    count += 1
+                else:
+                    break
+    else:
+        # Short video: the number of frames is limited by max_fps.
+        frames_required = int(duration * max_fps)
+        frame_indices = np.linspace(0, frame_count - 1, frames_required, dtype=int)
+        if frames_required == 0:
+            print(f"frames_required is fewer than 2. Duration {duration}, return empty image.")
+            empty_video_frames = int(random.uniform(2, 8 * max_fps))
+            return [
+                Image.new("RGB", (720, 720)),
+            ] * empty_video_frames, 0, [0.]
+        elif frames_required == 1:
+            # A single required frame is widened to two: the first and the last index.
+            frame_indices = np.linspace(0, frame_count - 1, 2, dtype=int)
+        images = []
+        count = 0
+        looked = 0
+        success = True
+        while success:
+            success, frame = vidcap.read()
+            # looked counts every read attempt; count only the frames that were kept.
+            if success and (looked in frame_indices):
+                try:
+                    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    im_pil = Image.fromarray(img)
+                    images.append(im_pil)
+                except:
+                    continue
+                count += 1
+            looked += 1
+    # Timestamps of the selected indices (recomputed here for both branches).
+    frame_times = [frame / fps for frame in frame_indices]
+    if len(images) == 0:
+        empty_video_frames = int(random.uniform(2, 8 * max_fps))
+        return [
+            Image.new("RGB", (720, 720)),
+        ] * empty_video_frames, 0, [0.]
+    else:
+        return images, len(images), frame_times
def opencv_extract_frames(vpath_or_bytesio, frames=6, max_fps=0.0, fps=None, frame_count=None):
    """
    Extract frames from a video using OpenCV.

    Args:
        vpath_or_bytesio (str or BytesIO): Path to the video file, or a BytesIO
            object holding the encoded video (assumed to be mp4).
        frames (int): Number of frames requested; forwarded to the extraction helper.
        max_fps (float): When greater than 0.0, extraction is delegated to
            `get_frame_from_vcap_with_fps`; otherwise to `get_frame_from_vcap`.
        fps (float, optional): Forwarded unchanged to the extraction helper.
        frame_count (int, optional): Forwarded unchanged to the extraction helper.

    Returns:
        Whatever the selected helper returns.
        NOTE(review): presumably a tuple of (PIL images, frame count, frame
        timestamps) -- confirm against the helpers, which are defined elsewhere.

    Raises:
        NotImplementedError: If the type of `vpath_or_bytesio` is not supported.
    """
    # Reject unsupported inputs before paying for the OpenCV import.
    if not isinstance(vpath_or_bytesio, (str, BytesIO)):
        raise NotImplementedError(type(vpath_or_bytesio))

    import cv2

    def _extract(video_path):
        # Open the capture, run the matching helper, and always release the handle.
        vidcap = cv2.VideoCapture(video_path)
        extractor = get_frame_from_vcap_with_fps if max_fps > 0.0 else get_frame_from_vcap
        try:
            return extractor(
                vidcap, frames, max_fps, fps=fps, frame_count=frame_count, video_file_name=video_path
            )
        finally:
            vidcap.release()

    if isinstance(vpath_or_bytesio, str):
        return _extract(vpath_or_bytesio)

    # assuming mp4
    with tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as temp_video:
        temp_video.write(vpath_or_bytesio.read())
        # OpenCV reopens the file by name, so buffered bytes must reach disk first.
        temp_video.flush()
        return _extract(temp_video.name)
def load_image_from_base64(image):
    """Decode a base64-encoded image payload and open it as a PIL image."""
    raw_bytes = base64.b64decode(image)
    buffer = BytesIO(raw_bytes)
    return Image.open(buffer)
def expand2square(pil_img, background_color):
    """
    Pad a PIL image to a square canvas, keeping the content centred.

    Parameters:
    - pil_img: The PIL image to be expanded.
    - background_color: Fill colour for the padding. For mode "L" images only
      its first component is used.

    Returns:
    - The input image itself when it is already square; otherwise a new image
      whose side equals the longer edge. Wide images are padded above and
      below, tall images to the left and right.
    """
    w, h = pil_img.size
    if pil_img.mode == "L":
        # Single-channel canvas takes a scalar fill value.
        background_color = background_color[0]
    if w == h:
        return pil_img

    side = max(w, h)
    canvas = Image.new(pil_img.mode, (side, side), background_color)
    if w > h:
        offset = (0, (w - h) // 2)
    else:
        offset = ((h - w) // 2, 0)
    canvas.paste(pil_img, offset)
    return canvas
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """
    Pick the (columns, rows) grid whose aspect ratio is closest to `aspect_ratio`.

    Candidates are scanned in the given order. On an exact tie in distance the
    later candidate replaces the current choice only when the image area
    exceeds half of the area that candidate's tile grid would cover. Falls back
    to (1, 1) when `target_ratios` is empty.
    """
    image_area = width * height
    chosen = (1, 1)
    smallest_gap = float("inf")
    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
            continue
        if gap == smallest_gap and image_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            # Equal distance: prefer the larger grid only for sufficiently large images.
            chosen = candidate
    return chosen
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=384, use_thumbnail=True):
    """
    Resize an image to the best-matching tile grid and cut it into square tiles.

    The grid is chosen by `find_closest_aspect_ratio` among all (columns, rows)
    pairs whose tile count lies in [min_num, max_num]. Tiles are returned in
    row-major order; when `use_thumbnail` is set and more than one tile was
    produced, a whole-image thumbnail of size `image_size` is appended.
    """
    src_w, src_h = image.size
    src_ratio = src_w / src_h

    # Enumerate every candidate grid, smallest tile count first.
    grids = {
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num
    }
    grids = sorted(grids, key=lambda g: g[0] * g[1])

    best_grid = find_closest_aspect_ratio(src_ratio, grids, src_w, src_h, image_size)
    target_width = image_size * best_grid[0]
    target_height = image_size * best_grid[1]
    blocks = best_grid[0] * best_grid[1]

    resized_img = image.resize((target_width, target_height))
    tiles = []
    for index in range(blocks):
        # Row-major walk over the grid: row = index // tiles_per_row, col = index % tiles_per_row.
        row, col = divmod(index, target_width // image_size)
        left, top = col * image_size, row * image_size
        tiles.append(resized_img.crop((left, top, left + image_size, top + image_size)))
    assert len(tiles) == blocks

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles
def dynamic_s2_preprocess(image, s2_scales=[384, 768, 1152], max_num=12, image_size=384):
    """
    Build multi-scale tiles for an image.

    Every scale except the last is resized to a fixed square grid of
    (scale // s2_scales[0]) tiles per side. The last scale uses the grid chosen
    by `find_closest_aspect_ratio`, constrained to at least
    (s2_scales[-1] // s2_scales[0]) ** 2 and at most `max_num` tiles.

    Returns:
        (tiles, (rows, columns)) where `tiles` lists all crops in order
        (fixed scales first, each in row-major order) and the second item is
        the grid used for the last scale.
    """
    src_w, src_h = image.size
    src_ratio = src_w / src_h
    min_num = (s2_scales[-1] // s2_scales[0]) ** 2  # at least use number of tiles as the largest scale
    tiles = []

    def append_tiles(resized, tile_count, row_width):
        # Cut `tile_count` square crops out of `resized`, walking row by row.
        for index in range(tile_count):
            row, col = divmod(index, row_width // image_size)
            left, top = col * image_size, row * image_size
            tiles.append(resized.crop((left, top, left + image_size, top + image_size)))

    # ---- All but the last scale: fixed square grid --------------------------------
    for scale in s2_scales[:-1]:
        per_side = scale // s2_scales[0]
        side_px = image_size * per_side
        append_tiles(image.resize((side_px, side_px)), per_side**2, side_px)

    # ---- Last scale: grid follows the image's own aspect ratio ---------------------
    grids = {
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num
    }
    grids = sorted(grids, key=lambda g: g[0] * g[1])
    best_grid = find_closest_aspect_ratio(src_ratio, grids, src_w, src_h, image_size)

    target_width = image_size * best_grid[0]
    target_height = image_size * best_grid[1]
    append_tiles(image.resize((target_width, target_height)), best_grid[0] * best_grid[1], target_width)

    return tiles, (best_grid[1], best_grid[0])
def dynamic_s2_process_images_and_prompt(images, data_args, image_folder=None):
    """
    Run `process_image` with dynamic-S2 tiling on every image and batch the results.

    Args:
        images: Iterable of image inputs accepted by `process_image`.
        data_args: Passed through to `process_image`.
            NOTE(review): the tuple unpacking below assumes
            `data_args.image_aspect_ratio` contains "dynamic_s2" -- confirm with callers.
        image_folder: Optional folder passed through to `process_image`.

    Returns:
        (all_images, all_block_size): the per-image tensors concatenated with
        `torch.cat` (or None when `images` is empty) and the list of per-image
        block sizes.
    """
    all_images = []
    all_block_size = []
    for img in images:
        processed_images, block_size = process_image(img, data_args, image_folder, enable_dynamic_s2=True)
        all_images.append(processed_images)
        all_block_size.append(block_size)
    batched = torch.cat(all_images) if all_images else None
    return batched, all_block_size
def process_image(
    image_file, data_args, image_folder, enable_dynamic_res=False, enable_dynamic_s2=False, max_tiles=None
):
    """
    Load one image and convert it to pixel tensor(s) with `data_args.image_processor`.

    Args:
        image_file: Either a path string (joined with `image_folder` when given)
            or an already-loaded image object exposing `convert`/`resize`.
        data_args: Object providing `image_processor`, `image_aspect_ratio` and,
            for the dynamic modes, `s2_scales`, `min_tiles` and `max_tiles`.
        image_folder: Optional directory prefix for path inputs.
        enable_dynamic_res: Allow the "dynamic" tiling branch.
        enable_dynamic_s2: Allow the "dynamic_s2" multi-scale tiling branch.
        max_tiles: Overrides `data_args.max_tiles` in the "dynamic" branch.

    Returns:
        - dynamic_s2 branch: (stacked tile tensor, block_size)
        - dynamic branch: stacked tile tensor
        - otherwise: the single `pixel_values` entry produced by the processor
    """
    processor = data_args.image_processor
    if isinstance(image_file, str):
        image_path = image_file if image_folder is None else os.path.join(image_folder, image_file)
        # Close the file handle deterministically; convert() yields an independent copy.
        with Image.open(image_path) as opened:
            image = opened.convert("RGB")
    else:
        # Non-path inputs are used as image objects directly.
        image = image_file.convert("RGB")

    if hasattr(processor, "crop_size"):
        # CLIP vision tower
        crop_size = processor.crop_size
    else:
        # SIGLIP vision tower
        assert hasattr(processor, "size")
        crop_size = processor.size

    aspect_mode = data_args.image_aspect_ratio

    if "dynamic_s2" in aspect_mode and enable_dynamic_s2:
        assert crop_size["height"] == crop_size["width"]
        tiles, block_size = dynamic_s2_preprocess(
            image, s2_scales=data_args.s2_scales, max_num=data_args.max_tiles, image_size=crop_size["height"]
        )
        tiles = [processor.preprocess(tile, return_tensors="pt")["pixel_values"][0] for tile in tiles]
        return torch.stack(tiles), block_size

    # Note: "dynamic" is a substring of "dynamic_s2", so this also matches that mode
    # whenever only enable_dynamic_res is set.
    if "dynamic" in aspect_mode and enable_dynamic_res:
        assert crop_size["height"] == crop_size["width"]
        max_num = max_tiles if max_tiles is not None else data_args.max_tiles
        tiles = dynamic_preprocess(image, min_num=data_args.min_tiles, max_num=max_num, image_size=crop_size["height"])
        tiles = [processor.preprocess(tile, return_tensors="pt")["pixel_values"][0] for tile in tiles]
        return torch.stack(tiles)

    if aspect_mode == "resize":
        image = image.resize((crop_size["width"], crop_size["height"]))

    if aspect_mode == "pad":
        # Pad to a square using the processor's mean colour, via the module-level helper.
        image = expand2square(image, tuple(int(x * 255) for x in processor.image_mean))

    # For every remaining mode the vision encoder's default preprocessing applies:
    # For CLIP, default is central crop
    # For Radio, default is central crop
    # For Siglip, default is resize
    # For InternVIT, default is resize
    return processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
def get_num_windows(T, sr, max_num_window=5):
    """
    Compute how many 30-second windows cover `T` samples at sample rate `sr`.

    Returns:
        (num_windows, full_length): the window count, capped at
        `max_num_window`, and the total number of samples those windows span.
        Inputs no longer than one window still report one full window.
    """
    window_length = int(30.0 * sr)
    window_overlap = int(0.0 * sr)

    def covered(count):
        # Samples spanned by `count` windows that overlap by `window_overlap`.
        return count * window_length - (count - 1) * window_overlap

    if T <= window_length:
        return 1, window_length
    if T >= covered(max_num_window):
        return max_num_window, covered(max_num_window)

    hop = float(window_length - window_overlap)
    count = 1 + int(np.ceil((T - window_length) / hop))
    return count, covered(count)
def load_audio(file_path, target_sr=16000, duration=30.0, start=0.0):
    """
    Load a mono audio excerpt, resample it to `target_sr` and peak-normalise it.

    Args:
        file_path (str): Path to the audio file. Paths ending in '.mp3' are
            decoded with `AudioSegment`; everything else with `sf.SoundFile`.
        target_sr (int): Sample rate of the returned signal.
        duration (float): Maximum excerpt length in seconds.
        start (float): Excerpt start offset in seconds.

    Returns:
        A 1-D array. Signals containing negative samples are scaled so the
        largest magnitude is 1; non-negative signals are remapped with
        2 * x / max - 1. All-zero or empty excerpts are returned unscaled.

    Raises:
        ValueError: If an mp3 has a sample width other than 2 or 4 bytes.
    """
    if file_path.endswith('.mp3'):
        audio = AudioSegment.from_file(file_path)
        # Always slice so that `start` is honoured even when the clip is shorter
        # than start + duration; slice positions are in milliseconds.
        audio = audio[start * 1000:(start + duration) * 1000]
        if audio.frame_rate != target_sr:
            audio = audio.set_frame_rate(target_sr)
        if audio.channels > 1:
            audio = audio.set_channels(1)
        data = np.array(audio.get_array_of_samples())
        if audio.sample_width == 2:
            data = data.astype(np.float32) / np.iinfo(np.int16).max
        elif audio.sample_width == 4:
            data = data.astype(np.float32) / np.iinfo(np.int32).max
        else:
            raise ValueError("Unsupported bit depth: {}".format(audio.sample_width))
    else:
        with sf.SoundFile(file_path) as audio:
            original_sr = audio.samplerate
            channels = audio.channels
            audio.seek(int(start * original_sr))
            # Read `duration` seconds from the seek position (fewer if the file ends first).
            data = audio.read(int(duration * original_sr))
            if data.size and (data.max() > 1 or data.min() < -1):
                data = data / max(abs(data.max()), abs(data.min()))
        if original_sr != target_sr:
            if channels == 1:
                data = librosa_resample(data.flatten(), orig_sr=original_sr, target_sr=target_sr)
            else:
                # Resample all channels, then keep only the first one.
                data = librosa_resample(data.T, orig_sr=original_sr, target_sr=target_sr)[0]
        else:
            if channels != 1:
                data = data.T[0]

    if data.size:
        lowest, highest = data.min(), data.max()
        if lowest >= 0:
            # Non-negative signal: stretch [0, max] onto [-1, 1]; skip silent audio
            # to avoid dividing by zero.
            if highest > 0:
                data = 2 * data / highest - 1.0
        else:
            data = data / max(abs(highest), abs(lowest))
    assert len(data.shape) == 1, data.shape
    return data
def process_images(images, image_processor, model_cfg, enable_dynamic_res=False, max_tiles=None):
    """
    Preprocess a list of images with `process_image` and batch them into one tensor.

    `model_cfg.image_processor` is overwritten with `image_processor` as a side
    effect. Rank-4 per-image results are concatenated along dim 0, rank-3
    results are stacked along a new dim 0.

    Raises:
        ValueError: If the per-image results differ in shape, or their rank is
            neither 3 nor 4.
    """
    model_cfg.image_processor = image_processor
    tensors = []
    for image in images:
        tensors.append(
            process_image(image, model_cfg, None, enable_dynamic_res=enable_dynamic_res, max_tiles=max_tiles)
        )

    if any(t.shape != tensors[0].shape for t in tensors):
        raise ValueError("The shape of images in new_images is different!")

    rank = len(tensors[0].shape)
    if rank == 4:
        return torch.cat(tensors, dim=0)
    if rank == 3:
        return torch.stack(tensors, dim=0)
    raise ValueError(f"new_images rank does not equal to 4, rank: {rank}")
def process_sounds(sounds):
    """Convert `sounds` into a torch tensor via `torch.tensor`."""
    return torch.tensor(sounds)
def process_sound_masks(masks):
    """Convert the first entry of `masks` into a torch tensor; later entries are ignored."""
    first_mask = masks[0]
    return torch.tensor(first_mask)
def tokenizer_image_token(prompt, tokenizer, return_tensors=None):
    """Tokenize `prompt` and return the first entry of the resulting `input_ids`."""
    encoded = tokenizer(prompt, return_tensors=return_tensors)
    return encoded.input_ids[0]
def is_gemma_tokenizer(tokenizer):
    """Return True when the tokenizer's class name contains "gemma" (case-insensitive)."""
    class_name = tokenizer.__class__.__name__
    return class_name.lower().find("gemma") != -1
def get_model_name_from_path(model_path):
    """
    Derive a model name from a filesystem-style path.

    Leading and trailing slashes are ignored. When the last component starts
    with "checkpoint-" and has a parent component, the result is
    "<parent>_<checkpoint>"; otherwise it is the last component alone (this
    includes a bare "checkpoint-N" path, which has no parent to prepend).
    """
    parts = model_path.strip("/").split("/")
    if len(parts) > 1 and parts[-1].startswith("checkpoint-"):
        return parts[-2] + "_" + parts[-1]
    return parts[-1]
class KeywordsStoppingCriteria(StoppingCriteria):
    """
    Stop generation once every sequence in the batch ends with, or contains in
    its most recent tokens, one of the given keywords.

    Args:
        keywords: Keyword strings to look for.
        tokenizer: Tokenizer used both to encode the keywords and to decode output.
        input_ids: Prompt ids; `input_ids.shape[1]` is recorded as the prompt length.
    """

    def __init__(self, keywords, tokenizer, input_ids):
        self.keywords = keywords
        self.keyword_ids = []
        self.max_keyword_len = 0
        for keyword in keywords:
            cur_keyword_ids = tokenizer(keyword).input_ids
            # Drop a leading BOS token so the ids match what appears mid-sequence.
            if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
                cur_keyword_ids = cur_keyword_ids[1:]
            self.max_keyword_len = max(self.max_keyword_len, len(cur_keyword_ids))
            self.keyword_ids.append(torch.tensor(cur_keyword_ids))
        self.tokenizer = tokenizer
        self.start_len = input_ids.shape[1]

    def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Check a single sequence (batch of one) for any keyword."""
        offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
        self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
        for keyword_id in self.keyword_ids:
            # torch.equal is False on a length mismatch instead of broadcasting or
            # raising when the sequence is shorter than the keyword.
            if torch.equal(output_ids[0, -keyword_id.shape[0] :], keyword_id):
                return True
        if offset <= 0:
            # Nothing generated yet; slicing with -0 would decode the whole prompt.
            return False
        outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
        return any(keyword in outputs for keyword in self.keywords)

    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True only when every sequence in the batch has hit a keyword."""
        return all(
            self.call_for_batch(output_ids[i].unsqueeze(0), scores) for i in range(output_ids.shape[0])
        )

llava/model/FloatPointQuantizeTorch.py ADDED Viewed

	@@ -0,0 +1,85 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+import math
+import torch
def floatExMy_quantize_torch(x, e_bit, m_bit, stochastic):
    """
    Simulate quantization of `x` to a float format described by `e_bit` and `m_bit`.

    The exponent floor(log2|x|) is clamped to [-(2**(e_bit-1)) + 2, 2**(e_bit-1)];
    the fractional part of the mantissa is rounded onto 2**m_bit levels. With
    `stochastic` set, uniform noise in [-0.5, 0.5) is added before rounding.
    The result has the same dtype and device as `x`.
    """
    signs = x.sign()
    magnitude = x.abs()
    exp_min = -(2 ** (e_bit - 1)) + 2
    exp_max = 2 ** (e_bit - 1)
    mant_levels = 2**m_bit

    exponent = torch.clamp(torch.floor(torch.log2(magnitude)), min=exp_min, max=exp_max)
    mantissa = magnitude / torch.exp2(exponent)
    whole = torch.floor(mantissa)
    scaled_frac = (mantissa - whole) * mant_levels
    if stochastic:
        scaled_frac.add_(scaled_frac.new(scaled_frac.shape).uniform_(-0.5, 0.5))
    quantized_mantissa = whole + torch.round(scaled_frac) / mant_levels

    return (signs * (2**exponent) * quantized_mantissa).to(x)
def floatExM0_quantize_torch(x, e_bit, stochastic):
    """
    Simulate quantization of `x` to a power-of-two (exponent-only) format.

    log2|x| is rounded to the nearest integer after clamping to
    [Elow - 1, Ehigh], where Elow = -(2**(e_bit-1)) + 1 and Ehigh = 2**(e_bit-1).
    Values whose rounded exponent is not above Elow are flushed to zero.
    With `stochastic` set, uniform noise in [-0.5, 0.5) plus a constant
    log-domain bias is added to the exponent before rounding.
    The result has the same dtype and device as `x`.
    """
    sign, x_abs = x.sign(), x.abs()
    Elow, Ehigh = -(2 ** (e_bit - 1)) + 1, 2 ** (e_bit - 1)
    expo = torch.log2(x_abs)
    if stochastic:
        noise = expo.new(expo.shape).uniform_(-0.5, 0.5)
        log_bias = math.log2(4 / 3) - 1 / 2
        # Tensor.add is out-of-place; the sum must be assigned for the noise to take effect.
        expo = expo + noise + log_bias
    expo = torch.clamp(expo, min=Elow - 1, max=Ehigh)
    expo = torch.round(expo)
    y = sign * (2**expo) * (expo > Elow)  # When underflow, set the value to 0
    return y.to(x)
def Dynamic_quantize_torch(x, bit, stochastic):
    """
    Simulate a decimal-exponent ("dynamic") quantization of `x` with `bit` bits.

    The base-10 exponent ceil(log10|x|) is clamped from below at 2 - bit; the
    remaining mantissa is rounded onto 2**(bit - 2 - |exponent|) levels.
    Results whose magnitude is not above 1.01 * 10**(1 - bit) are set to zero.

    Raises:
        NotImplementedError: If `stochastic` is truthy.
    """
    if stochastic:
        raise NotImplementedError("Dynamic Tree quantization does not support stochastic")

    signs = x.sign()
    magnitude = x.abs()
    exponent = torch.clamp(torch.ceil(torch.log10(magnitude)), min=2 - bit)
    unit_mantissa = (10 * magnitude / torch.pow(10, exponent) - 1) / 9  # Range from 0 - 1
    levels = 2 ** (bit - 2 - exponent.abs())
    snapped = torch.round(unit_mantissa * levels) / levels * 9 + 1

    y = signs * (10**exponent) * snapped / 10
    keep = y.abs() > 1.01 * 10 ** (1 - bit)
    return (y * keep).to(x)
def ZeroDynamic_quantize_torch(x, bit, stochastic):
    """
    Simulate a decimal-exponent ("dynamic") quantization of `x` with `bit` bits,
    without flushing small results to zero.

    The base-10 exponent ceil(log10|x|) is clamped from below at 2 - bit; the
    remaining mantissa is rounded onto 2**(bit - 2 - |exponent|) levels.

    Raises:
        NotImplementedError: If `stochastic` is truthy.
    """
    if stochastic:
        raise NotImplementedError("Dynamic Tree quantization does not support stochastic")

    signs = x.sign()
    magnitude = x.abs()
    exponent = torch.clamp(torch.ceil(torch.log10(magnitude)), min=2 - bit)
    unit_mantissa = (10 * magnitude / torch.pow(10, exponent) - 1) / 9  # Range from 0 - 1
    levels = 2 ** (bit - 2 - exponent.abs())
    snapped = torch.round(unit_mantissa * levels) / levels * 9 + 1

    return (signs * (10**exponent) * snapped / 10).to(x)

llava/model/FloatPointQuantizeTriton.py ADDED Viewed

	@@ -0,0 +1,199 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+import math
+import struct
+import numpy as np
+import torch
+import triton
+import triton.language as tl
+from triton.language.extra.cuda import libdevice
+segment_size = 1024**3
def _floatExMy_quantize_flat(x, e_bit, m_bit, stochastic):
    """Quantize one flat tensor with a single Triton kernel launch."""
    if x.dtype not in (torch.bfloat16, torch.float32):
        raise NotImplementedError(f"Other data format {x.dtype} for float quantization triton")

    n_elements = x.numel()
    y = torch.empty_like(x)
    if n_elements == 0:
        # Nothing to launch for an empty tensor.
        return y

    grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
    if stochastic:
        noise = x.new(x.shape).uniform_(-0.5, 0.5)
        _floatExMy_stochastic_quantize_kernel[grid](x, noise, y, n_elements, e_bit, m_bit)
    else:
        _floatExMy_quantize_kernel[grid](x, y, n_elements, e_bit, m_bit)
        torch.cuda.synchronize()
    return y


def floatExMy_quantize_triton(x, e_bit, m_bit, stochastic):
    """
    Quantize `x` with the Triton ExMy kernels and return a tensor of the same shape.

    Args:
        x: bfloat16 or float32 tensor; it is flattened with `view(-1)`.
        e_bit, m_bit: Format parameters forwarded to the kernels.
        stochastic: When truthy, uniform noise in [-0.5, 0.5) is generated and
            the stochastic kernel is used.

    Raises:
        NotImplementedError: For any other dtype.
    """
    x_ori_shape = x.shape
    x = x.view(-1)
    if x.numel() <= segment_size:
        y = _floatExMy_quantize_flat(x, e_bit, m_bit, stochastic)
    else:  # Triton will break when x.numel > 2 * 1024 ** 3
        # Splitting by chunk size never yields an empty trailing segment, even
        # when numel is an exact multiple of segment_size.
        y = torch.concat(
            [_floatExMy_quantize_flat(segment, e_bit, m_bit, stochastic) for segment in x.split(segment_size)]
        )
    return y.reshape(x_ori_shape)
# Deterministic ExMy quantization kernel: each program instance handles one
# BLOCK_SIZE-wide slice of the flat input and writes the quantized values to
# output_ptr. Autotuned over two block sizes, keyed on n_elements.
@triton.autotune(
    configs=[
        # triton.Config({'BLOCK_SIZE': 4,}, num_warps=4),
        triton.Config(
            {
                "BLOCK_SIZE": 1024,
            },
            num_warps=4,
        ),
        triton.Config(
            {
                "BLOCK_SIZE": 2048,
            },
            num_warps=4,
        ),
    ],
    key=["n_elements"],
)
@triton.jit
def _floatExMy_quantize_kernel(
    x_ptr,
    output_ptr,
    n_elements,
    e_bit,
    m_bit,
    BLOCK_SIZE: tl.constexpr,
):
    # e_bit / m_bit may arrive wrapped as tl.constexpr; unwrap to the raw value.
    if isinstance(e_bit, tl.constexpr):
        ebit = e_bit.value
    else:
        ebit = e_bit
    if isinstance(m_bit, tl.constexpr):
        mbit = m_bit.value
    else:
        mbit = m_bit
    # Offsets of the elements this program instance owns; the mask guards the tail block.
    pid = tl.program_id(axis=0)
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    # All arithmetic below is done in float32.
    x = x.to(tl.float32)
    # signbit is 0/1, so this maps to +1 / -1.
    sign = 1 - 2 * libdevice.signbit(x)
    x_abs = tl.abs(x)
    # Exponent clamp range and number of mantissa levels derived from the bit widths.
    Elow = -tl.exp2((ebit - 1).to(tl.float32)) + 2
    Ehigh = tl.exp2((ebit - 1).to(tl.float32))
    Mhigh = tl.exp2(mbit.to(tl.float32))
    expo = tl.floor(tl.log2(x_abs))
    expo = tl.clamp(expo, min=Elow, max=Ehigh)
    # Split the mantissa into integer and fractional parts; only the fraction is rounded.
    mant = x_abs / tl.exp2(expo)
    mant_int = tl.floor(mant)
    mant_frac = mant - mant_int
    mant_frac = mant_frac * Mhigh
    # mant_frac = mant_frac + noise
    mant_frac = libdevice.round(mant_frac)
    mant_q = mant_int + mant_frac / Mhigh
    y = sign * tl.exp2(expo) * mant_q
    # Cast back to the element type of the input pointer before storing.
    y = y.to(x_ptr.dtype.element_ty)
    tl.store(output_ptr + offsets, y, mask=mask)
# Stochastic ExMy quantization kernel: same computation as the deterministic
# kernel, except that per-element noise loaded from noise_ptr is added to the
# scaled mantissa fraction before rounding. Autotuned over two block sizes,
# keyed on n_elements.
@triton.autotune(
    configs=[
        # triton.Config({'BLOCK_SIZE': 4,}, num_warps=4),
        triton.Config(
            {
                "BLOCK_SIZE": 1024,
            },
            num_warps=4,
        ),
        triton.Config(
            {
                "BLOCK_SIZE": 2048,
            },
            num_warps=4,
        ),
    ],
    key=["n_elements"],
)
@triton.jit
def _floatExMy_stochastic_quantize_kernel(
    x_ptr,
    noise_ptr,
    output_ptr,
    n_elements,
    e_bit,
    m_bit,
    BLOCK_SIZE: tl.constexpr,
):
    # e_bit / m_bit may arrive wrapped as tl.constexpr; unwrap to the raw value.
    if isinstance(e_bit, tl.constexpr):
        ebit = e_bit.value
    else:
        ebit = e_bit
    if isinstance(m_bit, tl.constexpr):
        mbit = m_bit.value
    else:
        mbit = m_bit
    # Offsets of the elements this program instance owns; the mask guards the tail block.
    pid = tl.program_id(axis=0)
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    # Noise is read at the same offsets as the input.
    noise = tl.load(noise_ptr + offsets, mask=mask)
    # All arithmetic below is done in float32.
    x = x.to(tl.float32)
    # signbit is 0/1, so this maps to +1 / -1.
    sign = 1 - 2 * libdevice.signbit(x)
    x_abs = tl.abs(x)
    # Exponent clamp range and number of mantissa levels derived from the bit widths.
    Elow = -tl.exp2((ebit - 1).to(tl.float32)) + 2
    Ehigh = tl.exp2((ebit - 1).to(tl.float32))
    Mhigh = tl.exp2(mbit.to(tl.float32))
    expo = tl.floor(tl.log2(x_abs))
    expo = tl.clamp(expo, min=Elow, max=Ehigh)
    # Split the mantissa into integer and fractional parts; only the fraction is rounded.
    mant = x_abs / tl.exp2(expo)
    mant_int = tl.floor(mant)
    mant_frac = mant - mant_int
    mant_frac = mant_frac * Mhigh
    # Perturb the scaled fraction so that rounding becomes stochastic.
    mant_frac = mant_frac + noise
    mant_frac = libdevice.round(mant_frac)
    mant_q = mant_int + mant_frac / Mhigh
    y = sign * tl.exp2(expo) * mant_q
    # Cast back to the element type of the input pointer before storing.
    y = y.to(x_ptr.dtype.element_ty)
    tl.store(output_ptr + offsets, y, mask=mask)

llava/model/__init__.py ADDED Viewed

	@@ -0,0 +1,35 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+from .language_model.llava_llama import LlavaLlamaConfig, LlavaLlamaModel
+# FP8 related comments, development in progress (PI: ligeng zhu, haochen xi)
+# NOTE: VLM + LLM
+# from .language_model.qllava_qllama import QLlavaLlamaConfig, QLlavaLlamaModel
+# NOTE: Linear -> fp8, similar to transformer engine
+# from .language_model.qllama import QLlamaConfig, QLlamaForCausalLM, QLlamaModel
+# NOTE: Linear + Activation -> fp8, haochen's iclr version
+# from .language_model.qmemllama import QMemLlamaConfig, QMemLlamaForCausalLM, QMemLlamaModel
+"""
+TODO:
+    linear(weights):
+        simulated fp8: done
+        real fp8: in-progress (code already implemented)
+    activation:
+        simulated fp8: done
+        real fp8: in-progress (still coding)
+    optimizers:
+        current VILA: bf16
+        simulated fp8: done
+        real fp8 + fsdp (single node): done
+        real fp8 + fsdp (multiple node): in-progress
+1. linear fp8
+2. activation fp8
+3. fp8 inference example (load directly from a fp8 and fwd)
+4. bind fp8 related configs to QLlamaConfig {"coat_fp8_args": {}}
+"""
+from .language_model.fp8linearqwen2 import FP8LinearQwen2Config, FP8LinearQwen2Model
+from .language_model.qllava_qllama import QLlavaLlamaConfig, QLlavaLlamaModel

llava/model/apply_delta.py ADDED Viewed

	@@ -0,0 +1,77 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+# This file is modified from https://github.com/haotian-liu/LLaVA/
+"""
+Usage:
+python3 -m llava.model.apply_delta --base-model-path ~/model_weights/llama-7b --target-model-path ~/model_weights/vicuna-7b --delta-path lmsys/vicuna-7b-delta
+"""
+import argparse
+import torch
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from llava import LlavaLlamaForCausalLM
def apply_delta(base_model_path, target_model_path, delta_path):
    """
    Add base-model weights onto a delta checkpoint and save the merged model.

    Args:
        base_model_path: Location of the base causal LM, loaded in float16.
        target_model_path: Directory the merged model and tokenizer are saved to.
        delta_path: Location of the delta checkpoint and its tokenizer.

    Delta parameters missing from the base model must be the mm_projector
    weight/bias and are left untouched. Parameters whose shapes differ must be
    the token embedding or LM head; the base values are added into the
    top-left slice that both tensors share.
    """
    print("Loading base model")
    base = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    print("Loading delta")
    delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    delta_tokenizer = AutoTokenizer.from_pretrained(delta_path)
    print("Applying delta")
    # state_dict() rebuilds the whole mapping on every call; take it once.
    base_state = base.state_dict()
    for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"):
        if name not in base_state:
            assert name in [
                "model.mm_projector.weight",
                "model.mm_projector.bias",
            ], f"{name} not in base model"
            continue
        bparam = base_state[name]
        if param.data.shape == bparam.shape:
            param.data += bparam
        else:
            assert name in [
                "model.embed_tokens.weight",
                "lm_head.weight",
            ], f"{name} dimension mismatch: {param.data.shape} vs {bparam.shape}"
            param.data[: bparam.shape[0], : bparam.shape[1]] += bparam
    print("Saving target model")
    delta.save_pretrained(target_model_path)
    delta_tokenizer.save_pretrained(target_model_path)
if __name__ == "__main__":
    # Command-line entry point: all three paths are mandatory.
    cli = argparse.ArgumentParser()
    for flag in ("--base-model-path", "--target-model-path", "--delta-path"):
        cli.add_argument(flag, type=str, required=True)
    options = cli.parse_args()
    apply_delta(options.base_model_path, options.target_model_path, options.delta_path)

llava/model/builder.py ADDED Viewed

	@@ -0,0 +1,161 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# This file is modified from https://github.com/haotian-liu/LLaVA/
+#    Copyright 2023 Haotian Liu
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+import os
+import warnings
+import torch
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, PretrainedConfig
+from llava.model import LlavaLlamaModel
+from llava.model.utils import is_mm_model
+def load_pretrained_model(
+    model_path,
+    model_name,
+    model_base=None,
+    load_8bit=False,
+    load_4bit=False,
+    device_map="auto",
+    device="cuda",
+    **kwargs,
+):
+    kwargs = {"device_map": device_map, **kwargs}
+    if device != "cuda":
+        kwargs["device_map"] = {"": device}
+    if load_8bit:
+        kwargs["load_in_8bit"] = True
+    elif load_4bit:
+        kwargs["load_in_4bit"] = True
+        kwargs["quantization_config"] = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4",
+        )
+    else:
+        kwargs["torch_dtype"] = torch.float16
+        # kwargs["torch_dtype"] = torch.bfloat16
+    if is_mm_model(model_path):
+        # Load LLaVA model
+        ## TODO @yunhao: mind fixing lora
+        if "lora" in model_name.lower() and model_base is None:
+            warnings.warn(
+                "There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged."
+            )
+        if ("lora" in model_name.lower() or "dora" in model_name.lower()) and model_base is not None:
+            lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
+            print(lora_cfg_pretrained)
+            print("Loading LLaVA from base model...")
+            config = AutoConfig.from_pretrained(model_base)
+            prepare_config_for_eval(config, kwargs)
+            model = LlavaLlamaModel.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)
+            tokenizer = model.tokenizer
+            token_num, tokem_dim = model.llm.lm_head.out_features, model.llm.lm_head.in_features
+            if model.llm.lm_head.weight.shape[0] != token_num:
+                model.llm.lm_head.weight = torch.nn.Parameter(
+                    torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)
+                )
+                model.llm.embed_tokens.weight = torch.nn.Parameter(
+                    torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)
+                )
+            print("Loading additional LLaVA weights...")
+            if os.path.exists(os.path.join(model_path, "non_lora_trainables.bin")):
+                non_lora_trainables = torch.load(
+                    os.path.join(model_path, "non_lora_trainables.bin"),
+                    map_location="cpu",
+                )
+            else:
+                # this is probably from HF Hub
+                from huggingface_hub import hf_hub_download
+                def load_from_hf(repo_id, filename, subfolder=None):
+                    cache_file = hf_hub_download(repo_id=repo_id, filename=filename, subfolder=subfolder)
+                    return torch.load(cache_file, map_location="cpu")
+                non_lora_trainables = load_from_hf(model_path, "non_lora_trainables.bin")
+            non_lora_trainables = {
+                (k[11:] if k.startswith("base_model.") else k): v for k, v in non_lora_trainables.items()
+            }
+            if any(k.startswith("model.model.") for k in non_lora_trainables):
+                non_lora_trainables = {
+                    (k[6:] if k.startswith("model.") else k): v for k, v in non_lora_trainables.items()
+                }
+            model.load_state_dict(non_lora_trainables, strict=False)
+            from peft import PeftModel
+            print("Loading LoRA weights...")
+            model = PeftModel.from_pretrained(model, model_path)
+            print("Merging LoRA weights...")
+            model = model.merge_and_unload()
+            print("Model is loaded...")
+        else:
+            config = AutoConfig.from_pretrained(model_path)
+            config.resume_path = model_path
+            prepare_config_for_eval(config, kwargs)
+            model = LlavaLlamaModel(config=config, low_cpu_mem_usage=True, **kwargs)
+            tokenizer = model.tokenizer
+    else:
+        # Load language model
+        if model_base is not None:
+            # PEFT model
+            from peft import PeftModel
+            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
+            model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
+            print(f"Loading LoRA weights from {model_path}")
+            model = PeftModel.from_pretrained(model, model_path)
+            print(f"Merging weights")
+            model = model.merge_and_unload()
+            print("Convert to FP16...")
+            model.to(torch.float16)
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, legacy=False)
+            model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
+    model.eval()
+    image_processor = None
+    if is_mm_model(model_path):
+        model.resize_token_embeddings(len(tokenizer))
+    if hasattr(model.llm.config, "max_sequence_length"):
+        context_len = model.config.max_sequence_length
+    else:
+        context_len = 2048
+    return tokenizer, model, image_processor, context_len
+def prepare_config_for_eval(config: PretrainedConfig, kwargs: dict):
+    try:
+        # compatible with deprecated config convention
+        if getattr(config, "vision_tower_cfg", None) is None:
+            config.vision_tower_cfg = config.mm_vision_tower
+    except AttributeError:
+        raise ValueError(f"Invalid configuration! Cannot find vision_tower in config:\n{config}")
+    config.model_dtype = kwargs.pop("torch_dtype").__str__()

llava/model/coat/activation/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.

llava/model/coat/activation/fake_quantization/FloatPointQuantizeTorch.py ADDED Viewed

	@@ -0,0 +1,101 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+import math
+import torch
+def floatExMy_quantize_torch(x, e_bit, m_bit, stochastic):
+    sign, x_abs = x.sign(), x.abs()
+    Elow, Ehigh, Mhigh = -(2 ** (e_bit - 1)) + 2, 2 ** (e_bit - 1), 2**m_bit
+    expo = torch.floor(torch.log2(x_abs))
+    expo = torch.clamp(expo, min=Elow, max=Ehigh)
+    mant = x_abs / torch.exp2(expo)
+    mant_int = torch.floor(mant)
+    mant_frac = mant - mant_int
+    mant_frac = mant_frac * Mhigh
+    if stochastic:
+        noise = mant_frac.new(mant_frac.shape).uniform_(-0.5, 0.5)
+        mant_frac.add_(noise)
+    mant_frac = torch.round(mant_frac)
+    mant_q = mant_int + mant_frac / Mhigh
+    y = sign * (2**expo) * mant_q
+    y = y.to(x)
+    return y
+def floatExM0_quantize_torch(x, e_bit, stochastic):
+    sign, x_abs = x.sign(), x.abs()
+    Elow, Ehigh = -(2 ** (e_bit - 1)) + 1, 2 ** (e_bit - 1)
+    expo = torch.log2(x_abs)
+    if stochastic:
+        noise = expo.new(expo.shape).uniform_(-0.5, 0.5)
+        expo.add(noise)
+        log_bias = math.log2(4 / 3) - 1 / 2
+        expo.add(torch.ones_like(expo) * log_bias)
+    expo = torch.clamp(expo, min=Elow - 1, max=Ehigh)
+    expo = torch.round(expo)
+    y = sign * (2**expo) * (expo > Elow)  # When underflow, set the value to 0
+    y = y.to(x)
+    return y
+def Dynamic_quantize_torch(x, bit, stochastic):
+    if stochastic:
+        raise NotImplementedError("Dynamic Tree quantization does not support stochastic")
+    sign, x_abs = x.sign(), x.abs()
+    expo = torch.ceil(torch.log10(x_abs))
+    expo = torch.clamp(expo, min=2 - bit)
+    mant = (10 * x_abs / torch.pow(10, expo) - 1) / 9  # Range from 0 - 1
+    mant_frac = mant * 2 ** (bit - 2 - expo.abs())
+    mant_frac = torch.round(mant_frac)
+    mant_frac = mant_frac / (2 ** (bit - 2 - expo.abs())) * 9 + 1
+    y = sign * (10**expo) * mant_frac / 10
+    zero_mask = y.abs() > 1.01 * 10 ** (1 - bit)
+    y = y * zero_mask
+    y = y.to(x)
+    return y
+def ZeroDynamic_quantize_torch(x, bit, stochastic):
+    if stochastic:
+        raise NotImplementedError("Dynamic Tree quantization does not support stochastic")
+    sign, x_abs = x.sign(), x.abs()
+    expo = torch.ceil(torch.log10(x_abs))
+    expo = torch.clamp(expo, min=2 - bit)
+    mant = (10 * x_abs / torch.pow(10, expo) - 1) / 9  # Range from 0 - 1
+    mant_frac = mant * 2 ** (bit - 2 - expo.abs())
+    mant_frac = torch.round(mant_frac)
+    mant_frac = mant_frac / (2 ** (bit - 2 - expo.abs())) * 9 + 1
+    y = sign * (10**expo) * mant_frac / 10
+    y = y.to(x)
+    return y

llava/model/coat/activation/fake_quantization/FloatPointQuantizeTriton.py ADDED Viewed

	@@ -0,0 +1,181 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+import math
+import struct
+import numpy as np
+import torch
+import triton
+import triton.language as tl
+from triton.language.extra.cuda import libdevice
+def floatExMy_quantize_triton(x, e_bit, m_bit, stochastic):
+    n_elements = x.numel()
+    grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
+    y = torch.zeros_like(x)
+    if x.dtype in [torch.bfloat16, torch.float32]:
+        if stochastic:
+            noise = x.new(x.shape).uniform_(-0.5, 0.5)
+            _floatExMy_stochastic_quantize_kernel[grid](x, noise, y, n_elements, e_bit, m_bit)
+        else:
+            _floatExMy_quantize_kernel[grid](x, y, n_elements, e_bit, m_bit)
+    else:
+        raise NotImplementedError(f"Other data format {x.dtype} for float quantization triton")
+    return y
+# Deterministic ExMy fake-quantization kernel. Each program handles one
+# BLOCK_SIZE-wide slice of the flattened input; the block size is chosen by the
+# autotuner from the two configs below, keyed on n_elements.
+@triton.autotune(
+    configs=[
+        # triton.Config({'BLOCK_SIZE': 4,}, num_warps=4),
+        triton.Config(
+            {
+                "BLOCK_SIZE": 1024,
+            },
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE": 2048,
+            },
+            num_stages=1,
+        ),
+    ],
+    key=["n_elements"],
+)
+@triton.jit
+def _floatExMy_quantize_kernel(
+    x_ptr,
+    output_ptr,
+    n_elements,
+    e_bit,
+    m_bit,
+    BLOCK_SIZE: tl.constexpr,
+):
+    # Unwrap the bit widths when they arrive as tl.constexpr objects.
+    if isinstance(e_bit, tl.constexpr):
+        ebit = e_bit.value
+    else:
+        ebit = e_bit
+    if isinstance(m_bit, tl.constexpr):
+        mbit = m_bit.value
+    else:
+        mbit = m_bit
+    pid = tl.program_id(axis=0)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    # Guard the tail block: offsets at or beyond n_elements are neither loaded nor stored.
+    mask = offsets < n_elements
+    x = tl.load(x_ptr + offsets, mask=mask)
+    # All arithmetic below is done in float32 regardless of the input dtype.
+    x = x.to(tl.float32)
+    # signbit is non-zero for negative values, so this maps to -1 / +1.
+    sign = 1 - 2 * libdevice.signbit(x)
+    x_abs = tl.abs(x)
+    # Exponent range [-(2**(ebit-1)) + 2, 2**(ebit-1)] and 2**mbit mantissa steps.
+    Elow = -tl.exp2((ebit - 1).to(tl.float32)) + 2
+    Ehigh = tl.exp2((ebit - 1).to(tl.float32))
+    Mhigh = tl.exp2(mbit.to(tl.float32))
+    expo = tl.floor(tl.log2(x_abs))
+    expo = tl.clamp(expo, min=Elow, max=Ehigh)
+    mant = x_abs / tl.exp2(expo)
+    mant_int = tl.floor(mant)
+    mant_frac = mant - mant_int
+    mant_frac = mant_frac * Mhigh
+    # mant_frac = mant_frac + noise
+    mant_frac = libdevice.round(mant_frac)
+    mant_q = mant_int + mant_frac / Mhigh
+    y = sign * tl.exp2(expo) * mant_q
+    # Cast back to the element type of the input pointer before storing.
+    y = y.to(x_ptr.dtype.element_ty)
+    tl.store(output_ptr + offsets, y, mask=mask)
+# Stochastic ExMy fake-quantization kernel. Same arithmetic as the
+# deterministic kernel, except that a caller-supplied noise value is added to
+# the scaled mantissa fraction before rounding. Block size is autotuned from
+# the two configs below, keyed on n_elements.
+@triton.autotune(
+    configs=[
+        # triton.Config({'BLOCK_SIZE': 4,}, num_warps=4),
+        triton.Config(
+            {
+                "BLOCK_SIZE": 1024,
+            },
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE": 2048,
+            },
+            num_stages=1,
+        ),
+    ],
+    key=["n_elements"],
+)
+@triton.jit
+def _floatExMy_stochastic_quantize_kernel(
+    x_ptr,
+    noise_ptr,
+    output_ptr,
+    n_elements,
+    e_bit,
+    m_bit,
+    BLOCK_SIZE: tl.constexpr,
+):
+    # Unwrap the bit widths when they arrive as tl.constexpr objects.
+    if isinstance(e_bit, tl.constexpr):
+        ebit = e_bit.value
+    else:
+        ebit = e_bit
+    if isinstance(m_bit, tl.constexpr):
+        mbit = m_bit.value
+    else:
+        mbit = m_bit
+    pid = tl.program_id(axis=0)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    # Guard the tail block: offsets at or beyond n_elements are neither loaded nor stored.
+    mask = offsets < n_elements
+    x = tl.load(x_ptr + offsets, mask=mask)
+    # Noise is read at the same offsets as x, i.e. one noise value per element.
+    noise = tl.load(noise_ptr + offsets, mask=mask)
+    # All arithmetic below is done in float32 regardless of the input dtype.
+    x = x.to(tl.float32)
+    # signbit is non-zero for negative values, so this maps to -1 / +1.
+    sign = 1 - 2 * libdevice.signbit(x)
+    x_abs = tl.abs(x)
+    # Exponent range [-(2**(ebit-1)) + 2, 2**(ebit-1)] and 2**mbit mantissa steps.
+    Elow = -tl.exp2((ebit - 1).to(tl.float32)) + 2
+    Ehigh = tl.exp2((ebit - 1).to(tl.float32))
+    Mhigh = tl.exp2(mbit.to(tl.float32))
+    expo = tl.floor(tl.log2(x_abs))
+    expo = tl.clamp(expo, min=Elow, max=Ehigh)
+    mant = x_abs / tl.exp2(expo)
+    mant_int = tl.floor(mant)
+    mant_frac = mant - mant_int
+    mant_frac = mant_frac * Mhigh
+    # Perturb before rounding so the rounding direction becomes random.
+    mant_frac = mant_frac + noise
+    mant_frac = libdevice.round(mant_frac)
+    mant_q = mant_int + mant_frac / Mhigh
+    y = sign * tl.exp2(expo) * mant_q
+    # Cast back to the element type of the input pointer before storing.
+    y = y.to(x_ptr.dtype.element_ty)
+    tl.store(output_ptr + offsets, y, mask=mask)

llava/model/coat/activation/fake_quantization/quantize_function.py ADDED Viewed

	@@ -0,0 +1,239 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+import re
+import torch
+from .FloatPointQuantizeTorch import *
+from .FloatPointQuantizeTriton import *
+def block_cut(input, row_block, column_block, pad_block=False):
+    # print(input.shape)
+    original_shape = input.shape
+    # input tensor shape is M * N
+    if len(input.shape) > 2:
+        input = input.reshape(-1, input.shape[2])
+    elif len(input.shape) == 2:
+        pass
+    else:
+        raise ValueError(f"input shape {input.shape} does not match for block cut, {input}")
+    M, N = input.shape[0], input.shape[1]
+    if row_block == -1:
+        row_block = M
+    if column_block == -1:
+        column_block = N
+    if pad_block:
+        row_remainder, col_remainder = M % row_block, N % column_block
+        if row_remainder:
+            row_pad = row_block - row_remainder
+        else:
+            row_pad = 0
+        if col_remainder:
+            col_pad = column_block - col_remainder
+        else:
+            col_pad = 0
+        input = torch.nn.functional.pad(
+            input, (0, col_pad, 0, row_pad), "constant", 0
+        )  # refer to torch's doc to see why
+        M, N = input.shape[0], input.shape[1]
+        row_num, column_num = M // row_block, N // column_block
+    else:
+        row_num, column_num = M // row_block, N // column_block
+    assert row_num * row_block == M, f"{row_num}, {row_block}, {M}, {original_shape}"
+    assert column_num * column_block == N, f"{column_num}, {column_block}, {N}, {original_shape}"
+    # print(input.shape)
+    input = (
+        input.reshape(row_num, row_block, column_num, column_block)
+        .permute(0, 2, 1, 3)
+        .reshape(row_num * column_num, row_block, column_block)
+    )
+    # print(input.shape)
+    return input
+def block_reshape(input, origin_input, row_block, column_block, pad_block=False):
+    if len(origin_input.shape) > 2:
+        flatten_input = origin_input.reshape(-1, origin_input.shape[2])
+    elif len(origin_input.shape) == 2:
+        flatten_input = origin_input
+    else:
+        raise ValueError(f"input shape {input.shape} does not match for block cut")
+    M, N = flatten_input.shape[0], flatten_input.shape[1]
+    if row_block == -1:
+        row_block = M
+    if column_block == -1:
+        column_block = N
+    if pad_block:
+        row_remainder, col_remainder = M % row_block, N % column_block
+        if row_remainder:
+            row_pad = row_block - row_remainder
+        else:
+            row_pad = 0
+        if col_remainder:
+            col_pad = column_block - col_remainder
+        else:
+            col_pad = 0
+        pad_origin_input = torch.nn.functional.pad(origin_input, (0, col_pad, 0, row_pad), "constant", 0)
+        M, N = pad_origin_input.shape[0], pad_origin_input.shape[1]
+        row_num, column_num = M // row_block, N // column_block
+    else:
+        row_num, column_num = M // row_block, N // column_block
+    input = (
+        input.reshape(row_num, column_num, row_block, column_block)
+        .permute(0, 2, 1, 3)
+        .reshape(row_num * row_block, column_num * column_block)
+    )
+    M, N = flatten_input.shape[0], flatten_input.shape[1]
+    input = input[:M, :N]
+    if len(origin_input.shape) > 2:
+        input = input.reshape(origin_input.shape)
+    elif len(origin_input.shape) == 2:
+        pass
+    else:
+        raise ValueError(f"input shape {input.shape} does not match for block reshape")
+    return input
+def block_verify_int8(input, row_block, column_block, layer_type, necessary=True):
+    Binput = block_cut(input, row_block, column_block)
+    Binput = Binput.to(torch.float32)
+    for n in range(Binput.shape[0]):
+        unique_values = len(torch.unique(Binput[n, :, :]))
+        if unique_values > 256:
+            if necessary:
+                raise ValueError(f"{layer_type} contains more than 256 unique values.")
+            else:
+                return False
+    return True
+def block_quant(input, symm, bits, stochastic, epsilon, apply_quantize, layer_name):
+    Quant_fn = SymmQuantizer
+    return Quant_fn.apply(input, symm, bits, stochastic, epsilon, apply_quantize, layer_name)
+def extract_bit(string):
+    match = re.match(r"INT(\d+)", string)  # INT8
+    if match:
+        return "integer", int(match.group(1)), None
+    match = re.match(r"E(\d+)M(\d+)", string)  # E4M3 / E5M2
+    if match:
+        Ebit, Mbit = int(match.group(1)), int(match.group(2))
+        if Ebit == 1:
+            return "integer", Mbit + 1, None
+        if Mbit == 0:
+            return "floatExM0", int(match.group(1)), 0
+        return "floatExMy", int(match.group(1)), int(match.group(2))
+    match = re.match(r"DE(\d+)", string)
+    if match:
+        return "Dynamic", int(match.group(1)), None
+    match = re.match(r"ZeroD(\d+)", string)
+    if match:
+        return "ZeroDynamic", int(match.group(1)), None
+    raise ValueError(f"{string} data format is not supported")
+class SymmQuantizer(torch.autograd.function.InplaceFunction):
+    @staticmethod
+    def forward(ctx, input, symm, bits, stochastic, epsilon, apply_quantize=True, layer_name=None):
+        with torch.no_grad():
+            absmax_per_block = input.abs().amax(dim=(1, 2)).unsqueeze(1).unsqueeze(2) + epsilon
+            if bits == "100" or not apply_quantize:
+                return input, input, torch.ones_like(absmax_per_block)
+            elif bits == "FP32":
+                return input.to(torch.float32), input.to(torch.float32), torch.ones_like(absmax_per_block)
+            elif bits == "FP16":
+                return input.to(torch.float16), input.to(torch.float16), torch.ones_like(absmax_per_block)
+            elif bits == "BF16":
+                return input.to(torch.bfloat16), input.to(torch.bfloat16), torch.ones_like(absmax_per_block)
+            else:
+                QuantType, bit1, bit2 = extract_bit(bits)
+                if not symm:
+                    bit1 = bit1 + 1  # pretend to be asymmtric
+                if QuantType == "integer":
+                    Qn, Qp = -(2 ** (bit1 - 1) - 1), 2 ** (bit1 - 1) - 1
+                elif QuantType == "floatExMy":
+                    Qn, Qp = -(2 - 2 ** (-bit2)) * (2 ** (2 ** (bit1 - 1))), (2 - 2 ** (-bit2)) * (
+                        2 ** (2 ** (bit1 - 1))
+                    )
+                    if bit1 == 4 and bit2 == 3:  # E4M3
+                        Qn, Qp = -448, 448
+                    if bit1 == 5 and bit2 == 2:  # E5M2
+                        Qn, Qp = -57344, 57344
+                elif QuantType == "floatExM0":
+                    Qn, Qp = -(2 ** (2 ** (bit1 - 1))) + 1, 2 ** (2 ** (bit1 - 1))
+                elif QuantType == "Dynamic":
+                    Qn, Qp = -1, 1
+                elif QuantType == "ZeroDynamic":
+                    Qn, Qp = -1, 1
+                else:
+                    raise NotImplementedError(f"{bits} is not supported by quantization")
+                scale_per_block = (2 * absmax_per_block) / (Qp - Qn)
+                scale_per_block = scale_per_block.to(input)
+                Qinput = input / scale_per_block
+                if QuantType == "integer":
+                    if stochastic:
+                        noise = Qinput.new(Qinput.shape).uniform_(-0.5, 0.5)
+                        Qinput.add_(noise)
+                    Qinput.clamp_(Qn, Qp).round_()
+                elif QuantType == "floatExMy":
+                    # Qinput = floatExMy_quantize_torch(Qinput, bit1, bit2, stochastic)
+                    Qinput = floatExMy_quantize_triton(Qinput, bit1, bit2, stochastic)
+                elif QuantType == "floatExM0":
+                    Qinput = floatExM0_quantize_torch(Qinput, bit1, stochastic)
+                else:
+                    raise NotImplementedError(f"{bits} is not supported by quantization")
+                RQinput = Qinput * scale_per_block
+                if input.dtype != Qinput.dtype:
+                    print(
+                        f"Input type is {input.dtype}, Qinput type is {Qinput.dtype}, scale_per_block type is {scale_per_block.dtype}",
+                        file=open("debug.txt", "a"),
+                    )
+                    import IPython
+                    IPython.embed()
+                return RQinput, Qinput, scale_per_block
+    @staticmethod
+    def backward(ctx, grad_output):
+        return grad_output, None, None, None, None, None

llava/model/coat/activation/fake_quantization/utils.py ADDED Viewed

	@@ -0,0 +1,115 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+import os
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+def list_has_common_element(list1, list2):
+    set1 = set(list1)
+    set2 = set(list2)
+    return len(set1.intersection(set2)) > 0
+def calculate_scale_num(input, row_block, col_block):
+    if len(input.shape) > 2:
+        input = input.reshape(-1, input.shape[2])
+    elif len(input.shape) == 2:
+        pass
+    else:
+        raise ValueError(f"input shape {input.shape} does not match for block cut, {input}")
+    M, N = input.shape[0], input.shape[1]
+    if row_block == -1:
+        row_block = M
+    if col_block == -1:
+        col_block = N
+    return input.numel() / (row_block * col_block)
+def quant_get_local_rank() -> int:
+    return int(os.environ.get("LOCAL_RANK") or 0)
+def format_string_with_condition(
+    input_string,
+    condition_config,
+    symm,
+    bits,
+    blocksize_config,
+    input_pad=20,
+):
+    padded_string = input_string.ljust(input_pad)
+    output_string = padded_string
+    for k, v in condition_config.items():
+        if v:
+            output_string = output_string + k.ljust(10) + "True".ljust(6) + "".ljust(6)
+        else:
+            output_string = output_string + k.ljust(10) + "".ljust(6) + "False".ljust(6)
+    output_string = output_string + f"Symm {symm}".ljust(10)
+    for k, v in bits.items():
+        output_string = output_string + f"{k} bit".ljust(10) + v.ljust(10)
+    for k, v in blocksize_config.items():
+        output_string += f"{k}: {v}".ljust(15)
+    return output_string
+def print_warning(sentence):
+    print("*" * (len(sentence) + 4))
+    print(f"* {sentence} *")
+    print("*" * (len(sentence) + 4))
+def check_nan_inf(tensor, check_nan, check_inf):
+    if check_nan:
+        contain_nan = torch.isnan(tensor).any()
+    else:
+        contain_nan = False
+    if check_inf:
+        contain_inf = torch.isinf(tensor).any()
+    else:
+        contain_inf = False
+    return contain_nan, contain_inf
+def move_torch_to_numpy(tensor):
+    if tensor is None:
+        return None
+    if tensor.is_cuda:
+        tensor = tensor.cpu()
+    return tensor.detach().float().numpy()
+def flatten_to_1d(tensor):
+    if tensor is None:
+        return None
+    return tensor.reshape(-1)

llava/model/coat/activation/models/_fp8_quantization_config.py ADDED Viewed

	@@ -0,0 +1,67 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+from dataclasses import dataclass
+from transformers import PretrainedConfig
+@dataclass
+class QuantizationConfig:
+    quantize_model: str = "false"
+    symm: bool = True
+    epsilon: float = 1e-10
+    fabit: str = "E4M3"
+    fwbit: str = "E4M3"
+    fobit: str = "E4M3"
+    babit: str = "E5M2"
+    bwbit: str = "E5M2"
+    bobit: str = "E5M2"
+    qchoice: str = "none"
+    group_size: int = -1
+    pad_to_multiple_of: int = 0
+    weight_memory_efficient: bool = True
+    # Legacy
+    row_blocksize: int = -1
+    col_blocksize: int = -1
+    def __init__(
+        self,
+        quantize_model: str = "false",
+        symm: bool = True,
+        epsilon: float = 1e-10,
+        fabit: str = "E4M3",
+        fwbit: str = "E4M3",
+        fobit: str = "E4M3",
+        babit: str = "E5M2",
+        bwbit: str = "E5M2",
+        bobit: str = "E5M2",
+        qchoice: str = "none",
+        group_size: int = -1,
+        pad_to_multiple_of: int = 0,
+        weight_memory_efficient: bool = True,
+        row_blocksize: int = -1,
+        col_blocksize: int = -1,
+        **kwargs,
+    ):
+        super().__init__()
+        self.quantize_model = quantize_model
+        self.symm = symm
+        self.epsilon = epsilon
+        self.fabit = fabit
+        self.fwbit = fwbit
+        self.fobit = fobit
+        self.babit = babit
+        self.bwbit = bwbit
+        self.bobit = bobit
+        self.qchoice = qchoice
+        self.group_size = group_size
+        self.pad_to_multiple_of = pad_to_multiple_of
+        self.weight_memory_efficient = weight_memory_efficient
+        self.row_blocksize = row_blocksize
+        self.col_blocksize = col_blocksize

llava/model/coat/activation/models/_fp8_weightcache.py ADDED Viewed

	@@ -0,0 +1,48 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+import torch.nn as nn
+from ..real_quantization import fp8_division_transpose
+class FP8CacheWeightModule(nn.Module):
+    def __init__(self, config, qargs, layer_id):
+        super().__init__()
+        self.config = config
+        self.qargs = qargs
+        self.layer_id = layer_id
+    def prepare_weight(self, weight, weight_name, is_first_microbatch):
+        if is_first_microbatch:
+            if self.qargs.weight_memory_efficient:
+                # print(f"{weight_name} uses first microbatch")
+                weight_fp8, weight_s, weight_fp8_t = fp8_division_transpose(
+                    weight, self.qargs.group_size, self.fwobits["fwbit"]
+                )
+                setattr(self, f"{weight_name}_fp8_scale", weight_s)
+                return weight_fp8, weight_fp8_t, weight_s
+            else:
+                # print(f"{weight_name} uses first microbatch")
+                weight_fp8, weight_s, weight_fp8_t = fp8_division_transpose(
+                    weight, self.qargs.group_size, self.fwobits["fwbit"]
+                )
+                setattr(self, f"{weight_name}_fp8", weight_fp8)
+                setattr(self, f"{weight_name}_fp8_t", weight_fp8_t)
+                setattr(self, f"{weight_name}_fp8_scale", weight_s)
+                return weight_fp8, weight_fp8_t, weight_s
+        else:
+            if self.qargs.weight_memory_efficient:
+                return getattr(self, f"{weight_name}_fp8_scale")
+            else:
+                return (
+                    getattr(self, f"{weight_name}_fp8"),
+                    getattr(self, f"{weight_name}_fp8_t"),
+                    getattr(self, f"{weight_name}_fp8_scale"),
+                )
+    def forward(self, x):
+        pass

llava/model/coat/activation/models/_fp8manager.py ADDED Viewed

	@@ -0,0 +1,31 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+import torch
+class FP8Manager:
+    """Class to keep track of and manipulate the global
+    FP8 state at different stages of execution.
+    """
+    is_first_microbatch = False

llava/model/coat/activation/models/coat_llama.py ADDED Viewed

	@@ -0,0 +1,1479 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import os
+from fnmatch import fnmatch
+from typing import List, Optional, Tuple, Union
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache, StaticCache
+from transformers.generation import GenerationMixin
+from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+from transformers.modeling_flash_attention_utils import _flash_attention_forward
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutputWithPast,
+    TokenClassifierOutput,
+)
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.llama.configuration_llama import LlamaConfig
+from transformers.models.llama.modeling_llama import (
+    LlamaAttention,
+    LlamaDynamicNTKScalingRotaryEmbedding,
+    LlamaForCausalLM,
+    LlamaLinearScalingRotaryEmbedding,
+    LlamaModel,
+    LlamaPreTrainedModel,
+    LlamaRMSNorm,
+    LlamaRotaryEmbedding,
+    _prepare_4d_causal_attention_mask_with_cache_position,
+    apply_rotary_pos_emb,
+    repeat_kv,
+    rotate_half,
+)
+from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_greater_or_equal_2_10,
+    is_torchdynamo_compiling,
+    logging,
+    replace_return_docstrings,
+)
+from ..real_quantization import (
+    Coat_quantize_bgn,
+    Coat_quantize_end,
+    fp8_add_Ifp_Ifp_Ofp_Og16,
+    fp8_add_Ifp_Ifp_Ofp_Opt,
+    fp8_division,
+    fp8_division_transpose,
+    fp8_gelu_backward,
+    fp8_gelu_forward,
+    fp8_layernorm_noparam_backward,
+    fp8_layernorm_noparam_forward,
+    fp8_linear_backward,
+    fp8_linear_forward,
+    fp8_mul_backward,
+    fp8_mul_forward,
+    fp8_quantize,
+    fp8_quantize_pertensor,
+    fp8_quantize_pertensor_transpose,
+    fp8_rmsnorm_backward,
+    fp8_rmsnorm_forward,
+    fp8_silu_backward,
+    fp8_silu_forward,
+    fp8_transpose,
+)
+# FP8 related
+from ._fp8_quantization_config import QuantizationConfig
+from ._fp8_weightcache import FP8CacheWeightModule
+from ._fp8manager import FP8Manager
+logger = logging.get_logger(__name__)
+class CoatLlamaConfig(LlamaConfig):
+    model_type = "fp8_llama"
+class CoatLlamaBeforeAttentionResidual(FP8CacheWeightModule):
+    """
+    This is a typical transformer attention module that contains (1) Residual (2) LayerNorm / RMSNorm (3) 1 * Linear layers
+    """
+    def __init__(self, config: CoatLlamaConfig, qargs: QuantizationConfig, layer_idx: Optional[int] = None):
+        super().__init__(config, qargs, layer_idx)
+        self.qargs = qargs
+        self.fwobits = {
+            "fabit": self.qargs.fabit,
+            "fwbit": self.qargs.fwbit,
+            "fobit": self.qargs.fobit,
+            "babit": self.qargs.babit,
+            "bwbit": self.qargs.bwbit,
+            "bobit": self.qargs.bobit,
+        }
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+    def forward(self, re_x, x, s, rmsnorm_weight):
+        if self.training:
+            if self.qargs.weight_memory_efficient:
+                # Prepare
+                with torch.no_grad():
+                    weight1_s = self.prepare_weight(self.q_proj.weight, "q_proj", FP8Manager.is_first_microbatch)
+                    weight2_s = self.prepare_weight(self.k_proj.weight, "k_proj", FP8Manager.is_first_microbatch)
+                    weight3_s = self.prepare_weight(self.v_proj.weight, "v_proj", FP8Manager.is_first_microbatch)
+                return _CoatLlamaBeforeAttentionResidual.apply(
+                    re_x,
+                    x,
+                    s,
+                    self.q_proj.weight,
+                    None,
+                    None,
+                    weight1_s,
+                    self.k_proj.weight,
+                    None,
+                    None,
+                    weight2_s,
+                    self.v_proj.weight,
+                    None,
+                    None,
+                    weight3_s,
+                    rmsnorm_weight,
+                    self.qargs.group_size,
+                    self.fwobits,
+                    self.layer_id,
+                    self.config,
+                    self.qargs,
+                )
+            else:
+                # Prepare
+                with torch.no_grad():
+                    weight1, weight1_t, weight1_s = self.prepare_weight(
+                        self.q_proj.weight, "q_proj", FP8Manager.is_first_microbatch
+                    )
+                    weight2, weight2_t, weight2_s = self.prepare_weight(
+                        self.k_proj.weight, "k_proj", FP8Manager.is_first_microbatch
+                    )
+                    weight3, weight3_t, weight3_s = self.prepare_weight(
+                        self.v_proj.weight, "v_proj", FP8Manager.is_first_microbatch
+                    )
+                return _CoatLlamaBeforeAttentionResidual.apply(
+                    re_x,
+                    x,
+                    s,
+                    self.q_proj.weight,
+                    weight1,
+                    weight1_t,
+                    weight1_s,
+                    self.k_proj.weight,
+                    weight2,
+                    weight2_t,
+                    weight2_s,
+                    self.v_proj.weight,
+                    weight3,
+                    weight3_t,
+                    weight3_s,
+                    rmsnorm_weight,
+                    self.qargs.group_size,
+                    self.fwobits,
+                    self.layer_id,
+                    self.config,
+                    self.qargs,
+                )
+        else:
+            return re_x, self.att_proj(self.attn_norm(re_x))
+class _CoatLlamaBeforeAttentionResidual(torch.autograd.Function):
+    """Autograd function for RMSNorm followed by the Q / K / V projections, built on the fp8_* kernels,
+    with a hand-written backward pass."""
+    @staticmethod
+    def forward(
+        ctx,
+        re_x,
+        in_x,
+        in_s,
+        weight1_origin,
+        weight1,
+        weight1_t,
+        weight1_s,
+        weight2_origin,
+        weight2,
+        weight2_t,
+        weight2_s,
+        weight3_origin,
+        weight3,
+        weight3_t,
+        weight3_s,
+        rmsnorm_weight,
+        group_size,
+        fwobits,
+        layer_id,
+        config,
+        qargs,
+        eps=1e-5,
+    ):
+        """
+        Args:
+            re_x: residual tensor, returned unchanged.
+            in_x, in_s: quantized input and its scale.
+            weightN_origin / weightN / weightN_t / weightN_s (N = 1, 2, 3 for query, key, value): original weight,
+                FP8 weight, transposed FP8 weight and scale. With `qargs.weight_memory_efficient` the FP8 and
+                transposed entries are expected to be None and are derived from the original weight here.
+            rmsnorm_weight: RMSNorm weight.
+            group_size: group size forwarded to the fp8_* kernels.
+            fwobits: dict of format names; "fabit" must be "E4M3".
+            layer_id, config, qargs: stashed on `ctx` for the backward pass.
+            eps: epsilon forwarded to `fp8_rmsnorm_forward`.
+        Returns:
+            `(re_x, query_states, key_states, value_states)`.
+        Raises:
+            ValueError: if `fwobits["fabit"]` is not "E4M3".
+        """
+        # for autograd
+        if fwobits["fabit"] == "E4M3":
+            # in_x = in_x.to(torch.float8_e4m3fn)
+            # `.view(dtype)` reinterprets the existing storage instead of converting values.
+            in_x = in_x.view(torch.float8_e4m3fn)
+        else:
+            raise ValueError("fabit should be E4M3")
+        # LayerNorm
+        # The transposed output (ln_x_t) is what gets saved for the backward pass below.
+        ln_x, ln_s, ln_x_t, ln_utils = fp8_rmsnorm_forward(
+            in_x, in_s, rmsnorm_weight, group_size, eps, transpose_output_2d=True
+        )
+        # Linear Layer QKV Projection
+        if qargs.weight_memory_efficient:
+            assert weight1 is None  # memory efficient
+            # Produce the FP8 weights on the fly from the original weights and the prepared scales.
+            weight1, weight1_s = fp8_division(weight1_origin, qargs.group_size, fwobits["fwbit"], weight1_s)
+            weight2, weight2_s = fp8_division(weight2_origin, qargs.group_size, fwobits["fwbit"], weight2_s)
+            weight3, weight3_s = fp8_division(weight3_origin, qargs.group_size, fwobits["fwbit"], weight3_s)
+        fc1_x = fp8_linear_forward(ln_x, ln_s, weight1, weight1_s, False, group_size)  # query states
+        fc2_x = fp8_linear_forward(ln_x, ln_s, weight2, weight2_s, False, group_size)  # key states
+        fc3_x = fp8_linear_forward(ln_x, ln_s, weight3, weight3_s, False, group_size)  # value states
+        # ==================== save for backward ====================
+        ctx.save_for_backward(in_x, in_s, ln_x_t, ln_s)
+        if qargs.weight_memory_efficient:
+            assert weight1_t is None and weight2_t is None and weight3_t is None
+            # Keep the original weights; backward re-derives the transposed FP8 weights from them.
+            ctx.weight = weight1_origin, weight1_s, weight2_origin, weight2_s, weight3_origin, weight3_s
+        else:
+            ctx.weight = weight1_t, weight1_s, weight2_t, weight2_s, weight3_t, weight3_s
+        ctx.group_size = group_size
+        ctx.ln_utils = ln_utils
+        ctx.utils = fwobits, layer_id, config, qargs
+        return re_x, fc1_x, fc2_x, fc3_x
+    @staticmethod
+    def backward(ctx, fp_grad, query_g, key_g, value_g):
+        """
+        Args:
+            fp_grad: gradient w.r.t. the residual output `re_x`.
+            query_g, key_g, value_g: gradients w.r.t. the query / key / value outputs.
+        Returns:
+            One entry per argument of `forward` (22 in total): gradients for `re_x`, `in_x`, `in_s`, the three
+            original weights and `rmsnorm_weight`; None everywhere else.
+        """
+        in_x, in_s, ln_x_t, ln_s = ctx.saved_tensors
+        # In memory-efficient mode the *_t names below initially hold the original weights (see forward).
+        weight1_t, weight1_s, weight2_t, weight2_s, weight3_t, weight3_s = ctx.weight
+        group_size = ctx.group_size
+        rms_weight, rstd, num_warps = ctx.ln_utils
+        fwobits, layer_id, config, qargs = ctx.utils
+        # ==================== Begin backward ====================
+        # Quantize the RoPE and FlashAttention Output. grad_input and grad_weight requires different data layout.
+        query_g, query_gs, query_g_t = fp8_quantize_pertensor_transpose(
+            query_g, group_size, fwobits["babit"], transpose_output_2d=True, stochastic=False
+        )
+        key_g, key_gs, key_g_t = fp8_quantize_pertensor_transpose(
+            key_g, group_size, fwobits["babit"], transpose_output_2d=True, stochastic=False
+        )
+        value_g, value_gs, value_g_t = fp8_quantize_pertensor_transpose(
+            value_g, group_size, fwobits["babit"], transpose_output_2d=True, stochastic=False
+        )
+        # Linear Layer QKV Projection
+        if qargs.weight_memory_efficient:
+            # Rebuild the transposed FP8 weights from the original weights saved in forward.
+            weight1_t, weight1_s = fp8_division_transpose(
+                weight1_t, qargs.group_size, fwobits["fwbit"], weight1_s, only_transposed=True
+            )
+            weight2_t, weight2_s = fp8_division_transpose(
+                weight2_t, qargs.group_size, fwobits["fwbit"], weight2_s, only_transposed=True
+            )
+            weight3_t, weight3_s = fp8_division_transpose(
+                weight3_t, qargs.group_size, fwobits["fwbit"], weight3_s, only_transposed=True
+            )
+        # Each call yields the gradient w.r.t. the normalized input and w.r.t. the projection weight.
+        fc1_g1, att_q_wg = fp8_linear_backward(
+            ln_x_t, ln_s, query_g, query_gs, query_g_t, weight1_t, weight1_s, group_size
+        )
+        fc1_g2, att_k_wg = fp8_linear_backward(ln_x_t, ln_s, key_g, key_gs, key_g_t, weight2_t, weight2_s, group_size)
+        fc1_g3, att_v_wg = fp8_linear_backward(
+            ln_x_t, ln_s, value_g, value_gs, value_g_t, weight3_t, weight3_s, group_size
+        )
+        # The three projections share the same input, so their input gradients are summed.
+        fc1_g = fc1_g1 + fc1_g2 + fc1_g3
+        # LayerNorm
+        in_g, rms_weight_grad = fp8_rmsnorm_backward(in_x, in_s, fc1_g, rms_weight, rstd, group_size, num_warps)
+        # Add the gradient together, and prepare the input of the next layer.
+        re_g, (in_g, in_sg, in_sg_g16) = fp8_add_Ifp_Ifp_Ofp_Opt(
+            fp_grad, in_g, group_size, fwobits["babit"], stochastic=False
+        )
+        # for autograd. forward's data type should be the same of backward tensor. this will not change the actual binary representation.
+        in_g = in_g.view(torch.float8_e4m3fn)
+        # Although the next operator is a linear layer in MLPResidual module, we return in_sg_g16 to make the size compatible with the forward. Otherwise it will not pass autograd.
+        return (
+            re_g,
+            in_g,
+            in_sg_g16,
+            att_q_wg,
+            None,
+            None,
+            None,
+            att_k_wg,
+            None,
+            None,
+            None,
+            att_v_wg,
+            None,
+            None,
+            None,
+            rms_weight_grad,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+class CoatLlamaAfterAttentionResidual(FP8CacheWeightModule):
+    """
+    This is a typical transformer attention module that contains (1) Residual (2) 1 * Linear layers
+    """
+    def __init__(self, config: CoatLlamaConfig, qargs: QuantizationConfig, layer_id):
+        super().__init__(config, qargs, layer_id)
+        self.qargs = qargs
+        self.fwobits = {
+            "fabit": self.qargs.fabit,
+            "fwbit": self.qargs.fwbit,
+            "fobit": self.qargs.fobit,
+            "babit": self.qargs.babit,
+            "bwbit": self.qargs.bwbit,
+            "bobit": self.qargs.bobit,
+        }
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+    def forward(self, re_x, in_x):
+        if self.training:
+            if self.qargs.weight_memory_efficient:
+                # prepare for the weight
+                with torch.no_grad():
+                    weight4_s = self.prepare_weight(self.o_proj.weight, "o_proj", FP8Manager.is_first_microbatch)
+                return _CoatLlamaAfterAttentionResidual.apply(
+                    re_x,
+                    in_x,
+                    self.o_proj.weight,
+                    None,
+                    None,
+                    weight4_s,
+                    self.qargs.group_size,
+                    self.fwobits,
+                    self.layer_id,
+                    self.config,
+                    self.qargs,
+                )
+            else:
+                # prepare for the weight
+                with torch.no_grad():
+                    weight4, weight4_t, weight4_s = self.prepare_weight(
+                        self.o_proj.weight, "o_proj", FP8Manager.is_first_microbatch
+                    )
+                return _CoatLlamaAfterAttentionResidual.apply(
+                    re_x,
+                    in_x,
+                    self.o_proj.weight,
+                    weight4,
+                    weight4_t,
+                    weight4_s,
+                    self.qargs.group_size,
+                    self.fwobits,
+                    self.layer_id,
+                    self.config,
+                    self.qargs,
+                )
+        else:
+            return re_x + self.attn_out(in_x), None, None
+class _CoatLlamaAfterAttentionResidual(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx, re_x, flash_x, weight4_origin, weight4, weight4_t, weight4_s, group_size, fwobits, layer_id, config, qargs
+    ):
+        # Quantize the FlashAttention Output
+        flash_qx, flash_s, _ = fp8_quantize_pertensor(
+            flash_x, group_size, fwobits["fabit"]
+        )  # Modified to make it memory efficient
+        # # Attention Projection Linear Layer
+        if qargs.weight_memory_efficient:
+            assert weight4 is None  # memory efficient
+            weight4, weight4_s = fp8_division(weight4_origin, qargs.group_size, fwobits["fwbit"], weight4_s)
+        fc4_x = fp8_linear_forward(flash_qx, flash_s, weight4, weight4_s, False, group_size)  #
+        # import IPython
+        # IPython.embed()
+        # Add the activations together
+        fp_x, (out_x, out_s) = fp8_add_Ifp_Ifp_Ofp_Og16(re_x, fc4_x, flash_qx.dtype, group_size)
+        # ==================== save for backward ====================
+        ctx.save_for_backward(flash_x, flash_s)
+        if qargs.weight_memory_efficient:
+            assert weight4_t is None
+            ctx.weight = weight4_origin, weight4_s
+        else:
+            ctx.weight = weight4_t, weight4_s
+        ctx.group_size = group_size
+        ctx.fwobits = fwobits
+        ctx.utils = fwobits, layer_id, config, qargs
+        # For autograd
+        out_x = out_x.view(torch.float8_e4m3fn)
+        return fp_x, out_x, out_s
+    @staticmethod
+    def backward(ctx, fp_grad, out_g, out_gs):
+        flash_x, flash_s = ctx.saved_tensors
+        weight4_t, weight4_s = ctx.weight
+        group_size = ctx.group_size
+        fwobits = ctx.fwobits
+        fwobits, layer_id, config, qargs = ctx.utils
+        # for autograd
+        if fwobits["babit"] == "E5M2":
+            # out_g = out_g.to(torch.float8_e5m2)
+            out_g = out_g.view(torch.float8_e5m2)
+        else:
+            raise ValueError("babit should be E5M2")
+        out_gs_max = out_gs.max()
+        # ==================== Begin backward ====================
+        # Output Projection
+        out_g_t = fp8_transpose(out_g, transpose_output_2d=True)
+        # We do not save an extra flash_x to save the memory usage
+        flash_x_t, flash_s = fp8_division_transpose(
+            flash_x, group_size, fwobits["fabit"], flash_s, stochastic=False, only_transposed=True
+        )
+        if qargs.weight_memory_efficient:
+            weight4_t, weight4_s = fp8_division_transpose(
+                weight4_t, qargs.group_size, fwobits["fwbit"], weight4_s, only_transposed=True
+            )
+        fc4_g, attn_out_wg = fp8_linear_backward(
+            flash_x_t, flash_s, out_g, out_gs_max, out_g_t, weight4_t, weight4_s, group_size
+        )
+        return fp_grad, fc4_g, attn_out_wg, None, None, None, None, None, None, None, None
+class CoatLlamaMLPResidual(FP8CacheWeightModule):
+    """
+    This is a typical transformer attention module that contains (1) Residual (2) LayerNorm / RMSNorm (3) 2 / 3 * Linear layers
+    (4) GELU / Silu Activation
+    """
+    def __init__(self, config: CoatLlamaConfig, qargs: QuantizationConfig, layer_id, hidden_size: int):
+        super().__init__(config, qargs, layer_id)
+        self.qargs = qargs
+        self.fwobits = {
+            "fabit": self.qargs.fabit,
+            "fwbit": self.qargs.fwbit,
+            "fobit": self.qargs.fobit,
+            "babit": self.qargs.babit,
+            "bwbit": self.qargs.bwbit,
+            "bobit": self.qargs.bobit,
+        }
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+        self.training = True
+        # below is only used when training = False
+        assert config.hidden_act == "silu", "We only support silu activation currently"
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, re_x, x, s, rmsnorm_weight):
+        if self.training:
+            if self.qargs.weight_memory_efficient:  # prepare for the weight
+                with torch.no_grad():
+                    weight1_s = self.prepare_weight(self.gate_proj.weight, "gate_proj", FP8Manager.is_first_microbatch)
+                    weight2_s = self.prepare_weight(self.up_proj.weight, "up_proj", FP8Manager.is_first_microbatch)
+                    weight3_s = self.prepare_weight(self.down_proj.weight, "down_proj", FP8Manager.is_first_microbatch)
+                return _CoatLlamaMLPResidual.apply(
+                    re_x,
+                    x,
+                    s,
+                    self.gate_proj.weight,
+                    None,
+                    None,
+                    weight1_s,
+                    self.up_proj.weight,
+                    None,
+                    None,
+                    weight2_s,
+                    self.down_proj.weight,
+                    None,
+                    None,
+                    weight3_s,
+                    rmsnorm_weight,
+                    self.qargs.group_size,
+                    self.fwobits,
+                    self.layer_id,
+                    self.config,
+                    self.qargs,
+                )
+            else:
+                # prepare for the weight
+                with torch.no_grad():
+                    weight1, weight1_t, weight1_s = self.prepare_weight(
+                        self.gate_proj.weight, "gate_proj", FP8Manager.is_first_microbatch
+                    )
+                    weight2, weight2_t, weight2_s = self.prepare_weight(
+                        self.up_proj.weight, "up_proj", FP8Manager.is_first_microbatch
+                    )
+                    weight3, weight3_t, weight3_s = self.prepare_weight(
+                        self.down_proj.weight, "down_proj", FP8Manager.is_first_microbatch
+                    )
+                return _CoatLlamaMLPResidual.apply(
+                    re_x,
+                    x,
+                    s,
+                    self.gate_proj.weight,
+                    weight1,
+                    weight1_t,
+                    weight1_s,
+                    self.up_proj.weight,
+                    weight2,
+                    weight2_t,
+                    weight2_s,
+                    self.down_proj.weight,
+                    weight3,
+                    weight3_t,
+                    weight3_s,
+                    rmsnorm_weight,
+                    self.qargs.group_size,
+                    self.fwobits,
+                    self.layer_id,
+                    self.config,
+                    self.qargs,
+                )
+        else:
+            raise NotImplementedError("Need TODO")
+            og_x = re_x
+            re_x = self.ff_norm(re_x)
+            re_x = self.ff_proj(re_x)
+            re_x = self.act(re_x)
+            re_x = self.ff_out(re_x)
+            re_x = og_x + re_x
+            return re_x, None, None
+class _CoatLlamaMLPResidual(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        re_x,
+        in_x,
+        in_s,
+        weight1_origin,
+        weight1,
+        weight1_t,
+        weight1_s,
+        weight2_origin,
+        weight2,
+        weight2_t,
+        weight2_s,
+        weight3_origin,
+        weight3,
+        weight3_t,
+        weight3_s,
+        rmsnorm_weight,
+        group_size,
+        fwobits,
+        layer_id,
+        config,
+        qargs,
+        eps=1e-5,
+    ):
+        # For autograd
+        if fwobits["fabit"] == "E4M3":
+            # in_x = in_x.to(torch.float8_e4m3fn)
+            in_x = in_x.view(torch.float8_e4m3fn)
+        else:
+            raise ValueError("fabit should be E4M3")
+        # LayerNorm
+        ln_x, ln_s, ln_x_t, ln_utils = fp8_rmsnorm_forward(
+            in_x, in_s, rmsnorm_weight, group_size, eps, transpose_output_2d=True
+        )
+        # Linear Layer of Up Projection and Gate Projection. They are fused as one linear layer.
+        if qargs.weight_memory_efficient:
+            assert weight1 is None and weight2 is None and weight3 is None  # memory efficient
+            weight1, weight1_s = fp8_division(weight1_origin, qargs.group_size, fwobits["fwbit"], weight1_s)
+            weight2, weight2_s = fp8_division(weight2_origin, qargs.group_size, fwobits["fwbit"], weight2_s)
+            weight3, weight3_s = fp8_division(weight3_origin, qargs.group_size, fwobits["fwbit"], weight3_s)
+        gate_x, gate_s = fp8_linear_forward(ln_x, ln_s, weight1, weight1_s, True, group_size)  # Gate Proj
+        up_x, up_s = fp8_linear_forward(ln_x, ln_s, weight2, weight2_s, True, group_size)  # Up Proj
+        # silu Activation
+        silu_x, silu_s = fp8_silu_forward(gate_x, gate_s, group_size)
+        # Element-wise Multiplication
+        mul_x, mul_s, mul_x_t = fp8_mul_forward(silu_x, silu_s, up_x, up_s, group_size, transpose_output_2d=True)
+        # Output Projection
+        if weight3 is None:  # memory efficient
+            weight3, weight3_s = fp8_division(weight3_origin, qargs.group_size, fwobits["fwbit"], weight3_s)
+        fc3_x = fp8_linear_forward(mul_x, mul_s, weight3, weight3_s, False, group_size)
+        # Add the activation together
+        fp_x, (out_x, out_s) = fp8_add_Ifp_Ifp_Ofp_Og16(re_x, fc3_x, mul_x.dtype, group_size)
+        # ==================== save for backward ====================
+        ctx.save_for_backward(in_x, in_s, ln_x_t, ln_s, gate_x, gate_s, up_x, up_s, silu_x, silu_s, mul_x_t, mul_s)
+        ctx.weight = (weight1_t, weight1_s, weight2_t, weight2_s)
+        if (
+            qargs.weight_memory_efficient
+        ):  # Weight_1/2_origin will not be saved twice, so it will be more memory efficient.
+            assert weight1_t is None and weight2_t is None and weight3_t is None
+            ctx.weight = (weight1_origin, weight1_s, weight2_origin, weight2_s, weight3_origin, weight3_s)
+        else:  # Weight1/2_t is different from the origin weight, so saving it will consumes additional memory footprint.
+            ctx.weight = (weight1_t, weight1_s, weight2_t, weight2_s, weight3_t, weight3_s)
+        ctx.group_size = group_size
+        ctx.ln_utils = ln_utils
+        ctx.utils = fwobits, layer_id, config, qargs
+        out_x = out_x.view(torch.float8_e4m3fn)
+        return fp_x, out_x, out_s
+    @staticmethod
+    def backward(ctx, fp_grad, out_g, out_gs):
+        fwobits, layer_id, config, qargs = ctx.utils
+        in_x, in_s, ln_x_t, ln_s, gate_x, gate_s, up_x, up_s, silu_x, silu_s, mul_x_t, mul_s = ctx.saved_tensors
+        (weight1_t, weight1_s, weight2_t, weight2_s, weight3_t, weight3_s) = ctx.weight
+        group_size = ctx.group_size
+        rms_weight, rstd, num_warps = ctx.ln_utils
+        fwobits, layer_id, config, qargs = ctx.utils
+        # For autograd
+        if fwobits["babit"] == "E5M2":
+            # out_g = out_g.to(torch.float8_e5m2)
+            out_g = out_g.view(torch.float8_e5m2)
+        else:
+            raise ValueError("babit should be E5M2")
+        out_gs_max = out_gs.max()
+        # ==================== Begin backward ====================
+        # Output Projection
+        out_gs = out_gs.max()
+        out_g_t = fp8_transpose(out_g, transpose_output_2d=True)
+        if qargs.weight_memory_efficient:
+            weight3_t, weight3_s = fp8_division_transpose(
+                weight3_t, qargs.group_size, fwobits["fwbit"], weight3_s, only_transposed=True
+            )
+        fc3_g, weight3_grad = fp8_linear_backward(
+            mul_x_t, mul_s, out_g, out_gs_max, out_g_t, weight3_t, weight3_s, group_size
+        )
+        # [MEM TEST]
+        del out_g, out_g_t, weight3_t
+        # Element-wise Multiplication, 1 means gate, 2 means up
+        mul_g1, (mul_g2, mul_gs2, mul_g2_t) = fp8_mul_backward(
+            silu_x, silu_s, up_x, up_s, fc3_g, group_size, fwobits["babit"], output_quantized_transpose=True
+        )
+        # Silu activation
+        silu_g, silu_gs, silu_g_t = fp8_silu_backward(
+            gate_x, gate_s, mul_g1, group_size, fwobits["babit"], output_quantized_transpose=True
+        )
+        # Linear Layer of Up and Gate Projection
+        if qargs.weight_memory_efficient:
+            weight1_t, weight1_s = fp8_division_transpose(
+                weight1_t, group_size, fwobits["fwbit"], weight1_s, only_transposed=True
+            )
+            weight2_t, weight2_s = fp8_division_transpose(
+                weight2_t, group_size, fwobits["fwbit"], weight2_s, only_transposed=True
+            )
+        # Gate Proj
+        fc1_g, weight1_grad = fp8_linear_backward(
+            ln_x_t, ln_s, silu_g, silu_gs, silu_g_t, weight1_t, weight1_s, group_size
+        )
+        fc2_g, weight2_grad = fp8_linear_backward(
+            ln_x_t, ln_s, mul_g2, mul_gs2, mul_g2_t, weight2_t, weight2_s, group_size
+        )
+        fc_g = fc1_g + fc2_g
+        # layerNorm
+        in_g, rms_weight_grad = fp8_rmsnorm_backward(in_x, in_s, fc_g, rms_weight, rstd, group_size, num_warps)
+        # Add the gradient together
+        re_g, (in_g, in_sg, in_sg_g16) = fp8_add_Ifp_Ifp_Ofp_Opt(
+            fp_grad, in_g, group_size, fwobits["babit"], stochastic=False
+        )
+        in_g = in_g.view(torch.float8_e4m3fn)
+        return (
+            re_g,
+            in_g,
+            in_sg_g16,
+            weight1_grad,
+            None,
+            None,
+            None,
+            weight2_grad,
+            None,
+            None,
+            None,
+            weight3_grad,
+            None,
+            None,
+            None,
+            rms_weight_grad,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+class LlamaAttentionWithoutLinear(nn.Module):
+    """
+    Remove the Q/K/V/O projection layer in LlamaAttention module and only calculate the attention logic.
+    The Q/K/V Projection is moved to BeforeAttention Module, and the O Projection is moved to AfterAttention Module.
+    """
+    def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+        # TODO (joao): remove in v4.46 (RoPE is computed in the model, not in the decoder layers)
+        self.rotary_emb = LlamaRotaryEmbedding(config=self.config)
+    def forward(
+        self,
+        query_states: torch.Tensor,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        bsz, q_len, _ = query_states.size()
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        if position_embeddings is None:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, -1)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+class LlamaFlashAttention2WithoutLinear(LlamaAttentionWithoutLinear):
+    """
+    Remove the Q/K/V/O projection layer in LlamaFlashAttention2 module and only calculate the attention logic.
+    The Q/K/V Projection is moved to BeforeAttention Module, and the O Projection is moved to AfterAttention Module.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+    def forward(
+        self,
+        query_states: torch.Tensor,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if isinstance(past_key_value, StaticCache):
+            raise ValueError(
+                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
+                "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
+            )
+        output_attentions = False
+        bsz, q_len, _ = query_states.size()
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x head_dim x hidden_dim
+        # therefore we just need to keep the original shape
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        if position_embeddings is None:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+        dropout_rate = self.attention_dropout if self.training else 0.0
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in the correct dtype just to be sure everything works as expected.
+        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+        # in fp32. (LlamaRMSNorm handles it correctly)
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+        attn_output = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            q_len,
+            position_ids=position_ids,
+            dropout=dropout_rate,
+            sliding_window=getattr(self, "sliding_window", None),
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+            is_causal=self.is_causal,
+        )
+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+class LlamaSdpaAttentionWithoutLinear(LlamaAttentionWithoutLinear):
+    """
+    Remove the Q/K/V/O projection layer in LlamaSdpaAttention module and only calculate the attention logic.
+    The Q/K/V Projection is moved to BeforeAttention Module, and the O Projection is moved to AfterAttention Module.
+    """
+    # Adapted from LlamaAttention.forward
+    def forward(
+        self,
+        query_states: torch.Tensor,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if output_attentions:
+            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "LlamaModel is using LlamaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                query_states=query_states,
+                key_states=key_states,
+                value_states=value_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+            )
+        bsz, q_len, _ = query_states.size()
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        if position_embeddings is None:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        causal_mask = attention_mask
+        if attention_mask is not None:
+            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and causal_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        is_causal = True if causal_mask is None and q_len > 1 else False
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            is_causal=is_causal,
+        )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, -1)
+        return attn_output, None, past_key_value
+COAT_LLAMA_ATTENTION_CLASSES = {
+    "eager": LlamaAttentionWithoutLinear,
+    "flash_attention_2": LlamaFlashAttention2WithoutLinear,
+    "sdpa": LlamaSdpaAttentionWithoutLinear,
+}
+class CoatLlamaDecoderLayer(nn.Module):
+    def __init__(self, config: CoatLlamaConfig, layer_idx: int):
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.hidden_size = config.hidden_size
+        self.self_attn = COAT_LLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
+        self.qargs = QuantizationConfig(**config.coat_fp8_args)
+        self.BeforeAttention = CoatLlamaBeforeAttentionResidual(config, self.qargs, layer_idx)
+        self.AfterAttention = CoatLlamaAfterAttentionResidual(config, self.qargs, layer_idx)
+        self.MLPResidual = CoatLlamaMLPResidual(config, self.qargs, layer_idx, self.hidden_size)
+        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        quant_hidden_states: torch.Tensor,
+        scale_hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): BF16 input to the layer of shape `(batch, seq_len, embed_dim)`
+            quant_hidden_states (`torch.float8_e4m3fn`): FP8 input to the layer of shape `(batch, seq_len, embed_dim)`
+            scale_hidden_states (`torch.bfloat16`): BF16 scaling factor to the layer of shape `(batch, seq_len, embed_dim // group_size)`
+            attention_mask (`torch.FloatTensor`, *optional*):
+                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+                query_sequence_length, key_sequence_length)` if default attention is used.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence
+            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+            kwargs (`dict`, *optional*):
+                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
+                into the model
+        """
+        # Coat: The residual, LayerNorm, and the Q/K/V Projection Linear Layer
+        residual, query_states, key_states, value_states = self.BeforeAttention(
+            hidden_states, quant_hidden_states, scale_hidden_states, self.input_layernorm.weight
+        )
+        # Self Attention without any linear layer
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            query_states=query_states,
+            key_states=key_states,
+            value_states=value_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        # Coat: The Output Projection Linear Layer and Residual
+        hidden_states, quant_hidden_states, scale_hidden_states = self.AfterAttention(residual, hidden_states)
+        # Residual Connection, LayerNorm, and the whole MLP module
+        hidden_states, quant_hidden_states, scale_hidden_states = self.MLPResidual(
+            hidden_states, quant_hidden_states, scale_hidden_states, self.post_attention_layernorm.weight
+        )
+        outputs = ((hidden_states, quant_hidden_states, scale_hidden_states),)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        if use_cache:
+            outputs += (present_key_value,)
+        return outputs
+class CoatLlamaPreTrainedModel(PreTrainedModel):
+    config_class = CoatLlamaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["LlamaDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_cache_class = True
+    _supports_quantized_cache = True
+    _supports_static_cache = True
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+class CoatLlamaModel(CoatLlamaPreTrainedModel):
+    """
+    Coat Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`CoatLlamaDecoderLayer`]
+    Args:
+        config: CoatLlamaConfig
+    """
+    def __init__(self, config: CoatLlamaConfig):
+        """Create the token embedding, the decoder layers, the final norm, the shared rotary embedding and the Coat quantize helpers."""
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [CoatLlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = LlamaRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+        # Quantize
+        # `qargs` is built from `config.coat_fp8_args`; the two helpers below are applied once before and
+        # once after the decoder stack in forward()
+        self.qargs = QuantizationConfig(**config.coat_fp8_args)
+        self.quantize_input_before_block = Coat_quantize_bgn(self.qargs)
+        self.quantize_output_after_block = Coat_quantize_end(self.qargs)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        """Return the token embedding module."""
+        return self.embed_tokens
+    def set_input_embeddings(self, value):
+        """Replace the token embedding module with `value`."""
+        self.embed_tokens = value
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        """
+        Embed the inputs, run them through every decoder layer and apply the final norm.
+        Exactly one of `input_ids` / `inputs_embeds` must be provided. `use_cache`, `output_attentions`,
+        `output_hidden_states` and `return_dict` default to the corresponding config values when `None`.
+        Returns:
+            A `BaseModelOutputWithPast`, or (when `return_dict` is false) a tuple of the non-`None` values among
+            last hidden state, cache, all hidden states and all attentions.
+        Raises:
+            ValueError: if both or neither of `input_ids` and `inputs_embeds` are given.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # XOR: true when both inputs are given or when both are missing
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        # When the caller did not pass a `Cache` object, work on a DynamicCache internally and convert the
+        # result back to the legacy format before returning
+        return_legacy_cache = False
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
+        # Default positions continue from the number of tokens already held by the cache
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+        hidden_states = inputs_embeds
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+        # Prepare the input for Coat decoderlayer
+        # The helper returns the (hidden, quantized hidden, scale) triple that every decoder layer consumes
+        hidden_states, quant_hidden_states, scale_hidden_states = self.quantize_input_before_block(hidden_states)
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                # Arguments are passed positionally here, so their order must follow the parameter order of
+                # CoatLlamaDecoderLayer.forward
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    quant_hidden_states,
+                    scale_hidden_states,
+                    causal_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                    cache_position,
+                    position_embeddings,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    quant_hidden_states,
+                    scale_hidden_states,
+                    attention_mask=causal_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    cache_position=cache_position,
+                    position_embeddings=position_embeddings,
+                )
+            # layer_outputs[0] is the state triple; the cache entry comes after the optional attention weights
+            hidden_states, quant_hidden_states, scale_hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        # Summarize the output of the Decoder Layer
+        hidden_states = self.quantize_output_after_block(hidden_states, quant_hidden_states, scale_hidden_states)
+        hidden_states = self.norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+    # Reuse the causal-mask construction of the stock LlamaModel as a method of this class
+    _update_causal_mask = LlamaModel._update_causal_mask
+class CoatLlamaForCausalLM(CoatLlamaPreTrainedModel, GenerationMixin):
+    """Causal language model: a `CoatLlamaModel` backbone followed by a bias-free linear `lm_head` over the vocabulary."""
+    _tied_weights_keys = ["lm_head.weight"]
+    def __init__(self, config):
+        """Build the backbone and the LM head from `config`, then run `post_init()`."""
+        super().__init__(config)
+        self.model = CoatLlamaModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        """Return the backbone's token embedding module."""
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        """Replace the backbone's token embedding module with `value`."""
+        self.model.embed_tokens = value
+    def get_output_embeddings(self):
+        """Return the LM head."""
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        """Replace the LM head with `new_embeddings`."""
+        self.lm_head = new_embeddings
+    def set_decoder(self, decoder):
+        """Replace the backbone model with `decoder`."""
+        self.model = decoder
+    def get_decoder(self):
+        """Return the backbone model."""
+        return self.model
+    # The forward pass and the generation input preparation are taken unchanged from the stock
+    # LlamaForCausalLM and bound here as methods of this class
+    forward = LlamaForCausalLM.forward
+    prepare_inputs_for_generation = LlamaForCausalLM.prepare_inputs_for_generation
+# TODO: Coat versions of the task heads listed below have not been implemented yet.
+# class LlamaForSequenceClassification(LlamaPreTrainedModel):
+# class LlamaForQuestionAnswering(LlamaPreTrainedModel):
+# class LlamaForTokenClassification(LlamaPreTrainedModel):
+def make_state_dict_compatible(state_dict: dict[str, torch.Tensor]):
+    compatible_state_dict = {}
+    for key, value in state_dict.items():
+        if fnmatch(key, "*self_attn.q_proj*"):
+            new_key = key.replace("self_attn.q_proj", "BeforeAttention.q_proj")
+        elif fnmatch(key, "*self_attn.k_proj*"):
+            new_key = key.replace("self_attn.k_proj", "BeforeAttention.k_proj")
+        elif fnmatch(key, "*self_attn.v_proj*"):
+            new_key = key.replace("self_attn.v_proj", "BeforeAttention.v_proj")
+        elif fnmatch(key, "*self_attn.o_proj*"):
+            new_key = key.replace("self_attn.o_proj", "AfterAttention.o_proj")
+        elif fnmatch(key, "*mlp.gate_proj*"):
+            new_key = key.replace("mlp.gate_proj", "MLPResidual.gate_proj")
+        elif fnmatch(key, "*mlp.up_proj*"):
+            new_key = key.replace("mlp.up_proj", "MLPResidual.up_proj")
+        elif fnmatch(key, "*mlp.down_proj*"):
+            new_key = key.replace("mlp.down_proj", "MLPResidual.down_proj")
+        else:
+            new_key = key
+        compatible_state_dict[new_key] = value
+    return compatible_state_dict
+AutoConfig.register("fp8_llama", CoatLlamaConfig)
+AutoModel.register(CoatLlamaConfig, CoatLlamaModel)
+AutoModelForCausalLM.register(CoatLlamaConfig, CoatLlamaForCausalLM)

llava/model/coat/activation/models/coat_llama_convert_from_hf.py ADDED Viewed

	@@ -0,0 +1,71 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+import argparse
+import os
+from dataclasses import asdict, dataclass, field
+from typing import Optional
+import torch
+import transformers
+from coat.activation.models._fp8_quantization_config import QuantizationConfig
+from coat.activation.models.coat_llama import CoatLlamaConfig, CoatLlamaForCausalLM, make_state_dict_compatible
+from transformers import AutoConfig, AutoModelForCausalLM
+@dataclass
+class ConvertArguments:
+    model_name: str = field(metadata={"help": "The model name or path to download the LLaMA model"})
+    save_path: str = field(metadata={"help": "The path where the converted model weights will be saved"})
+    cache_dir: str = field(default=None, metadata={"help": "Directory to cache the model"})
+def download_and_convert_llama(convert_args: ConvertArguments, quantization_args: QuantizationConfig):
+    """
+    Downloads a LLaMA model, converts its weights using `make_state_dict_compatible`,
+    and saves the converted model.
+    Args:
+        model_name (str): The model name or path to download the LLaMA model.
+        save_path (str): The path where the converted model weights will be saved.
+        cache_dir (Optional[str]): Directory to cache the model. Defaults to None.
+    Returns:
+        None
+    """
+    model_name = convert_args.model_name
+    save_path = convert_args.save_path
+    cache_dir = convert_args.cache_dir
+    # Step 1: Download the original LLaMA model
+    model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
+    # Step 2: Initialize the model configuration for FP8 or other custom config
+    config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
+    # Step 3: Apply make_state_dict_compatible to convert weights
+    compatible_state_dict = make_state_dict_compatible(model.state_dict())
+    # Step 4: Create a new model instance with compatible configuration
+    fp8_config = CoatLlamaConfig(**config.to_dict())
+    fp8_config.coat_fp8_args = asdict(quantization_args)
+    converted_model = AutoModelForCausalLM.from_config(fp8_config)
+    converted_model.load_state_dict(compatible_state_dict)
+    # Step 5: Save the converted model and configuration using save_pretrained
+    os.makedirs(save_path, exist_ok=True)
+    converted_model.save_pretrained(save_path)
+    print(f"Converted model saved at {save_path}")
+if __name__ == "__main__":
+    # Parse command-line arguments
+    parser = transformers.HfArgumentParser((ConvertArguments, QuantizationConfig))  # NOTE: FP8
+    convert_args, quantization_args = parser.parse_args_into_dataclasses()
+    # Call the function with parsed arguments
+    download_and_convert_llama(convert_args, quantization_args)

llava/model/coat/activation/models/coat_olmo.py ADDED Viewed

	@@ -0,0 +1,1942 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+"""
+Adapted from
+[MosaicML](https://github.com/mosaicml/examples.git) and
+[minGPT](https://github.com/karpathy/minGPT.git)
+"""
+from __future__ import annotations
+import logging
+import math
+import sys
+from abc import abstractmethod
+from collections import defaultdict
+from functools import partial
+from typing import Callable, Dict, Iterable, List, NamedTuple, Optional, Sequence, Set, Tuple, cast
+import torch
+import torch.backends.cuda
+import torch.nn as nn
+import torch.nn.functional as F
+from olmo.aliases import PathOrStr
+from olmo.beam_search import BeamSearch, Constraint, FinalSequenceScorer, Sampler
+from olmo.config import (
+    ActivationCheckpointingStrategy,
+    ActivationType,
+    BlockType,
+    CheckpointType,
+    FSDPWrapStrategy,
+    InitFnType,
+    LayerNormType,
+    ModelConfig,
+    QuantActivationConfig,
+    ShardedCheckpointerType,
+    TrainConfig,
+)
+from olmo.exceptions import OLMoConfigurationError
+from olmo.initialization import init_normal
+from olmo.model import (
+    Activation,
+    BufferCache,
+    Dropout,
+    LayerNorm,
+    LayerNormBase,
+    OLMo,
+    OLMoBlock,
+    OLMoBlockGroup,
+    OLMoGenerateOutput,
+    OLMoOutput,
+    RMSLayerNorm,
+    RotaryEmbedding,
+    _non_meta_init_device,
+    activation_checkpoint_function,
+    alibi_attention_bias,
+    causal_attention_bias,
+    get_causal_attention_bias,
+    should_checkpoint_block,
+)
+from olmo.torch_util import ensure_finite_, get_cumulative_document_lengths
+from torch import einsum
+from ..real_quantization import (
+    Coat_quantize_bgn,
+    Coat_quantize_end,
+    fp8_add_Ifp_Ifp_Ofp_Og16,
+    fp8_add_Ifp_Ifp_Ofp_Opt,
+    fp8_division,
+    fp8_division_transpose,
+    fp8_gelu_backward,
+    fp8_gelu_forward,
+    fp8_layernorm_noparam_backward,
+    fp8_layernorm_noparam_forward,
+    fp8_linear_backward,
+    fp8_linear_forward,
+    fp8_mul_backward,
+    fp8_mul_forward,
+    fp8_quantize,
+    fp8_quantize_pertensor,
+    fp8_quantize_pertensor_transpose,
+    fp8_rmsnorm_backward,
+    fp8_rmsnorm_forward,
+    fp8_silu_backward,
+    fp8_silu_forward,
+    fp8_transpose,
+)
+from ._fp8_weightcache import FP8CacheWeightModule
+from ._fp8manager import FP8Manager
+if sys.version_info.minor > 8:
+    from collections.abc import MutableMapping
+elif sys.version_info.minor == 8:
+    from typing import MutableMapping
+else:
+    raise SystemExit("This script supports Python 3.8 or higher")
+# Names exported by a star-import of this module.
+# NOTE(review): "GELU", "ReLU", "SwiGLU" and "OLMoSequentialBlock" are not among the names
+# imported above; confirm they are defined further down in this module, otherwise
+# `from ... import *` will fail on the missing attribute.
+__all__ = [
+    "LayerNormBase",
+    "LayerNorm",
+    "RMSLayerNorm",
+    "RotaryEmbedding",
+    "Activation",
+    "GELU",
+    "ReLU",
+    "SwiGLU",
+    "OLMoBlock",
+    "OLMoSequentialBlock",
+    "OLMo",
+    "OLMoOutput",
+    "OLMoGenerateOutput",
+]
+# Module-level logger named after this module.
+log = logging.getLogger(__name__)
+class CoatOLMoBeforeAttentionResidual(FP8CacheWeightModule):
+    """
+    This is a typical transformer attention module that contains (1) Residual (2) LayerNorm / RMSNorm (3) 1 * Linear layers
+    """
+    def __init__(self, config: ModelConfig, qargs: QuantActivationConfig, layer_id, fused_dims: tuple):
+        super().__init__(config, qargs, layer_id)
+        self.qargs = qargs
+        self.fwobits = {
+            "fabit": self.qargs.fabit,
+            "fwbit": self.qargs.fwbit,
+            "fobit": self.qargs.fobit,
+            "babit": self.qargs.babit,
+            "bwbit": self.qargs.bwbit,
+            "bobit": self.qargs.bobit,
+        }
+        self.ln_normalized_shape = config.d_model
+        self.att_proj = nn.Linear(config.d_model, sum(fused_dims), bias=config.include_bias, device=config.init_device)
+        self.attn_norm = LayerNorm.build(config)
+    def forward(self, re_x, x, s):
+        if self.training:
+            if self.qargs.weight_memory_efficient:
+                # Prepare
+                with torch.no_grad():
+                    weight1_s = self.prepare_weight(self.att_proj.weight, "att_proj", FP8Manager.is_first_microbatch)
+                return _CoatOLMoBeforeAttentionResidual.apply(
+                    re_x,
+                    x,
+                    s,
+                    self.att_proj.weight,
+                    None,
+                    None,
+                    weight1_s,
+                    self.qargs.group_size,
+                    self.fwobits,
+                    self.layer_id,
+                    self.config,
+                    self.qargs,
+                )
+            else:
+                # Prepare
+                with torch.no_grad():
+                    weight1, weight1_t, weight1_s = self.prepare_weight(
+                        self.att_proj.weight, "att_proj", FP8Manager.is_first_microbatch
+                    )
+                return _CoatOLMoBeforeAttentionResidual.apply(
+                    re_x,
+                    x,
+                    s,
+                    self.att_proj.weight,
+                    weight1,
+                    weight1_t,
+                    weight1_s,
+                    self.qargs.group_size,
+                    self.fwobits,
+                    self.layer_id,
+                    self.config,
+                    self.qargs,
+                )
+        else:
+            return re_x, self.att_proj(self.attn_norm(re_x))
+class _CoatOLMoBeforeAttentionResidual(torch.autograd.Function):
+    """
+    Autograd function used by `CoatOLMoBeforeAttentionResidual` in training mode. The forward
+    pass chains `fp8_layernorm_noparam_forward` and `fp8_linear_forward`; the backward pass is
+    written by hand with the matching `*_backward` kernels.
+    """
+    @staticmethod
+    def forward(
+        ctx,
+        re_x,
+        in_x,
+        in_s,
+        weight1_origin,
+        weight1,
+        weight1_t,
+        weight1_s,
+        group_size,
+        fwobits,
+        layer_id,
+        config,
+        qargs,
+        eps=1e-5,
+    ):
+        """
+        Return `(re_x, fc1_x)`: `re_x` is passed through untouched, `fc1_x` is the linear output.
+        When `qargs.weight_memory_efficient` is set, `weight1` and `weight1_t` must be None and the
+        quantized weight is rebuilt from `weight1_origin` and `weight1_s`.
+        Raises:
+            ValueError: if `fwobits["fabit"]` is not "E4M3".
+        """
+        # for autograd: reinterpret the incoming tensor as float8_e4m3fn with `view` (no value cast).
+        if fwobits["fabit"] == "E4M3":
+            in_x = in_x.view(torch.float8_e4m3fn)
+        else:
+            raise ValueError("fabit should be E4M3")
+        # LayerNorm. `ln_x_t` is kept for the backward pass, `ln_utils` carries (mean, rstd, num_warps).
+        ln_x, ln_s, ln_x_t, ln_utils = fp8_layernorm_noparam_forward(
+            in_x, in_s, group_size, eps, transpose_output_2d=True
+        )
+        # Linear Layer QKV Projection
+        if qargs.weight_memory_efficient:
+            assert weight1 is None  # memory efficient
+            weight1, weight1_s = fp8_division(weight1_origin, qargs.group_size, fwobits["fwbit"], weight1_s)
+        fc1_x = fp8_linear_forward(ln_x, ln_s, weight1, weight1_s, False, group_size)
+        # ==================== save for backward ====================
+        ctx.save_for_backward(in_x, in_s, ln_x_t, ln_s)
+        # In memory-efficient mode the original weight is stored and re-quantized in backward;
+        # otherwise the pre-computed transposed weight is stored.
+        if qargs.weight_memory_efficient:
+            assert weight1_t is None
+            ctx.weight = weight1_origin, weight1_s
+        else:
+            ctx.weight = weight1_t, weight1_s
+        ctx.group_size = group_size
+        ctx.ln_utils = ln_utils
+        ctx.utils = fwobits, layer_id, config, qargs
+        return re_x, fc1_x
+    @staticmethod
+    def backward(ctx, fp_grad, flash_g):
+        """
+        Compute gradients for the forward inputs. `fp_grad` is the gradient of the returned `re_x`
+        and `flash_g` the gradient of `fc1_x`. Thirteen values are returned, one per forward
+        argument (including `eps`); only the first four are non-None.
+        """
+        in_x, in_s, ln_x_t, ln_s = ctx.saved_tensors
+        # In memory-efficient mode `weight1_t` holds the original weight until it is re-quantized below.
+        weight1_t, weight1_s = ctx.weight
+        group_size = ctx.group_size
+        mean, rstd, num_warps = ctx.ln_utils
+        fwobits, layer_id, config, qargs = ctx.utils
+        # ==================== Begin backward ====================
+        # Quantize the RoPE and FlashAttention Output. grad_input and grad_weight require different data layouts.
+        flash_g, flash_gs, flash_g_t = fp8_quantize_pertensor_transpose(
+            flash_g, group_size, fwobits["babit"], transpose_output_2d=True, stochastic=False
+        )
+        # Linear Layer QKV Projection
+        if qargs.weight_memory_efficient:
+            weight1_t, weight1_s = fp8_division_transpose(
+                weight1_t, qargs.group_size, fwobits["fwbit"], weight1_s, only_transposed=True
+            )
+        fc1_g, att_proj_wg = fp8_linear_backward(
+            ln_x_t, ln_s, flash_g, flash_gs, flash_g_t, weight1_t, weight1_s, group_size
+        )
+        # LayerNorm
+        in_g = fp8_layernorm_noparam_backward(in_x, in_s, fc1_g, group_size, mean, rstd, num_warps)
+        # Add the gradient together, and prepare the input of the next layer.
+        re_g, (in_g, in_sg, in_sg_g16) = fp8_add_Ifp_Ifp_Ofp_Opt(
+            fp_grad, in_g, group_size, fwobits["babit"], stochastic=False
+        )
+        # For autograd: the gradient dtype must match the dtype of the forward input. The view does
+        # not change the actual binary representation.
+        in_g = in_g.view(torch.float8_e4m3fn)
+        # Although the next operator is a linear layer in MLPResidual module, we return in_sg_g16 to make the size compatible with the forward. Otherwise it will not pass autograd.
+        return re_g, in_g, in_sg_g16, att_proj_wg, None, None, None, None, None, None, None, None, None
+class CoatOLMoAfterAttentionResidual(FP8CacheWeightModule):
+    """
+    This is a typical transformer attention module that contains (1) Residual (2) 1 * Linear layers
+    """
+    def __init__(self, config: ModelConfig, qargs: QuantActivationConfig, layer_id):
+        super().__init__(config, qargs, layer_id)
+        self.qargs = qargs
+        self.fwobits = {
+            "fabit": self.qargs.fabit,
+            "fwbit": self.qargs.fwbit,
+            "fobit": self.qargs.fobit,
+            "babit": self.qargs.babit,
+            "bwbit": self.qargs.bwbit,
+            "bobit": self.qargs.bobit,
+        }
+        self.attn_out = nn.Linear(config.d_model, config.d_model, bias=config.include_bias, device=config.init_device)
+    def forward(self, re_x, in_x):
+        if self.training:
+            if self.qargs.weight_memory_efficient:
+                # prepare for the weight
+                with torch.no_grad():
+                    weight2_s = self.prepare_weight(self.attn_out.weight, "attn_out", FP8Manager.is_first_microbatch)
+                return _CoatOLMoAfterAttentionResidual.apply(
+                    re_x,
+                    in_x,
+                    self.attn_out.weight,
+                    None,
+                    None,
+                    weight2_s,
+                    self.qargs.group_size,
+                    self.fwobits,
+                    self.layer_id,
+                    self.config,
+                    self.qargs,
+                )
+            else:
+                # prepare for the weight
+                with torch.no_grad():
+                    weight2, weight2_t, weight2_s = self.prepare_weight(
+                        self.attn_out.weight, "attn_out", FP8Manager.is_first_microbatch
+                    )
+                return _CoatOLMoAfterAttentionResidual.apply(
+                    re_x,
+                    in_x,
+                    self.attn_out.weight,
+                    weight2,
+                    weight2_t,
+                    weight2_s,
+                    self.qargs.group_size,
+                    self.fwobits,
+                    self.layer_id,
+                    self.config,
+                    self.qargs,
+                )
+        else:
+            return re_x + self.attn_out(in_x), None, None
+class _CoatOLMoAfterAttentionResidual(torch.autograd.Function):
+    """
+    Autograd function used by `CoatOLMoAfterAttentionResidual` in training mode: quantizes the
+    attention output, applies the `attn_out` projection with `fp8_linear_forward`, and adds the
+    residual. The backward pass is written by hand.
+    """
+    @staticmethod
+    def forward(
+        ctx, re_x, flash_x, weight2_origin, weight2, weight2_t, weight2_s, group_size, fwobits, layer_id, config, qargs
+    ):
+        """
+        Return `(fp_x, out_x, out_s)` as produced by `fp8_add_Ifp_Ifp_Ofp_Og16`, with `out_x`
+        viewed as float8_e4m3fn. When `qargs.weight_memory_efficient` is set, `weight2` and
+        `weight2_t` must be None and the quantized weight is rebuilt from `weight2_origin`.
+        """
+        # Quantize the FlashAttention Output
+        flash_qx, flash_s, _ = fp8_quantize_pertensor(
+            flash_x, group_size, fwobits["fabit"]
+        )  # Modified to make it memory efficient
+        # Attention Projection Linear Layer
+        if qargs.weight_memory_efficient:
+            assert weight2 is None  # memory efficient
+            weight2, weight2_s = fp8_division(weight2_origin, qargs.group_size, fwobits["fwbit"], weight2_s)
+        fc2_x = fp8_linear_forward(flash_qx, flash_s, weight2, weight2_s, False, group_size)
+        # Add the activations together
+        fp_x, (out_x, out_s) = fp8_add_Ifp_Ifp_Ofp_Og16(re_x, fc2_x, flash_qx.dtype, group_size)
+        # ==================== save for backward ====================
+        # The un-quantized `flash_x` is saved together with its scale; backward re-quantizes it.
+        ctx.save_for_backward(flash_x, flash_s)
+        if qargs.weight_memory_efficient:
+            assert weight2_t is None
+            ctx.weight = weight2_origin, weight2_s
+        else:
+            ctx.weight = weight2_t, weight2_s
+        ctx.group_size = group_size
+        # NOTE: `fwobits` is stored twice, here and as the first element of `ctx.utils`.
+        ctx.fwobits = fwobits
+        ctx.utils = fwobits, layer_id, config, qargs
+        # For autograd
+        out_x = out_x.view(torch.float8_e4m3fn)
+        return fp_x, out_x, out_s
+    @staticmethod
+    def backward(ctx, fp_grad, out_g, out_gs):
+        """
+        Compute gradients for the forward inputs. `fp_grad` is returned unchanged as the gradient
+        of `re_x`; eleven values are returned, one per forward argument, and only the first three
+        are non-None.
+        Raises:
+            ValueError: if `fwobits["babit"]` is not "E5M2".
+        """
+        flash_x, flash_s = ctx.saved_tensors
+        # In memory-efficient mode `weight2_t` holds the original weight until it is re-quantized below.
+        weight2_t, weight2_s = ctx.weight
+        group_size = ctx.group_size
+        # NOTE: this value is immediately replaced by the unpacking of `ctx.utils` on the next line.
+        fwobits = ctx.fwobits
+        fwobits, layer_id, config, qargs = ctx.utils
+        # for autograd: reinterpret the incoming gradient as float8_e5m2 with `view` (no value cast).
+        if fwobits["babit"] == "E5M2":
+            out_g = out_g.view(torch.float8_e5m2)
+        else:
+            raise ValueError("babit should be E5M2")
+        # Reduce the gradient scales to their maximum, which is what the linear backward receives.
+        out_gs_max = out_gs.max()
+        # ==================== Begin backward ====================
+        # Output Projection
+        out_g_t = fp8_transpose(out_g, transpose_output_2d=True)
+        # We do not save an extra flash_x to save the memory usage
+        flash_x_t, flash_s = fp8_division_transpose(
+            flash_x, group_size, fwobits["fabit"], flash_s, stochastic=False, only_transposed=True
+        )
+        if qargs.weight_memory_efficient:
+            weight2_t, weight2_s = fp8_division_transpose(
+                weight2_t, qargs.group_size, fwobits["fwbit"], weight2_s, only_transposed=True
+            )
+        fc2_g, attn_out_wg = fp8_linear_backward(
+            flash_x_t, flash_s, out_g, out_gs_max, out_g_t, weight2_t, weight2_s, group_size
+        )
+        return fp_grad, fc2_g, attn_out_wg, None, None, None, None, None, None, None, None
+class CoatOLMoMLPResidual(FP8CacheWeightModule):
+    """
+    This is a typical transformer attention module that contains (1) Residual (2) LayerNorm / RMSNorm (3) 2 / 3 * Linear layers
+    (4) GELU / Silu Activation
+    """
+    def __init__(self, config: ModelConfig, qargs: QuantActivationConfig, layer_id, hidden_size: int):
+        super().__init__(config, qargs, layer_id)
+        self.qargs = qargs
+        self.fwobits = {
+            "fabit": self.qargs.fabit,
+            "fwbit": self.qargs.fwbit,
+            "fobit": self.qargs.fobit,
+            "babit": self.qargs.babit,
+            "bwbit": self.qargs.bwbit,
+            "bobit": self.qargs.bobit,
+        }
+        self.ln_normalized_shape = config.d_model
+        self.act_output_multiplier = 0.5 if config.activation_type == ActivationType.swiglu else 1
+        self.ff_proj = nn.Linear(config.d_model, hidden_size, bias=config.include_bias, device=config.init_device)
+        self.ff_out = nn.Linear(
+            int(self.act_output_multiplier * hidden_size),
+            config.d_model,
+            bias=config.include_bias,
+            device=config.init_device,
+        )
+        self.training = True
+        # below is only used when training = False
+        self.ff_norm = LayerNorm.build(config)
+        self.act = Activation.build(config)
+        assert (self.act.output_multiplier * hidden_size) % 1 == 0
+    def forward(self, re_x, x, s):
+        if self.training:
+            if self.qargs.weight_memory_efficient:  # prepare for the weight
+                with torch.no_grad():
+                    weight1_s = self.prepare_weight(self.ff_proj.weight, "ff_proj", FP8Manager.is_first_microbatch)
+                    weight2_s = self.prepare_weight(self.ff_out.weight, "ff_out", FP8Manager.is_first_microbatch)
+                return _CoatOLMoMLPResidual.apply(
+                    re_x,
+                    x,
+                    s,
+                    self.ff_proj.weight,
+                    None,
+                    None,
+                    weight1_s,
+                    self.ff_out.weight,
+                    None,
+                    None,
+                    weight2_s,
+                    self.qargs.group_size,
+                    self.fwobits,
+                    self.layer_id,
+                    self.config,
+                    self.qargs,
+                )
+            else:
+                # prepare for the weight
+                with torch.no_grad():
+                    weight1, weight1_t, weight1_s = self.prepare_weight(
+                        self.ff_proj.weight, "ff_proj", FP8Manager.is_first_microbatch
+                    )
+                    weight2, weight2_t, weight2_s = self.prepare_weight(
+                        self.ff_out.weight, "ff_out", FP8Manager.is_first_microbatch
+                    )
+                return _CoatOLMoMLPResidual.apply(
+                    re_x,
+                    x,
+                    s,
+                    self.ff_proj.weight,
+                    weight1,
+                    weight1_t,
+                    weight1_s,
+                    self.ff_out.weight,
+                    weight2,
+                    weight2_t,
+                    weight2_s,
+                    self.qargs.group_size,
+                    self.fwobits,
+                    self.layer_id,
+                    self.config,
+                    self.qargs,
+                )
+        else:
+            og_x = re_x
+            re_x = self.ff_norm(re_x)
+            re_x = self.ff_proj(re_x)
+            re_x = self.act(re_x)
+            re_x = self.ff_out(re_x)
+            re_x = og_x + re_x
+            return re_x, None, None
+class _CoatOLMoMLPResidual(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        re_x,
+        in_x,
+        in_s,
+        weight1_origin,
+        weight1,
+        weight1_t,
+        weight1_s,
+        weight2_origin,
+        weight2,
+        weight2_t,
+        weight2_s,
+        group_size,
+        fwobits,
+        layer_id,
+        config,
+        qargs,
+        eps=1e-5,
+    ):
+        # For autograd
+        if fwobits["fabit"] == "E4M3":
+            # in_x = in_x.to(torch.float8_e4m3fn)
+            in_x = in_x.view(torch.float8_e4m3fn)
+        else:
+            raise ValueError("fabit should be E4M3")
+        # LayerNorm
+        ln_x, ln_s, ln_x_t, ln_utils = fp8_layernorm_noparam_forward(
+            in_x, in_s, group_size, eps, transpose_output_2d=True
+        )
+        # Linear Layer of Up Projection and Gate Projection. They are fused as one linear layer.
+        if qargs.weight_memory_efficient:
+            assert weight1 is None  # memory efficient
+            weight1, weight1_s = fp8_division(weight1_origin, qargs.group_size, fwobits["fwbit"], weight1_s)
+        fc1_x, fc1_s = fp8_linear_forward(ln_x, ln_s, weight1, weight1_s, True, group_size)
+        # NOTE: Becareful of the order
+        up_x, gate_x = fc1_x.chunk(2, dim=-1)
+        up_s, gate_s = fc1_s.chunk(2, dim=-1)
+        # silu Activation
+        silu_x, silu_s = fp8_silu_forward(gate_x, gate_s, group_size)
+        # Element-wise Multiplication
+        mul_x, mul_s, mul_x_t = fp8_mul_forward(silu_x, silu_s, up_x, up_s, group_size, transpose_output_2d=True)
+        # Output Projection
+        if weight2 is None:  # memory efficient
+            weight2, weight2_s = fp8_division(weight2_origin, qargs.group_size, fwobits["fwbit"], weight2_s)
+        fc2_x = fp8_linear_forward(mul_x, mul_s, weight2, weight2_s, False, group_size)
+        # Add the activation together
+        fp_x, (out_x, out_s) = fp8_add_Ifp_Ifp_Ofp_Og16(re_x, fc2_x, mul_x.dtype, group_size)
+        # ==================== save for backward ====================
+        ctx.save_for_backward(in_x, in_s, ln_x_t, ln_s, gate_x, gate_s, up_x, up_s, silu_x, silu_s, mul_x_t, mul_s)
+        ctx.weight = (weight1_t, weight1_s, weight2_t, weight2_s)
+        if (
+            qargs.weight_memory_efficient
+        ):  # Weight_1/2_origin will not be saved twice, so it will be more memory efficient.
+            assert weight1_t is None
+            ctx.weight = (weight1_origin, weight1_s, weight2_origin, weight2_s)
+        else:  # Weight1/2_t is different from the origin weight, so saving it will consumes additional memory footprint.
+            ctx.weight = (weight1_t, weight1_s, weight2_t, weight2_s)
+        ctx.group_size = group_size
+        ctx.ln_utils = ln_utils
+        ctx.utils = fwobits, layer_id, config, qargs
+        out_x = out_x.view(torch.float8_e4m3fn)
+        return fp_x, out_x, out_s
+    @staticmethod
+    def backward(ctx, fp_grad, out_g, out_gs):
+        fwobits, layer_id, config, qargs = ctx.utils
+        in_x, in_s, ln_x_t, ln_s, gate_x, gate_s, up_x, up_s, silu_x, silu_s, mul_x_t, mul_s = ctx.saved_tensors
+        (weight1_t, weight1_s, weight2_t, weight2_s) = ctx.weight
+        group_size = ctx.group_size
+        mean, rstd, num_warps = ctx.ln_utils
+        fwobits, layer_id, config, qargs = ctx.utils
+        # For autograd
+        if fwobits["babit"] == "E5M2":
+            # out_g = out_g.to(torch.float8_e5m2)
+            out_g = out_g.view(torch.float8_e5m2)
+        else:
+            raise ValueError("babit should be E5M2")
+        out_gs_max = out_gs.max()
+        # ==================== Begin backward ====================
+        # Output Projection
+        out_gs = out_gs.max()
+        out_g_t = fp8_transpose(out_g, transpose_output_2d=True)
+        if qargs.weight_memory_efficient:
+            weight2_t, weight2_s = fp8_division_transpose(
+                weight2_t, qargs.group_size, fwobits["fwbit"], weight2_s, only_transposed=True
+            )
+        fc2_g, weight2_grad = fp8_linear_backward(
+            mul_x_t, mul_s, out_g, out_gs_max, out_g_t, weight2_t, weight2_s, group_size
+        )
+        # [MEM TEST]
+        del out_g, out_g_t, weight2_t
+        # Element-wise Multiplication, 1 means gate, 2 means up
+        mul_g1, (mul_g2, mul_gs2) = fp8_mul_backward(silu_x, silu_s, up_x, up_s, fc2_g, group_size, fwobits["babit"])
+        # Silu activation
+        silu_g, silu_gs = fp8_silu_backward(gate_x, gate_s, mul_g1, group_size, fwobits["babit"])
+        # Prepare the input of Linear Layer. NOTE: Becareful of the order
+        gateup_g = torch.cat([mul_g2, silu_g], dim=-1)
+        gateup_gs = torch.cat([mul_gs2, silu_gs])
+        gateup_gs = torch.max(gateup_gs)
+        gateup_g, gateup_gs, gateup_g_t = fp8_division_transpose(
+            gateup_g, group_size, fwobits["babit"], gateup_gs, stochastic=False
+        )
+        # Linear Layer of Up and Gate Projection
+        if qargs.weight_memory_efficient:
+            weight1_t, weight1_s = fp8_division_transpose(
+                weight1_t, group_size, fwobits["fwbit"], weight1_s, only_transposed=True
+            )
+        fc1_g, weight1_grad = fp8_linear_backward(
+            ln_x_t, ln_s, gateup_g, gateup_gs, gateup_g_t, weight1_t, weight1_s, group_size
+        )
+        # layerNorm
+        in_g = fp8_layernorm_noparam_backward(in_x, in_s, fc1_g, group_size, mean, rstd, num_warps)
+        # Add the gradient together
+        re_g, (in_g, in_sg, in_sg_g16) = fp8_add_Ifp_Ifp_Ofp_Opt(
+            fp_grad, in_g, group_size, fwobits["babit"], stochastic=False
+        )
+        in_g = in_g.view(torch.float8_e4m3fn)
+        return (
+            re_g,
+            in_g,
+            in_sg_g16,
+            weight1_grad,
+            None,
+            None,
+            None,
+            weight2_grad,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+class CoatOLMoBlock(nn.Module):
+    """
+    A base class for transformer block implementations.
+    """
+    def __init__(self, layer_id: int, config: ModelConfig, qargs: QuantActivationConfig, cache: BufferCache):
+        """
+        Store the configuration and build the sub-modules shared by block implementations:
+        dropout, optional q/k layer norms, the activation, optional rotary embeddings and, when
+        `qargs.use_quantize_model` is false, the `attn_out` and `ff_out` projections.
+        """
+        super().__init__()
+        self.layer_id = layer_id
+        self.config = config
+        self.qargs = qargs
+        # Explicit `mlp_hidden_size` wins; otherwise the width is derived from `mlp_ratio`.
+        self.hidden_size = (
+            config.mlp_hidden_size if config.mlp_hidden_size is not None else config.mlp_ratio * config.d_model
+        )
+        # Double-underscore attribute: name-mangled by Python to this class.
+        self.__cache = cache
+        assert config.d_model % config.n_heads == 0
+        # Set by `set_activation_checkpointing`.
+        self._activation_checkpoint_fn: Callable | None = None
+        # Dropout.
+        self.dropout = Dropout(config.residual_dropout)
+        # Layer norms.
+        self.k_norm: LayerNormBase | None = None
+        self.q_norm: LayerNormBase | None = None
+        if config.attention_layer_norm:
+            assert config.effective_n_kv_heads is not None
+            self.k_norm = LayerNormBase.build(
+                config,
+                size=(config.d_model // config.n_heads) * config.effective_n_kv_heads,
+                elementwise_affine=config.attention_layer_norm_with_affine,
+            )
+            self.q_norm = LayerNormBase.build(config, elementwise_affine=config.attention_layer_norm_with_affine)
+        # Make sure QKV clip coefficient is positive, otherwise it's not well-defined.
+        if config.clip_qkv is not None:
+            assert config.clip_qkv > 0
+        # Activation function.
+        self.act = Activation.build(config)
+        assert (self.act.output_multiplier * self.hidden_size) % 1 == 0
+        # These projections are only created here when the quantized model path is disabled.
+        # NOTE(review): presumably the quantized path gets them from the Coat*Residual modules; confirm in subclasses.
+        if not self.qargs.use_quantize_model:
+            # Attention output projection.
+            self.attn_out = nn.Linear(
+                config.d_model, config.d_model, bias=config.include_bias, device=config.init_device
+            )
+            # Feed-forward output projection.
+            self.ff_out = nn.Linear(
+                int(self.act.output_multiplier * self.hidden_size),
+                config.d_model,
+                bias=config.include_bias,
+                device=config.init_device,
+            )
+            self.ff_out._is_residual = True  # type: ignore
+        # Rotary embeddings.
+        if self.config.rope:
+            self.rotary_emb = RotaryEmbedding(config, self.__cache)
+        # Both stay None unless flash attention is requested and the package can be imported.
+        self.flash_attn_func = None
+        self.flash_attn_varlen_func = None
+        if config.flash_attention:
+            try:
+                from flash_attn import flash_attn_func, flash_attn_varlen_func  # type: ignore
+                self.flash_attn_func = flash_attn_func
+                self.flash_attn_varlen_func = flash_attn_varlen_func
+            except ModuleNotFoundError:
+                # flash-attn is not installed: keep both attributes as None without raising.
+                pass
+    def reset_parameters(self):
+        if self.k_norm is not None:
+            self.k_norm.reset_parameters()
+        if self.q_norm is not None:
+            self.q_norm.reset_parameters()
+        if not self.qargs.use_quantize_model:
+            if self.config.init_fn == InitFnType.normal:
+                attn_out_std = ff_out_std = self.config.init_std
+                cutoff_factor = self.config.init_cutoff_factor
+            elif self.config.init_fn == InitFnType.mitchell:
+                attn_out_std = 1 / (math.sqrt(2 * self.config.d_model * (self.layer_id + 1)))
+                ff_out_std = 1 / (math.sqrt(2 * self.ff_out.in_features * (self.layer_id + 1)))
+                cutoff_factor = self.config.init_cutoff_factor or 3.0
+            elif self.config.init_fn == InitFnType.full_megatron:
+                attn_out_std = ff_out_std = self.config.init_std / math.sqrt(2.0 * self.config.n_layers)
+                cutoff_factor = self.config.init_cutoff_factor or 3.0
+            else:
+                raise NotImplementedError(self.config.init_fn)
+            init_normal(self.attn_out, std=attn_out_std, init_cutoff_factor=cutoff_factor)
+            init_normal(self.ff_out, std=ff_out_std, init_cutoff_factor=cutoff_factor)
+    def set_activation_checkpointing(
+        self, strategy: ActivationCheckpointingStrategy | None, checkpoint_func: Callable | None = None
+    ):
+        if strategy == ActivationCheckpointingStrategy.fine_grained:
+            self._activation_checkpoint_fn = checkpoint_func or activation_checkpoint_function(self.config)
+        else:
+            self._activation_checkpoint_fn = None
+    @classmethod
+    def _cast_attn_bias(cls, bias: torch.Tensor, input_dtype: torch.dtype) -> torch.Tensor:
+        target_dtype = input_dtype
+        # NOTE: `is_autocast_enabled()` only checks for CUDA autocast, so we use the separate function
+        # `is_autocast_cpu_enabled()` for CPU autocast.
+        # See https://github.com/pytorch/pytorch/issues/110966.
+        if bias.device.type == "cuda" and torch.is_autocast_enabled():
+            target_dtype = torch.get_autocast_gpu_dtype()
+        elif bias.device.type == "cpu" and torch.is_autocast_cpu_enabled():
+            target_dtype = torch.get_autocast_cpu_dtype()
+        if bias.dtype != target_dtype:
+            bias = bias.to(target_dtype)
+            ensure_finite_(bias, check_neg_inf=True, check_pos_inf=False)
+        return bias
+    def _scaled_dot_product_attention(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        attn_mask: torch.Tensor | None = None,
+        dropout_p: float = 0.0,
+        is_causal: bool = False,
+        max_doc_len: int | None = None,
+        cu_doc_lens: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """
+        Computes scaled dot product attention on query, key and value tensors, using an optional
+        attention mask if passed, and applying dropout if a probability greater than 0.0 is specified.
+        """
+        if max_doc_len is not None and cu_doc_lens is not None:
+            assert self.flash_attn_varlen_func is not None, "flash-attn is required for document masking"
+            assert attn_mask is None, "attn-mask is currently not supported with document masking"
+            B, T, D = q.size(0), q.size(2), q.size(3)
+            r = self.flash_attn_varlen_func(
+                q.transpose(1, 2).view(B * T, -1, D),
+                k.transpose(1, 2).view(B * T, -1, D),
+                v.transpose(1, 2).view(B * T, -1, D),
+                cu_doc_lens,
+                cu_doc_lens,
+                max_doc_len,
+                max_doc_len,
+                dropout_p=dropout_p,
+                causal=is_causal,
+            )
+            return r.view(B, T, -1, D).transpose(1, 2)
+        elif self.flash_attn_func is not None and attn_mask is None:
+            r = self.flash_attn_func(
+                q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), dropout_p=dropout_p, causal=is_causal
+            )
+            return r.transpose(1, 2)
+        else:
+            # torch's sdpa doesn't support GQA, so we're doing this
+            assert k.size(1) == v.size(1)
+            num_kv_heads = k.size(1)
+            num_q_heads = q.size(1)
+            if num_q_heads != num_kv_heads:
+                assert num_q_heads % num_kv_heads == 0
+                k = k.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
+                v = v.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
+            return F.scaled_dot_product_attention(
+                q,
+                k,
+                v,
+                attn_mask=attn_mask,
+                dropout_p=dropout_p,
+                is_causal=is_causal,
+            )
+    def attention(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        attention_bias: torch.Tensor | None = None,
+        layer_past: tuple[torch.Tensor, torch.Tensor] | None = None,
+        use_cache: bool = False,
+        max_doc_len: int | None = None,
+        cu_doc_lens: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor] | None]:
+        """
+        Run multi-head attention on already-projected queries, keys and values.
+        The output projection is NOT applied here; callers apply it to the returned tensor.
+        :param q: Queries; unpacked as `(B, T, C)`.
+        :param k: Keys; reshaped into `config.effective_n_kv_heads` heads of size `C // n_heads`.
+        :param v: Values; reshaped the same way as `k`.
+        :param attention_bias: Optional bias forwarded as the attention mask. When `None` the
+            attention is run with `is_causal=True`.
+        :param layer_past: Optional cached `(key, value)` pair that is concatenated in front of
+            the new keys/values along the sequence axis.
+        :param use_cache: If `True`, the (concatenated) keys and values are returned as well.
+        :param max_doc_len: Forwarded to `_scaled_dot_product_attention` for document masking.
+        :param cu_doc_lens: Forwarded to `_scaled_dot_product_attention` for document masking.
+        :returns: `(att, present)` where `att` has shape `(B, T, C)` and `present` is the
+            `(k, v)` pair when `use_cache` is set, else `None`.
+        """
+        B, T, C = q.size()  # batch size, sequence length, d_model
+        # Remember the incoming dtype so the optional norms below cannot change it.
+        dtype = k.dtype
+        # Optionally apply layer norm to keys and queries.
+        if self.q_norm is not None and self.k_norm is not None:
+            q = self.q_norm(q).to(dtype=dtype)
+            k = self.k_norm(k).to(dtype=dtype)
+        # Move head forward to be next to the batch dim.
+        # shape: (B, nh, T, hs)
+        q = q.view(B, T, self.config.n_heads, C // self.config.n_heads).transpose(1, 2)
+        # shape: (B, n_kv_h, T, hs)
+        k = k.view(B, T, self.config.effective_n_kv_heads, C // self.config.n_heads).transpose(1, 2)
+        # shape: (B, n_kv_h, T, hs)
+        v = v.view(B, T, self.config.effective_n_kv_heads, C // self.config.n_heads).transpose(1, 2)
+        if layer_past is not None:
+            # Prepend the cached keys/values along the sequence axis.
+            past_key, past_value = layer_past
+            k = torch.cat((past_key, k), dim=-2)
+            v = torch.cat((past_value, v), dim=-2)
+        # The cache entry is captured here, i.e. before rotary embeddings are applied below.
+        present = (k, v) if use_cache else None
+        query_len, key_len = q.shape[-2], k.shape[-2]  # could be different if layer_past not None
+        if self.config.rope:
+            # Apply rotary embeddings.
+            q, k = self.rotary_emb(q, k)
+        if attention_bias is not None:
+            # Resize and cast attention bias.
+            # The current dtype of the attention bias might not match the dtype that the SDP attn function will
+            # run in if AMP is enabled, and this can be a problem if some tokens are masked out due to padding
+            # as down-casting the attention bias to the autocast precision will result in -infs, which will
+            # cause the SDP attn function to produce NaNs.
+            # The slice keeps the last `query_len` rows and the first `key_len` columns.
+            attention_bias = self._cast_attn_bias(attention_bias[:, :, key_len - query_len : key_len, :key_len], dtype)
+        # Get the attention scores.
+        # shape: (B, nh, T, hs)
+        att = self._scaled_dot_product_attention(
+            q,
+            k,
+            v,
+            attn_mask=attention_bias,
+            # Attention dropout is only active in training mode.
+            dropout_p=0.0 if not self.training else self.config.attention_dropout,
+            # Without an explicit bias the backend is asked for causal masking.
+            is_causal=attention_bias is None,
+            max_doc_len=max_doc_len,
+            cu_doc_lens=cu_doc_lens,
+        )
+        # Re-assemble all head outputs side-by-side.
+        att = att.transpose(1, 2).contiguous().view(B, T, C)
+        # Apply output projection. NOTE: We move the attn output outside of this attention function
+        return att, present
+    @abstractmethod
+    def forward(
+        self,
+        x: torch.Tensor,
+        attention_bias: torch.FloatTensor | None = None,
+        layer_past: tuple[torch.Tensor, torch.Tensor] | None = None,
+        use_cache: bool = False,
+        max_doc_len: int | None = None,
+        cu_doc_lens: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor] | None]:
+        """
+        Abstract block forward pass; concrete block types must override it.
+        NOTE(review): the subclasses defined further down in this file declare two extra
+        positional parameters (`qx`, `sx`) after `x`, and `CoatOLMoSequentialBlock` returns four
+        values, so this signature does not match those overrides -- confirm which is intended.
+        """
+        raise NotImplementedError
+    @classmethod
+    def build(cls, layer_id: int, config: ModelConfig, qargs: QuantActivationConfig, cache: BufferCache) -> OLMoBlock:
+        if config.block_type == BlockType.sequential:
+            return CoatOLMoSequentialBlock(layer_id, config, qargs, cache)
+        elif config.block_type == BlockType.llama:
+            return CoatOLMoLlamaBlock(layer_id, config, qargs, cache)
+        else:
+            raise NotImplementedError(f"Unknown block type: '{config.block_type}'")
+class CoatOLMoSequentialBlock(CoatOLMoBlock):
+    """
+    This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
+    (plus another skip connection). To compute it as ``LN(MLP(x + LN(Attention(x))))``,
+    use the flag `norm_after`.
+    """
+    def __init__(self, layer_id: int, config: ModelConfig, qargs: QuantActivationConfig, cache: BufferCache):
+        super().__init__(layer_id, config, qargs, cache)
+        # Attention input projection. Projects x -> (q, k, v)
+        assert not self.config.norm_after, "COAT currently does not support PostNorm"
+        head_dim = config.d_model // config.n_heads
+        self.fused_dims = (
+            config.d_model,
+            config.effective_n_kv_heads * head_dim,
+            config.effective_n_kv_heads * head_dim,
+        )
+        if self.qargs.use_quantize_model:
+            self.BeforeAttention = CoatOLMoBeforeAttentionResidual(config, qargs, self.layer_id, self.fused_dims)
+            self.AfterAttention = CoatOLMoAfterAttentionResidual(config, qargs, self.layer_id)
+            self.MLPResidual = CoatOLMoMLPResidual(config, qargs, self.layer_id, self.hidden_size)
+        else:
+            self.att_proj = nn.Linear(
+                config.d_model, sum(self.fused_dims), bias=config.include_bias, device=config.init_device
+            )
+            # Feed-forward input projection.
+            self.ff_proj = nn.Linear(
+                config.d_model, self.hidden_size, bias=config.include_bias, device=config.init_device
+            )
+        # Layer norms.
+        self.attn_norm = LayerNorm.build(config, size=config.d_model)
+        self.ff_norm = LayerNorm.build(config, size=config.d_model)
+    def reset_parameters(self):
+        super().reset_parameters()
+        self.attn_norm.reset_parameters()
+        self.ff_norm.reset_parameters()
+        # NOTE: the standard deviation for these weights does not depend on the layer.
+        if self.qargs.use_quantize_model:  # The initialization appears here, not in CoatOLMoBlock's reset_parameters
+            if self.config.init_fn == InitFnType.normal:
+                attn_out_std = ff_out_std = self.config.init_std
+                cutoff_factor = self.config.init_cutoff_factor
+            elif self.config.init_fn == InitFnType.mitchell:
+                attn_out_std = 1 / (math.sqrt(2 * self.config.d_model * (self.layer_id + 1)))
+                ff_out_std = 1 / (math.sqrt(2 * self.MLPResidual.ff_out.in_features * (self.layer_id + 1)))
+                cutoff_factor = self.config.init_cutoff_factor or 3.0
+            elif self.config.init_fn == InitFnType.full_megatron:
+                attn_out_std = ff_out_std = self.config.init_std / math.sqrt(2.0 * self.config.n_layers)
+                cutoff_factor = self.config.init_cutoff_factor or 3.0
+            else:
+                raise NotImplementedError(self.config.init_fn)
+            init_normal(self.AfterAttention.attn_out, std=attn_out_std, init_cutoff_factor=cutoff_factor)
+            init_normal(self.MLPResidual.ff_out, std=ff_out_std, init_cutoff_factor=cutoff_factor)
+        if self.config.init_fn == InitFnType.normal:
+            std = self.config.init_std
+            cutoff_factor = self.config.init_cutoff_factor
+        elif self.config.init_fn == InitFnType.mitchell:
+            std = 1 / math.sqrt(self.config.d_model)
+            cutoff_factor = self.config.init_cutoff_factor or 3.0
+        elif self.config.init_fn == InitFnType.full_megatron:
+            std = self.config.init_std
+            cutoff_factor = self.config.init_cutoff_factor or 3.0
+        else:
+            raise NotImplementedError(self.config.init_fn)
+        if not self.qargs.use_quantize_model:
+            init_normal(self.att_proj, std, cutoff_factor)
+            init_normal(self.ff_proj, std, cutoff_factor)
+        else:
+            init_normal(self.BeforeAttention.att_proj, std, cutoff_factor)
+            init_normal(self.MLPResidual.ff_proj, std, cutoff_factor)
+    def forward(
+        self,
+        x: torch.Tensor,
+        qx: torch.Tensor,
+        sx: torch.Tensor,
+        attention_bias: torch.Tensor | None = None,
+        layer_past: tuple[torch.Tensor, torch.Tensor] | None = None,
+        use_cache: bool = False,
+        max_doc_len: int | None = None,
+        cu_doc_lens: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor] | None]:
+        # Get query, key, value projections.
+        # shape:
+        #  - for regular attn q, k, v: (batch_size, seq_len, d_model)
+        #  - for multi-query attn q: (batch_size, seq_len, d_model)
+        #                      k, v: (batch_size, seq_len, d_model // n_heads)
+        #  - for group query attn q: (batch_size, seq_len, d_model)
+        #                      k, v: (batch_size, seq_len, d_model // n_kv_heads)
+        # import IPython
+        # IPython.embed()
+        if self.qargs.use_quantize_model:
+            # if False:
+            x, qkv = self.BeforeAttention(x, qx, sx)
+        else:
+            # apply norm before
+            h = self.attn_norm(x)
+            qkv = self.BeforeAttention.att_proj(h)
+        if self.config.clip_qkv is not None:
+            qkv.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
+        q, k, v = qkv.split(self.fused_dims, dim=-1)
+        # Get attention scores.
+        att, cache = self.attention(
+            q,
+            k,
+            v,
+            attention_bias,
+            layer_past=layer_past,
+            use_cache=use_cache,
+            max_doc_len=max_doc_len,
+            cu_doc_lens=cu_doc_lens,
+        )
+        # import IPython
+        # IPython.embed()
+        if self.qargs.use_quantize_model:
+            # if False:
+            x, qx, sx = self.AfterAttention(x, att)
+        else:
+            att = self.AfterAttention.attn_out(att)
+            # Add attention scores.
+            # shape: (B, T, C)
+            x = x + self.dropout(att)
+        if self.qargs.use_quantize_model:
+            # if False:
+            x, qx, sx = self.MLPResidual(x, qx, sx)
+        else:
+            # Add feed-forward projection.
+            # shape: (batch_size, seq_len, d_model)
+            og_x = x
+            x = self.ff_norm(x)
+            x = self.MLPResidual.ff_proj(x)
+            if self._activation_checkpoint_fn is not None:
+                x = self._activation_checkpoint_fn(self.act, x)  # type: ignore
+            else:
+                x = self.act(x)
+            x = self.MLPResidual.ff_out(x)
+            x = self.dropout(x)
+            x = og_x + x
+        # import IPython
+        # IPython.embed()
+        return x, qx, sx, cache
+class CoatOLMoLlamaBlock(OLMoBlock):
+    """
+    This is a transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
+    (plus another skip connection). This block is similar to `OLMoSequentialBlock`
+    but some operations have slightly different implementations to imitate the
+    behavior of Llama.
+    NOTE(review): unlike `CoatOLMoSequentialBlock`, this class derives from `OLMoBlock` rather
+    than `CoatOLMoBlock`, does not use `qx` / `sx`, and its `forward` returns two values while the
+    block loop in `CoatOLMo.forward` unpacks four -- confirm whether this block type is actually
+    supported with COAT.
+    """
+    def __init__(self, layer_id: int, config: ModelConfig, qargs: QuantActivationConfig, cache: BufferCache):
+        """
+        :param layer_id: Index of this layer.
+        :param config: Model configuration.
+        :param qargs: Quantization settings; only forwarded to the parent constructor here.
+        :param cache: Buffer cache used to look up the causal attention bias.
+        """
+        # NOTE(review): `qargs` is forwarded to `OLMoBlock.__init__`; verify that the parent
+        # constructor accepts this extra positional argument.
+        super().__init__(layer_id, config, qargs, cache)
+        # Layer norms.
+        self.attn_norm = LayerNorm.build(config)
+        self.ff_norm = LayerNorm.build(config)
+        self.__cache = cache
+        # Attention input projection. Projects x -> (q, k, v)
+        if config.multi_query_attention:
+            # A single key/value head: k and v are only one head wide.
+            q_proj_out_dim = config.d_model
+            k_proj_out_dim = config.d_model // config.n_heads
+            v_proj_out_dim = config.d_model // config.n_heads
+        else:
+            q_proj_out_dim = config.d_model
+            k_proj_out_dim = config.d_model
+            v_proj_out_dim = config.d_model
+        self.q_proj = nn.Linear(config.d_model, q_proj_out_dim, bias=config.include_bias, device=config.init_device)
+        self.k_proj = nn.Linear(config.d_model, k_proj_out_dim, bias=config.include_bias, device=config.init_device)
+        self.v_proj = nn.Linear(config.d_model, v_proj_out_dim, bias=config.include_bias, device=config.init_device)
+        # Feed-forward input projection.
+        self.ff_proj = nn.Linear(config.d_model, self.hidden_size, bias=config.include_bias, device=config.init_device)
+    def reset_parameters(self):
+        """
+        Re-initialise the norms and the q/k/v/feed-forward input projections according to
+        ``config.init_fn``; raises `NotImplementedError` for an unsupported ``init_fn``.
+        """
+        super().reset_parameters()
+        self.attn_norm.reset_parameters()
+        self.ff_norm.reset_parameters()
+        # NOTE: the standard deviation for these weights does not depend on the layer.
+        if self.config.init_fn == InitFnType.normal:
+            std = self.config.init_std
+            cutoff_factor = self.config.init_cutoff_factor
+        elif self.config.init_fn == InitFnType.mitchell:
+            std = 1 / math.sqrt(self.config.d_model)
+            cutoff_factor = self.config.init_cutoff_factor or 3.0
+        elif self.config.init_fn == InitFnType.full_megatron:
+            std = self.config.init_std
+            cutoff_factor = self.config.init_cutoff_factor or 3.0
+        else:
+            raise NotImplementedError(self.config.init_fn)
+        init_normal(self.q_proj, std, cutoff_factor)
+        init_normal(self.k_proj, std, cutoff_factor)
+        init_normal(self.v_proj, std, cutoff_factor)
+        init_normal(self.ff_proj, std, cutoff_factor)
+    def _scaled_dot_product_attention(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        attn_mask: torch.Tensor | None = None,
+        dropout_p: float = 0.0,
+        is_causal: bool = False,
+        max_doc_len: int | None = None,
+        cu_doc_lens: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """
+        Explicit attention: softmax of ``q @ k^T / sqrt(head_size)`` plus a bias, followed by
+        dropout, multiplied with ``v``.
+        Document masking is not supported: passing either ``max_doc_len`` or ``cu_doc_lens``
+        raises `NotImplementedError`. ``attn_mask`` must be ``None`` when ``is_causal`` is set.
+        """
+        if max_doc_len is not None or cu_doc_lens is not None:
+            raise NotImplementedError(f"attention document masking is not implemented for {self.__class__.__name__}")
+        attn_weights = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(q.size(-1))
+        if is_causal:
+            assert attn_mask is None
+            query_len, key_len = q.shape[-2], k.shape[-2]  # could be different if layer_past not None
+            # Cached causal bias, cut down to the first `query_len` rows and `key_len` columns.
+            attn_bias = get_causal_attention_bias(self.__cache, key_len, q.device)[:, :, :query_len, :key_len]
+        elif attn_mask is not None:
+            attn_bias = attn_mask.to(q.dtype)
+        else:
+            attn_bias = torch.zeros_like(attn_weights)
+        attn_weights += attn_bias
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1).to(q.dtype)
+        # No `training` flag is passed to dropout here -- presumably the caller passes
+        # dropout_p=0.0 outside of training; verify against the caller.
+        attn_weights = nn.functional.dropout(attn_weights, p=dropout_p)
+        return torch.matmul(attn_weights, v)
+    def forward(
+        self,
+        x: torch.Tensor,
+        qx: torch.Tensor,
+        sx: torch.Tensor,
+        attention_bias: torch.Tensor | None = None,
+        layer_past: tuple[torch.Tensor, torch.Tensor] | None = None,
+        use_cache: bool = False,
+        max_doc_len: int | None = None,
+        cu_doc_lens: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor] | None]:
+        """
+        Apply attention and the feed-forward network, each with a residual connection.
+        :param x: Block input (the residual stream).
+        :param qx: Accepted but not used by this block.
+        :param sx: Accepted but not used by this block.
+        :param attention_bias: Optional attention bias, forwarded to `self.attention`.
+        :param layer_past: Optional cached `(key, value)` pair for this layer.
+        :param use_cache: If `True`, the key/value pair from `self.attention` is returned too.
+        :param max_doc_len: Forwarded to `self.attention`.
+        :param cu_doc_lens: Forwarded to `self.attention`.
+        :returns: `(x, cache)`.
+        """
+        # Get query, key, value projections.
+        # shape:
+        #  - for regular attn q, k, v: (batch_size, seq_len, d_model)
+        #  - for multi-query attn q: (batch_size, seq_len, d_model)
+        #                      k, v: (batch_size, seq_len, d_model // n_heads)
+        x_normed = self.attn_norm(x)
+        q = self.q_proj(x_normed)
+        k = self.k_proj(x_normed)
+        v = self.v_proj(x_normed)
+        if self.config.clip_qkv is not None:
+            # Clamp the projections in place to [-clip_qkv, clip_qkv].
+            q.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
+            k.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
+            v.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
+        # Get attention scores.
+        att, cache = self.attention(
+            q,
+            k,
+            v,
+            attention_bias,
+            layer_past=layer_past,
+            use_cache=use_cache,
+            max_doc_len=max_doc_len,
+            cu_doc_lens=cu_doc_lens,
+        )
+        att = self.attn_out(att)  # NOTE: we move the attn_out outside the self.attention module
+        # Add attention scores.
+        # shape: (B, T, C)
+        x = x + self.dropout(att)
+        # Add feed-forward projection.
+        # shape: (batch_size, seq_len, d_model)
+        og_x = x
+        # With fine-grained checkpointing enabled, the norm and the activation are each wrapped.
+        if self._activation_checkpoint_fn is not None:
+            x = self._activation_checkpoint_fn(self.ff_norm, x)  # type: ignore
+        else:
+            x = self.ff_norm(x)
+        x = self.ff_proj(x)
+        if self._activation_checkpoint_fn is not None:
+            x = self._activation_checkpoint_fn(self.act, x)  # type: ignore
+        else:
+            x = self.act(x)
+        x = self.ff_out(x)
+        x = self.dropout(x)
+        x = og_x + x
+        return x, cache
+class CoatOLMoBlockGroup(nn.ModuleList):
+    def __init__(self, config: ModelConfig, layer_offset: int, modules: Iterable[nn.Module] | None = None):
+        super().__init__(modules)
+        self.config = config
+        self.layer_offset = layer_offset
+        self.activation_checkpointing_strategy: ActivationCheckpointingStrategy | None = None
+        self._activation_checkpoint_fn = activation_checkpoint_function(self.config)
+    def forward(
+        self,
+        x: torch.Tensor,
+        attention_bias: torch.FloatTensor | None = None,
+        layers_past: list[tuple[torch.Tensor, torch.Tensor]] | None = None,
+        use_cache: bool = False,
+        max_doc_len: int | None = None,
+        cu_doc_lens: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, list[tuple[torch.Tensor, torch.Tensor]] | None]:
+        attn_key_values: list[tuple[torch.Tensor, torch.Tensor]] | None = [] if use_cache else None
+        for block_idx, block in enumerate(self):
+            layer_past = None if layers_past is None else layers_past[block_idx]
+            block_idx += self.layer_offset
+            if should_checkpoint_block(self.activation_checkpointing_strategy, block_idx):
+                # shape: (batch_size, seq_len, d_model)
+                x, cache = self._activation_checkpoint_fn(  # type: ignore
+                    block,
+                    x,
+                    attention_bias=attention_bias,
+                    layer_past=layer_past,
+                    use_cache=use_cache,
+                    max_doc_len=max_doc_len,
+                    cu_doc_lens=cu_doc_lens,
+                )
+            else:
+                # shape: (batch_size, seq_len, d_model)
+                x, cache = block(
+                    x,
+                    attention_bias=attention_bias,
+                    layer_past=layer_past,
+                    use_cache=use_cache,
+                    max_doc_len=max_doc_len,
+                    cu_doc_lens=cu_doc_lens,
+                )
+            if attn_key_values is not None:
+                assert cache is not None
+                attn_key_values.append(cache)
+        return x, attn_key_values
+    def reset_parameters(self):
+        for block in self:
+            block.reset_parameters()
+    def set_activation_checkpointing(
+        self, strategy: ActivationCheckpointingStrategy | None, checkpoint_func: Callable | None = None
+    ):
+        self.activation_checkpointing_strategy = strategy
+        for block in self:
+            block.set_activation_checkpointing(strategy, checkpoint_func=checkpoint_func)
+class CoatOLMo(nn.Module):
+    def __init__(self, config: ModelConfig, qargs: QuantActivationConfig, init_params: bool = True):
+        """
+        Validate the configuration and assemble the model's sub-modules.
+        :param config: Model configuration.
+        :param qargs: Quantization settings, passed to every block and to the quantize helpers
+            that are applied before and after the block stack.
+        :param init_params: If `True` (and `config.init_device` is not ``"meta"``), call
+            `reset_parameters()` once the modules are built.
+        :raises OLMoConfigurationError: For ALiBi combined with FlashAttention or RoPE, for an
+            embedding size smaller than the vocab size, or for a block group size that does not
+            evenly divide the number of layers.
+        """
+        super().__init__()
+        self.config = config
+        self.qargs = qargs
+        # NOTE(review): double-underscore attributes are name-mangled per class. Methods borrowed
+        # from `OLMo` below this constructor would look up a differently mangled name if they
+        # touch a private cache -- confirm.
+        self.__cache = BufferCache()
+        # Validate config.
+        if self.config.alibi and self.config.flash_attention:
+            raise OLMoConfigurationError("ALiBi is currently not supported with FlashAttention")
+        if self.config.alibi and self.config.rope:
+            raise OLMoConfigurationError("ALiBi and RoPE are mutually exclusive")
+        if self.config.embedding_size is not None and self.config.embedding_size != self.config.vocab_size:
+            if self.config.embedding_size < self.config.vocab_size:
+                raise OLMoConfigurationError("embedding size should be at least as big as vocab size")
+            elif self.config.embedding_size % 128 != 0:
+                import warnings
+                warnings.warn(
+                    "Embedding size is not a multiple of 128! This could hurt throughput performance.", UserWarning
+                )
+        self.activation_checkpointing_strategy: ActivationCheckpointingStrategy | None = None
+        self._activation_checkpoint_fn: Callable = activation_checkpoint_function(self.config)
+        # The block group size must be positive, at most n_layers, and divide n_layers evenly.
+        if not (
+            0 < self.config.block_group_size <= self.config.n_layers
+            and self.config.n_layers % self.config.block_group_size == 0
+        ):
+            raise OLMoConfigurationError("n layers must be divisible by block group size")
+        # These two calls change process-wide torch backend settings, not just this model.
+        torch.backends.cuda.enable_flash_sdp(True)
+        torch.backends.cuda.enable_mem_efficient_sdp(False)  # this is super slow so make sure torch won't use it
+        # Token embedding, embedding dropout and final layer norm.
+        self.transformer = nn.ModuleDict(
+            dict(
+                wte=nn.Embedding(config.embedding_size or config.vocab_size, config.d_model, device=config.init_device),
+                emb_drop=Dropout(config.embedding_dropout),
+                ln_f=LayerNorm.build(config),
+            )
+        )
+        # One block per layer; all blocks share this model's buffer cache.
+        blocks = [CoatOLMoBlock.build(i, config, qargs, self.__cache) for i in range(config.n_layers)]
+        if self.config.block_group_size > 1:
+            # Register the blocks as groups of `block_group_size` consecutive layers.
+            block_groups = [
+                CoatOLMoBlockGroup(config, i, blocks[i : i + config.block_group_size])
+                for i in range(0, config.n_layers, config.block_group_size)
+            ]
+            self.transformer.update({"block_groups": nn.ModuleList(block_groups)})
+        else:
+            self.transformer.update({"blocks": nn.ModuleList(blocks)})
+        # Learned positional embeddings are only created when neither ALiBi nor RoPE is used.
+        if not (self.config.alibi or self.config.rope):
+            self.transformer.update(
+                {"wpe": nn.Embedding(config.max_sequence_length, config.d_model, device=config.init_device)}
+            )
+        # A separate output projection is only created when weight tying is disabled.
+        if not config.weight_tying:
+            self.transformer.update(
+                {
+                    "ff_out": nn.Linear(
+                        config.d_model,
+                        config.embedding_size or config.vocab_size,
+                        bias=config.include_bias,
+                        device=config.init_device,
+                    )
+                }
+            )
+        if config.embedding_layer_norm:
+            self.transformer.update({"emb_norm": LayerNorm.build(config)})
+        # When `init_device="meta"` FSDP will call `reset_parameters()` to initialize weights.
+        if init_params and self.config.init_device != "meta":
+            self.reset_parameters()
+        self.__num_fwd_flops: int | None = None
+        self.__num_bck_flops: int | None = None
+        # Warm up cache.
+        if self.config.alibi:
+            get_causal_attention_bias(self.__cache, config.max_sequence_length, _non_meta_init_device(config))
+            self.get_alibi_attention_bias(config.max_sequence_length, _non_meta_init_device(config))
+        # Quantize
+        # Helpers applied to the hidden state before the first block and after the last one.
+        self.quantize_input_before_block = Coat_quantize_bgn(qargs)
+        self.quantize_output_after_block = Coat_quantize_end(qargs)
+    set_activation_checkpointing = OLMo.set_activation_checkpointing
+    device = OLMo.device
+    reset_parameters = OLMo.reset_parameters
+    get_alibi_attention_bias = OLMo.get_alibi_attention_bias
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        input_embeddings: torch.FloatTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        attention_bias: torch.Tensor | None = None,
+        past_key_values: Sequence[tuple[torch.Tensor, torch.Tensor]] | None = None,
+        use_cache: bool = False,
+        last_logits_only: bool = False,
+        output_hidden_states: bool | None = None,
+        doc_lens: torch.Tensor | None = None,
+        max_doc_lens: Sequence[int] | None = None,
+    ) -> OLMoOutput:
+        """
+        :param input_ids: A tensor of shape `(batch_size, seq_len)`.
+        :param input_embeddings: A tensor of shape `(batch_size, seq_len, d_model)` with input
+            embeddings. When provided, it is treated as the output of the input embedding layer.
+        :param attention_mask: A tensor of shape `(batch_size, seq_len)` that indicates
+            which input IDs are masked. A `1` value in the mask means that
+            the corresponding input ID should *not* be ignored. A `0` means
+            that the corresponding input ID is masked.
+            This has the same meaning as the `attention_mask` in HuggingFace's `transformers`
+            library.
+        :param attention_bias: A tensor of shape `(batch_size, 1, seq_len, seq_len)`,
+            `(1, 1, seq_len, seq_len)`, or `(seq_len, seq_len)`. This is used
+            to introduce causal or other biases.
+            If the tensor is a bool or byte tensor, a `True` or `1` at `attention_bias[:, :, i, j]`
+            indicates that the i-th element in the sequence is allowed to attend to the j-th
+            element in the sequence.
+            If the tensor is a float tensor, it will just be added to the attention
+            scores before the softmax.
+            The default is causal, which corresponds to a lower-diagonal byte matrix of ones.
+        :param past_key_values: Pre-computed keys and values for each attention block.
+            Can be used to speed up sequential decoding. The `input_ids` which have
+            their past given to this model should not be passed as `input_ids` as they have already been computed.
+        :param use_cache: If `True`, return key and value tensors for each block.
+        :param last_logits_only: If `True`, only compute the logits for the last token of each sequence.
+            This can speed up decoding when you only care about the next token.
+        :param doc_lens: Document lengths to use in attention for intra-document masking.
+            Shape `(batch_size, max_docs)`.
+        :param max_doc_lens: Maximum document length for each instance in the batch.
+        """
+        output_hidden_states = output_hidden_states if output_hidden_states is not None else False
+        if past_key_values:
+            assert len(past_key_values) == self.config.n_layers
+        batch_size, seq_len = input_ids.size() if input_embeddings is None else input_embeddings.size()[:2]
+        if past_key_values is None:
+            past_length = 0
+        else:
+            past_length = past_key_values[0][0].size(-2)
+        max_doc_len: int | None = None
+        cu_doc_lens: torch.Tensor | None = None
+        if doc_lens is not None and max_doc_lens is not None:
+            max_doc_len = max(max_doc_lens)
+            cu_doc_lens = get_cumulative_document_lengths(doc_lens)
+        # Get embeddings of input.
+        # shape: (batch_size, seq_len, d_model)
+        x = self.transformer.wte(input_ids) if input_embeddings is None else input_embeddings  # type: ignore
+        # Apply embedding layer norm.
+        if self.config.embedding_layer_norm:
+            x = self.transformer.emb_norm(x)
+        if not (self.config.alibi or self.config.rope):
+            # Get positional embeddings.
+            # shape: (1, seq_len)
+            pos = torch.arange(past_length, past_length + seq_len, dtype=torch.long, device=x.device).unsqueeze(0)
+            # shape: (1, seq_len, d_model)
+            pos_emb = self.transformer.wpe(pos)  # type: ignore
+            x = pos_emb + x
+        # Apply dropout.
+        # shape: (batch_size, seq_len, d_model)
+        x = self.transformer.emb_drop(x)  # type: ignore
+        # Transform the attention mask into what the blocks expect.
+        if attention_mask is not None:
+            # shape: (batch_size, 1, 1, seq_len)
+            attention_mask = attention_mask.to(dtype=torch.float).view(batch_size, -1)[:, None, None, :]
+            attention_mask = (1.0 - attention_mask) * torch.finfo(attention_mask.dtype).min
+        # Merge attention mask with attention bias.
+        if (
+            attention_bias is not None
+            or attention_mask is not None
+            or self.config.alibi
+            # NOTE (epwalsh): we need to initialize the attn bias in order for attn to work properly
+            # with key+value cache. Otherwise `F.scaled_dot_product_attention()` doesn't seem to compute
+            # scores correctly.
+            or past_key_values is not None
+        ):
+            if attention_bias is None and self.config.alibi:
+                attention_bias = get_causal_attention_bias(
+                    self.__cache, past_length + seq_len, x.device
+                ) + self.get_alibi_attention_bias(past_length + seq_len, x.device)
+            elif attention_bias is None:
+                attention_bias = get_causal_attention_bias(self.__cache, past_length + seq_len, x.device)
+            elif attention_bias.dtype in (torch.int8, torch.bool):
+                attention_bias = attention_bias.to(dtype=torch.float)
+                attention_bias.masked_fill_(attention_bias == 0.0, torch.finfo(attention_bias.dtype).min)
+            # Transform to the right shape and data type.
+            mask_len = seq_len
+            if attention_mask is not None:
+                mask_len = attention_mask.shape[-1]
+            elif past_key_values is not None:
+                mask_len = past_key_values[0][0].shape[-2] + seq_len
+            attention_bias = attention_bias[:, :, :mask_len, :mask_len].to(dtype=torch.float)
+            # Add in the masking bias.
+            if attention_mask is not None:
+                attention_bias = attention_bias + attention_mask
+                # Might get -infs after adding attention mask, since dtype.min + dtype.min = -inf.
+                # `F.scaled_dot_product_attention()` doesn't handle -inf like you'd expect, instead
+                # it can produce NaNs.
+                ensure_finite_(attention_bias, check_neg_inf=True, check_pos_inf=False)
+        attn_key_values: list[tuple[torch.Tensor, torch.Tensor]] | None = [] if use_cache else None
+        # decoder layers
+        all_hidden_states = []
+        # Prepare the input for COAT decoderlayer
+        x, qx, sx = self.quantize_input_before_block(x)
+        # Apply blocks one-by-one.
+        if self.config.block_group_size == 1:
+            for block_idx, block in enumerate(self.transformer.blocks):
+                if output_hidden_states:
+                    # add hidden states
+                    all_hidden_states.append(x)
+                layer_past = None if past_key_values is None else past_key_values[block_idx]
+                if should_checkpoint_block(self.activation_checkpointing_strategy, block_idx):
+                    # shape: (batch_size, seq_len, d_model)
+                    x, qx, sx, cache = self._activation_checkpoint_fn(
+                        block,
+                        x,
+                        qx,
+                        sx,
+                        attention_bias=attention_bias,
+                        layer_past=layer_past,
+                        use_cache=use_cache,
+                        max_doc_len=max_doc_len,
+                        cu_doc_lens=cu_doc_lens,
+                    )
+                else:
+                    # shape: (batch_size, seq_len, d_model)
+                    x, qx, sx, cache = block(
+                        x,
+                        qx,
+                        sx,
+                        attention_bias=attention_bias,
+                        layer_past=layer_past,
+                        use_cache=use_cache,
+                        max_doc_len=max_doc_len,
+                        cu_doc_lens=cu_doc_lens,
+                    )
+                if attn_key_values is not None:
+                    assert cache is not None
+                    attn_key_values.append(cache)
+        else:
+            for group_idx, block_group in enumerate(self.transformer.block_groups):
+                if output_hidden_states:
+                    # add hidden states
+                    all_hidden_states.append(x)
+                layers_past = (
+                    None
+                    if past_key_values is None
+                    else past_key_values[
+                        group_idx * self.config.block_group_size : (group_idx + 1) * self.config.block_group_size
+                    ]
+                )
+                x, cache = block_group(
+                    x,
+                    attention_bias=attention_bias,
+                    layers_past=layers_past,
+                    use_cache=use_cache,
+                    max_doc_len=max_doc_len,
+                    cu_doc_lens=cu_doc_lens,
+                )
+                if attn_key_values is not None:
+                    assert cache is not None
+                    attn_key_values.extend(cache)
+        # Summarize the output of the Decoder Layer
+        x = self.quantize_output_after_block(x, qx, sx)
+        if last_logits_only:
+            # shape: (batch_size, 1, d_model)
+            x = x[:, -1, :].unsqueeze(1)
+        # Apply final layer norm.
+        # shape: (batch_size, seq_len or 1, d_model)
+        x = self.transformer.ln_f(x)  # type: ignore
+        if output_hidden_states:
+            # add final hidden state post-final-layernorm, following HuggingFace's convention
+            all_hidden_states.append(x)
+        # Get logits.
+        # shape: (batch_size, seq_len or 1, vocab_size)
+        if self.config.weight_tying:
+            logits = F.linear(x, self.transformer.wte.weight, None)  # type: ignore
+        else:
+            logits = self.transformer.ff_out(x)  # type: ignore
+        if self.config.scale_logits:
+            logits.mul_(1 / math.sqrt(self.config.d_model))
+        return OLMoOutput(
+            logits=logits,
+            attn_key_values=attn_key_values,
+            hidden_states=tuple(all_hidden_states) if output_hidden_states else None,
+        )
+    def get_fsdp_wrap_policy(self, wrap_strategy: FSDPWrapStrategy | None = None):
+        if wrap_strategy is None:
+            return None
+        # The 'recurse' mode for the wrap function does not behave like you'd expect.
+        # Even if we return False, it may still recurse because PyTorch does what it wants,
+        # not what you want. This causes issues when, for example, we want to wrap 'ff_out' (a linear layer)
+        # but not other linear layers within a block.
+        # So we have to explicitly tell PyTorch which linear layers to wrap, and we also just
+        # return True in 'recurse' mode for simplicity.
+        size_based_module_to_wrap = {self.transformer.wte}
+        if hasattr(self.transformer, "ff_out"):
+            size_based_module_to_wrap.add(self.transformer.ff_out)
+        if wrap_strategy == FSDPWrapStrategy.by_block:
+            def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
+                del nonwrapped_numel
+                wrap = isinstance(module, CoatOLMoBlock)
+                if recurse:
+                    return True
+                else:
+                    return wrap
+            return fsdp_wrap_fn
+        elif wrap_strategy == FSDPWrapStrategy.by_block_and_size:
+            def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
+                del nonwrapped_numel
+                wrap = isinstance(module, (CoatOLMoBlock,)) or module in size_based_module_to_wrap
+                if recurse:
+                    return True
+                else:
+                    return wrap
+            return fsdp_wrap_fn
+        elif wrap_strategy == FSDPWrapStrategy.by_block_group:
+            if self.config.block_group_size <= 1:
+                raise OLMoConfigurationError(
+                    "'by_block_group' FSDP wrapping strategy requires block group size greater than 1"
+                )
+            def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
+                del nonwrapped_numel
+                wrap = isinstance(module, CoatOLMoBlockGroup)
+                if recurse:
+                    return True
+                else:
+                    return wrap
+            return fsdp_wrap_fn
+        elif wrap_strategy == FSDPWrapStrategy.by_block_group_and_size:
+            if self.config.block_group_size <= 1:
+                raise OLMoConfigurationError(
+                    "'by_block_group_and_size' FSDP wrapping strategy requires block group size greater than 1"
+                )
+            def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
+                del nonwrapped_numel
+                wrap = isinstance(module, (CoatOLMoBlockGroup,)) or module in size_based_module_to_wrap
+                if recurse:
+                    return True
+                else:
+                    return wrap
+            return fsdp_wrap_fn
+        elif wrap_strategy == FSDPWrapStrategy.size_based:
+            from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy
+            return size_based_auto_wrap_policy
+        elif wrap_strategy in {
+            FSDPWrapStrategy.one_in_two,
+            FSDPWrapStrategy.one_in_three,
+            FSDPWrapStrategy.one_in_four,
+            FSDPWrapStrategy.one_in_five,
+        }:
+            c = {
+                FSDPWrapStrategy.one_in_two: 2,
+                FSDPWrapStrategy.one_in_three: 3,
+                FSDPWrapStrategy.one_in_four: 4,
+                FSDPWrapStrategy.one_in_five: 5,
+            }[wrap_strategy]
+            def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
+                del nonwrapped_numel
+                wrap = isinstance(module, CoatOLMoBlock) and module.layer_id % c == 0
+                if recurse:
+                    return True
+                else:
+                    return wrap
+            return fsdp_wrap_fn
+        else:
+            raise NotImplementedError(wrap_strategy)
+    # Reuse the parameter-counting implementation defined on OLMo as this class's `num_params`.
+    num_params = OLMo.num_params
+    @property
+    def num_fwd_flops(self):
+        if self.__num_fwd_flops:
+            return self.__num_fwd_flops
+        # embedding table is just a lookup in the forward pass
+        n_params = self.num_params(include_embedding=False)
+        # the number of parameters is approximately the number of multiply-accumulates (MAC) in the network
+        # each MAC has 2 FLOPs - we multiply by 2 ie 2 * n_param
+        # this gets us FLOPs / token
+        params_flops_per_token = 2 * n_params
+        # there are 2 FLOPS per mac; there is A=Q*K^T and out=A*V ops (ie mult by 2)
+        attn_flops_per_token = self.config.n_layers * 2 * 2 * (self.config.d_model * self.config.max_sequence_length)
+        self.__num_fwd_flops = params_flops_per_token + attn_flops_per_token
+        return self.__num_fwd_flops
+    @property
+    def num_bck_flops(self):
+        if self.__num_bck_flops:
+            return self.__num_bck_flops
+        n_params = self.num_params()
+        params_flops_per_token = 4 * n_params
+        attn_flops_per_token = self.config.n_layers * 8 * (self.config.d_model * self.config.max_sequence_length)
+        self.__num_bck_flops = params_flops_per_token + attn_flops_per_token
+        return self.__num_bck_flops
+    # Reuse the generation implementation defined on OLMo as this class's `generate`.
+    generate = OLMo.generate
+    @classmethod
+    def from_checkpoint(
+        cls, checkpoint_dir: PathOrStr, device: str = "cpu", checkpoint_type: CheckpointType | None = None
+    ) -> CoatOLMo:
+        """
+        Load an OLMo model from a checkpoint.
+        """
+        from olmo.util import resource_path
+        # Guess checkpoint type.
+        if checkpoint_type is None:
+            try:
+                if resource_path(checkpoint_dir, "model.pt").is_file():
+                    checkpoint_type = CheckpointType.unsharded
+                else:
+                    checkpoint_type = CheckpointType.sharded
+            except FileNotFoundError:
+                checkpoint_type = CheckpointType.sharded
+        # Load config.
+        config_path = resource_path(checkpoint_dir, "config.yaml")
+        model_config = ModelConfig.load(config_path, key="model", validate_paths=False)
+        if checkpoint_type == CheckpointType.unsharded:
+            # Initialize model (always on CPU to start with so we don't run out of GPU memory).
+            model_config.init_device = "cpu"
+            model = CoatOLMo(model_config)
+            # Load state dict directly to target device.
+            state_dict_path = resource_path(checkpoint_dir, "model.pt")
+            state_dict = torch.load(state_dict_path, map_location="cpu")
+            model.load_state_dict(model._make_state_dict_compatible(state_dict)[0])
+            model = model.to(torch.device(device))
+        else:
+            train_config = TrainConfig.load(config_path)
+            if train_config.sharded_checkpointer == ShardedCheckpointerType.olmo_core:
+                from olmo_core.distributed.checkpoint import load_model_and_optim_state  # type: ignore
+                model_config.init_device = device
+                model = CoatOLMo(model_config)
+                load_model_and_optim_state(checkpoint_dir, model)
+            else:
+                # train_config.sharded_checkpointer == ShardedCheckpointerType.torch_new
+                from olmo.checkpoint import load_model_state
+                # Initialize model on target device. In this case the state dict is loaded in-place
+                # so it's not necessary to start on CPU if the target device is a GPU.
+                model_config.init_device = device
+                model = CoatOLMo(model_config)
+                # Load state dict in place.
+                load_model_state(checkpoint_dir, model)
+        return model.eval()
+    def _make_state_dict_compatible(
+        self, state_dict: dict[str, torch.Tensor]
+    ) -> tuple[dict[str, torch.Tensor], dict[str, set[str]]]:
+        """
+        Handles some cases where the state dict is valid yet may need to be transformed in order to
+        be loaded.
+        This modifies the state dict in-place and also returns it, along with a mapping of original key
+        names to new key names in cases where the keys were simply renamed. That mapping can be used
+        to make a corresponding optimizer state dict compatible as well.
+        The transformations are applied in this order: strip FSDP prefixes, split legacy shared norm
+        parameters (sequential blocks only), move linear weights under their COAT wrapper modules
+        (``coat_real`` only), and finally regroup blocks to match ``config.block_group_size``.
+        """
+        import re
+        from fnmatch import fnmatch
+        # Maps every key currently in `state_dict` back to the key it had when it was passed in.
+        new_keys_to_og_keys: dict[str, str] = {}
+        # Remove "_fsdp_wrapped_module." prefix from all keys. We don't want this prefix when the model is
+        # not wrapped in FSDP. And when the model is wrapped in FSDP, loading this state dict will still work
+        # fine without the prefixes. This also simplifies the other steps below.
+        # Every loop below iterates over a snapshot of the keys because the dict is mutated inside it.
+        for key in list(state_dict.keys()):
+            state_dict[(new_key := key.replace("_fsdp_wrapped_module.", ""))] = state_dict.pop(key)
+            new_keys_to_og_keys[new_key] = key
+        # For backwards compatibility prior to fixing https://github.com/allenai/LLM/issues/222
+        if self.config.block_type == BlockType.sequential:
+            for key in list(state_dict.keys()):
+                # A single legacy `norm` parameter becomes both `attn_norm` and `ff_norm`; the second
+                # copy is cloned so the two entries do not share storage. Both new keys map back to
+                # the same original key.
+                if fnmatch(key, "transformer.*.norm.weight"):
+                    tensor = state_dict.pop(key)
+                    state_dict[(new_key := key.replace("norm.weight", "attn_norm.weight"))] = tensor
+                    new_keys_to_og_keys[new_key] = new_keys_to_og_keys[key]
+                    state_dict[(new_key := key.replace("norm.weight", "ff_norm.weight"))] = tensor.clone()
+                    new_keys_to_og_keys[new_key] = new_keys_to_og_keys[key]
+                    del new_keys_to_og_keys[key]
+                elif fnmatch(key, "transformer.*.norm.bias"):
+                    tensor = state_dict.pop(key)
+                    state_dict[(new_key := key.replace("norm.bias", "attn_norm.bias"))] = tensor
+                    new_keys_to_og_keys[new_key] = new_keys_to_og_keys[key]
+                    state_dict[(new_key := key.replace("norm.bias", "ff_norm.bias"))] = tensor.clone()
+                    new_keys_to_og_keys[new_key] = new_keys_to_og_keys[key]
+                    del new_keys_to_og_keys[key]
+        # Realquantization will change the place the linear layers happen
+        if self.qargs.use_quantize_model == "coat_real":
+            for key in list(state_dict.keys()):
+                # The fnmatch `*` also matches keys that already contain the wrapper-module name, so
+                # the `not in key` guards keep an already-converted key from being prefixed twice.
+                # NOTE(review): only `transformer.blocks.*` keys are renamed here; `block_groups` keys
+                # are left untouched by this step — confirm that is intended.
+                if fnmatch(key, "transformer.blocks.*.att_proj.weight") and "BeforeAttention" not in key:
+                    tensor = state_dict.pop(key)
+                    state_dict[(new_key := key.replace("att_proj.weight", "BeforeAttention.att_proj.weight"))] = tensor
+                    new_keys_to_og_keys[new_key] = new_keys_to_og_keys[key]
+                    del new_keys_to_og_keys[key]
+                elif fnmatch(key, "transformer.blocks.*.attn_out.weight") and "AfterAttention" not in key:
+                    tensor = state_dict.pop(key)
+                    state_dict[(new_key := key.replace("attn_out.weight", "AfterAttention.attn_out.weight"))] = tensor
+                    new_keys_to_og_keys[new_key] = new_keys_to_og_keys[key]
+                    del new_keys_to_og_keys[key]
+                elif fnmatch(key, "transformer.blocks.*.ff_proj.weight") and "MLPResidual" not in key:
+                    tensor = state_dict.pop(key)
+                    state_dict[(new_key := key.replace("ff_proj.weight", "MLPResidual.ff_proj.weight"))] = tensor
+                    new_keys_to_og_keys[new_key] = new_keys_to_og_keys[key]
+                    del new_keys_to_og_keys[key]
+                elif fnmatch(key, "transformer.blocks.*.ff_out.weight") and "MLPResidual" not in key:
+                    tensor = state_dict.pop(key)
+                    state_dict[(new_key := key.replace("ff_out.weight", "MLPResidual.ff_out.weight"))] = tensor
+                    new_keys_to_og_keys[new_key] = new_keys_to_og_keys[key]
+                    del new_keys_to_og_keys[key]
+        # For loading a state dict that was saved with a different `block_group_size`.
+        # The saved group size is inferred by counting the `attn_out.weight` entries found in group 0;
+        # a state dict without that group-0 key is treated as ungrouped (size 1).
+        if "transformer.block_groups.0.0.attn_out.weight" in state_dict.keys():
+            state_dict_block_group_size = len(
+                [k for k in state_dict.keys() if fnmatch(k, "transformer.block_groups.0.*.attn_out.weight")]
+            )
+        else:
+            state_dict_block_group_size = 1
+        if self.config.block_group_size != state_dict_block_group_size:
+            log.info(
+                f"Regrouping state dict blocks from group size {state_dict_block_group_size} to "
+                f"group size {self.config.block_group_size}"
+            )
+            # For simplicity we're first going to flatten out the block groups in the state dict (if necessary)
+            # and then (re-)group them into the right block sizes.
+            if state_dict_block_group_size > 1:
+                for key in list(state_dict.keys()):
+                    if (m := re.match(r"transformer.block_groups\.(\d+)\.(\d+)\..*", key)) is not None:
+                        group_idx, group_block_idx = int(m.group(1)), int(m.group(2))
+                        # Flat index of the block = position of its group times the saved group size,
+                        # plus its position inside that group.
+                        block_idx = (group_idx * state_dict_block_group_size) + group_block_idx
+                        state_dict[
+                            (
+                                new_key := key.replace(
+                                    f"block_groups.{group_idx}.{group_block_idx}.", f"blocks.{block_idx}."
+                                )
+                            )
+                        ] = state_dict.pop(key)
+                        new_keys_to_og_keys[new_key] = new_keys_to_og_keys.pop(key)
+            if self.config.block_group_size > 1:
+                # Group the state dict blocks into the right block size.
+                for key in list(state_dict.keys()):
+                    if (m := re.match(r"transformer.blocks\.(\d+)\..*", key)) is not None:
+                        block_idx = int(m.group(1))
+                        # Inverse of the flattening above, using the configured group size.
+                        group_idx, group_block_idx = (
+                            block_idx // self.config.block_group_size,
+                            block_idx % self.config.block_group_size,
+                        )
+                        state_dict[
+                            (
+                                new_key := key.replace(
+                                    f"blocks.{block_idx}.", f"block_groups.{group_idx}.{group_block_idx}."
+                                )
+                            )
+                        ] = state_dict.pop(key)
+                        new_keys_to_og_keys[new_key] = new_keys_to_og_keys.pop(key)
+        # Invert the bookkeeping: one original key may map to several new keys (e.g. a split norm).
+        og_keys_to_new: dict[str, set[str]] = defaultdict(set)
+        for new_key, og_key in new_keys_to_og_keys.items():
+            og_keys_to_new[og_key].add(new_key)
+        return state_dict, og_keys_to_new

llava/model/coat/activation/real_quantization/__init__.py ADDED Viewed

	@@ -0,0 +1,31 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Activation
+# Utils
+from ._dequantize import fp8_dequantize
+from ._division import fp8_division
+from ._division_transpose import fp8_division_transpose
+from ._quantize import fp8_quantize
+from ._quantize_pertensor import fp8_quantize_pertensor
+from ._quantize_pertensor_transpose import fp8_quantize_pertensor_transpose
+from ._transpose import fp8_transpose
+from .add_bwd import fp8_add_Ifp_Ifp_Ofp_Opt
+from .add_fwd import fp8_add_Ifp_Ifp_Ofp_Og16
+# Normalization
+from .func_layernorm_noparam import fp8_layernorm_noparam_backward, fp8_layernorm_noparam_forward
+from .func_quantize import Coat_quantize_bgn, Coat_quantize_end
+from .func_rmsnorm import fp8_rmsnorm_backward, fp8_rmsnorm_forward
+from .gelu_bwd import fp8_gelu_backward
+from .gelu_fwd import fp8_gelu_forward
+# linear and add
+from .linear import fp8_linear_backward, fp8_linear_forward
+from .mul_bwd import fp8_mul_backward
+from .mul_fwd import fp8_mul_forward
+from .silu_bwd import fp8_silu_backward
+from .silu_fwd import fp8_silu_forward

llava/model/coat/activation/real_quantization/_dequantize.py ADDED Viewed

	@@ -0,0 +1,162 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+import torch
+# 4 block
+import triton
+import triton.language as tl
+from triton.language.extra.cuda import libdevice
+from .common import FP8_MAX_VALUE, SCALE_MIN_THRES, get_configs_io_block
+"""Dequantize Operator"""
+"""Input uses 1 * 16 group quantization"""
+"""Output is dequantized to BF16"""
+"""The input can be 2D or 3D, but the calculation is performed in 2D"""
+# Dequantization kernel. Each program handles one (BLOCK_M, BLOCK_N) tile of the 2D input and
+# multiplies every QB-wide group of values in a row by that group's scale.
+# BLOCK_M / BLOCK_N are presumably supplied by the autotuner configs from `get_configs_io_block()`;
+# BLOCK_SN (the number of scale columns per tile) is derived from BLOCK_N and QB by the heuristic.
+@triton.autotune(
+    configs=[] + get_configs_io_block(),
+    key=[
+        "N",
+    ],
+)
+@triton.heuristics(
+    {
+        "BLOCK_SN": lambda args: args["BLOCK_N"] // args["QB"],
+    }
+)
+@triton.jit
+def _fp8_dequantize_kernel(
+    output_ptr,  # output
+    input_ptr,
+    input_scale_ptr,  # input values and their per-group scales
+    M,
+    N,
+    SN,
+    QB: tl.constexpr,  # shape: rows, columns, scale columns, group size
+    input_stride_0,
+    input_stride_1,  # input stride
+    s_input_stride_0,
+    s_input_stride_1,  # input scale stride
+    output_stride_0,
+    output_stride_1,  # output stride
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_SN: tl.constexpr,
+):  # CUDA block size
+    # Block PID: the 1D program id is unflattened into a (row tile, column tile) pair.
+    pid = tl.program_id(0)
+    NUM_BLOCK_N = tl.cdiv(N, BLOCK_N)
+    pid_dim0 = pid // NUM_BLOCK_N
+    pid_dim1 = pid % NUM_BLOCK_N
+    # pointers to this program's tile of the input values
+    input_block_ptr = tl.make_block_ptr(
+        base=input_ptr,
+        shape=(M, N),
+        strides=(input_stride_0, input_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    # pointers to the matching tile of scales (one scale per QB input columns)
+    scale_input_ptr = tl.make_block_ptr(
+        base=input_scale_ptr,
+        shape=(M, SN),
+        strides=(s_input_stride_0, s_input_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_SN),
+        block_shape=(BLOCK_M, BLOCK_SN),
+        order=(1, 0),
+    )
+    # NOTE(review): these loads carry no boundary_check while the store below does; presumably
+    # M and N are expected to be multiples of the block sizes — confirm.
+    input = tl.load(input_block_ptr)
+    scale_input = tl.load(scale_input_ptr)
+    # Do the arithmetic in float32.
+    input = input.to(tl.float32)
+    scale_input = scale_input.to(tl.float32)
+    # Dequantize: view the tile as (rows, groups, QB) so each scale broadcasts over its own group.
+    scale_input = tl.reshape(scale_input, (BLOCK_M, BLOCK_SN, 1))
+    input = tl.reshape(input, (BLOCK_M, BLOCK_SN, QB))
+    output = input * scale_input
+    output = tl.reshape(output, (BLOCK_M, BLOCK_N))
+    # Cast to the element type of the output buffer.
+    output = output.to(output_ptr.dtype.element_ty)
+    # pointers to this program's tile of the output
+    output_block_ptr = tl.make_block_ptr(
+        base=output_ptr,
+        shape=(M, N),
+        strides=(output_stride_0, output_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    tl.store(output_block_ptr, output, boundary_check=(0, 1))
+def fp8_dequantize(x, s_x, QB):
+    # Change batched 3D input to 2D
+    batched = False
+    if len(x.shape) == 3:
+        batched = True
+        BS = x.shape[0]
+        x = x.reshape(-1, x.shape[-1])
+        s_x = s_x.reshape(-1, s_x.shape[-1])
+    # defining the input and output tensor
+    M, N = x.shape
+    SN = N // QB
+    y = torch.empty_like(x, dtype=torch.bfloat16)
+    grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
+    _fp8_dequantize_kernel[grid](
+        y,
+        x,
+        s_x,
+        M,
+        N,
+        SN,
+        QB,
+        x.stride(0),
+        x.stride(1),
+        s_x.stride(0),
+        s_x.stride(1),
+        y.stride(0),
+        y.stride(1),
+    )
+    # Recover 2D to 3D
+    if batched:
+        y = y.reshape(BS, -1, y.shape[-1])
+    return y

llava/model/coat/activation/real_quantization/_division.py ADDED Viewed

	@@ -0,0 +1,212 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+import torch
+# 4 block
+import triton
+import triton.language as tl
+from triton.language.extra.cuda import libdevice
+from .common import FP8_MAX_VALUE, SCALE_MIN_THRES, convert_fp8_to_embit, convert_str_to_fp8, get_configs_io_block
+"""Division Operator"""
+"""Input uses full-precision/BF16"""
+"""Output uses per tensor quantization"""
+"""The input can be 2D or 3D, but the calculation is performed in 2D"""
+# Division kernel. Each program handles one (BLOCK_M, BLOCK_N) tile of the 2D input, divides it by
+# a single scale read from `input_scale_ptr`, optionally applies stochastic rounding, and stores the
+# result cast to the element type of the output buffer.
+# `noise_ptr`, `SN`, `fp8_max` and `SCALE_MIN_THRES` are accepted but not read by the kernel body.
+@triton.autotune(
+    configs=[] + get_configs_io_block(),
+    key=[
+        "N",
+    ],
+)
+@triton.heuristics(
+    {
+        "BLOCK_SN": lambda args: args["BLOCK_N"] // args["QB"],
+    }
+)
+@triton.jit
+def _fp8_division_kernel(
+    output_ptr,  # output
+    input_ptr,
+    input_scale_ptr,  # input values and the scale to divide by
+    noise_ptr,  # noise for stochastic (only referenced by the commented-out code below)
+    M,
+    N,
+    SN,
+    QB: tl.constexpr,
+    fp8_max,
+    e_bit: tl.constexpr,
+    m_bit: tl.constexpr,  # shape and target-format bit widths
+    input_stride_0,
+    input_stride_1,  # input stride
+    output_stride_0,
+    output_stride_1,  # output stride
+    SCALE_MIN_THRES: tl.constexpr,  # We do not use it since we believe SCALE_MIN_THRES should be used in previous kernel when calculating scaling factor
+    STOCHASTIC: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_SN: tl.constexpr,
+):  # CUDA block size
+    # Block PID: the 1D program id is unflattened into a (row tile, column tile) pair.
+    pid = tl.program_id(0)
+    NUM_BLOCK_N = tl.cdiv(N, BLOCK_N)
+    pid_dim0 = pid // NUM_BLOCK_N
+    pid_dim1 = pid % NUM_BLOCK_N
+    # pointers to this program's tile of the input
+    input_block_ptr = tl.make_block_ptr(
+        base=input_ptr,
+        shape=(M, N),
+        strides=(input_stride_0, input_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    # NOTE(review): this load carries no boundary_check while the store below does; presumably
+    # M and N are expected to be multiples of the block sizes — confirm.
+    input = tl.load(input_block_ptr)
+    input = input.to(tl.float32)
+    # The scale is loaded straight from the base pointer with no offsets, i.e. one value per launch.
+    scale_output = tl.load(input_scale_ptr)
+    scale_output = scale_output.to(tl.float32)
+    output = tl.reshape(input, (BLOCK_M, BLOCK_SN, QB))
+    # Quantize: divide by the scale (computed by the caller), then restore the 2D tile shape.
+    output = tl.fdiv(output, scale_output)
+    output = tl.reshape(output, (BLOCK_M, BLOCK_N))
+    if STOCHASTIC:
+        # noise_block_ptr = tl.make_block_ptr(
+        #     base=noise_ptr,
+        #     shape=(M, N),
+        #     strides=(input_stride_0, input_stride_1),
+        #     offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_N),
+        #     block_shape=(BLOCK_M, BLOCK_N),
+        #     order=(1, 0)
+        # )
+        # noise = tl.load(noise_block_ptr)
+        # The noise is generated in-kernel instead of being loaded: every element uses its own
+        # memory offset in the input as the random-number offset.
+        offs_m = pid_dim0 * BLOCK_M + tl.arange(0, BLOCK_M)
+        offs_n = pid_dim1 * BLOCK_N + tl.arange(0, BLOCK_N)
+        noise_offset = offs_m[:, None] * input_stride_0 + offs_n[None, :] * input_stride_1
+        # NOTE(review): the seed is the constant 0, so the same offsets presumably yield the same
+        # noise on every launch — confirm this is intended.
+        noise = tl.rand(0, noise_offset)
+        output = _stochastic_rounding(output, noise, e_bit, m_bit)
+    # Cast to the element type of the output buffer.
+    output = output.to(output_ptr.type.element_ty)
+    # pointers to this program's tile of the output
+    output_block_ptr = tl.make_block_ptr(
+        base=output_ptr,
+        shape=(M, N),
+        strides=(output_stride_0, output_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    tl.store(output_block_ptr, output, boundary_check=(0, 1))
+# Stochastic-rounding helper: adds magnitude-scaled noise to |output| and clamps the result to a
+# range derived from the value's own power-of-two magnitude, then restores the sign.
+# `e_bit` / `m_bit` are the exponent and mantissa bit counts of the target format.
+@triton.jit
+def _stochastic_rounding(output, noise, e_bit: tl.constexpr, m_bit: tl.constexpr):
+    # 2^(2 - 2^(e_bit - 1) - m_bit); presumably the smallest positive subnormal of the target
+    # format — TODO confirm.
+    subnormal_min = tl.exp2(2 - tl.exp2(e_bit - 1) - m_bit)
+    # subnormal_should_be = tl.exp2(2 - tl.exp2(e_bit) - 1)
+    # Keep only the float32 exponent bits (mask 0x7F800000): this clears the sign and mantissa,
+    # leaving the power of two at or below |output|.
+    output_int32 = tl.cast(output, tl.int32, bitcast=True)
+    output_int32 = output_int32 & 0x7F800000
+    output_float32 = tl.cast(output_int32, tl.float32, bitcast=True)
+    # Magnitudes below `subnormal_min` are treated as being at `subnormal_min`.
+    output_exp = tl.maximum(output_float32, subnormal_min)
+    noise_rescale = tl.exp2(m_bit) + (output_exp == subnormal_min) * (
+        1 - tl.exp2(m_bit)
+    )  # 2^m_bit for normal, 1 for subnormal
+    # Scale the noise to the value's magnitude: output_exp / 2^m_bit per unit of noise for
+    # normals, output_exp per unit of noise in the subnormal case.
+    noise = output_exp * noise / noise_rescale
+    # Presumably +1 for non-negative and -1 for negative inputs (assumes signbit yields 0 or 1).
+    sign = 1 - 2 * libdevice.signbit(output)
+    output = tl.abs(output) + noise
+    minmax_ratio = 2 + (output_exp == subnormal_min) * (tl.exp2(m_bit) - 2)  # 2 for normal, and 2^M for subnormal
+    # Clamp the perturbed magnitude to [output_exp, minmax_ratio * output_exp] and restore the sign.
+    output = sign * tl.clamp(output, min=output_exp, max=minmax_ratio * output_exp)
+    return output
+def fp8_division(x, QB, fp8type, s_y=None, stochastic=False):
+    # Change batched 3D input to 2D
+    batched = False
+    if len(x.shape) == 3:
+        batched = True
+        BS = x.shape[0]
+        x = x.reshape(-1, x.shape[-1])
+    if stochastic:
+        # noise = torch.zeros_like(x, dtype=torch.float32).uniform_(-0.5, 0.5)
+        noise = None
+    else:
+        noise = None
+    # defining the input and output tensor
+    M, N = x.shape
+    SN = N // QB
+    if isinstance(fp8type, str):
+        fp8type = convert_str_to_fp8[fp8type]
+    y = torch.empty_like(x, dtype=fp8type)
+    fp8MaxValue = FP8_MAX_VALUE[fp8type]  # E4M3 and E5M2 have different max value
+    e_bit, m_bit = convert_fp8_to_embit[fp8type]
+    if s_y is None:
+        s_y = (x.abs().max() + SCALE_MIN_THRES) / fp8MaxValue
+    grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
+    _fp8_division_kernel[grid](
+        y,
+        x,
+        s_y,
+        noise,
+        M,
+        N,
+        SN,
+        QB,
+        fp8MaxValue,
+        e_bit,
+        m_bit,
+        x.stride(0),
+        x.stride(1),
+        y.stride(0),
+        y.stride(1),
+        SCALE_MIN_THRES=SCALE_MIN_THRES,
+        STOCHASTIC=stochastic,
+    )
+    # Recover 2D to 3D
+    if batched:
+        y = y.reshape(BS, -1, y.shape[-1])
+    return y, s_y  # y_t is expected to be 2D tensor

llava/model/coat/activation/real_quantization/_division_transpose.py ADDED Viewed

	@@ -0,0 +1,215 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+import torch
+# 4 block
+import triton
+import triton.language as tl
+from triton.language.extra.cuda import libdevice
+from ._division import _stochastic_rounding
+from .common import FP8_MAX_VALUE, SCALE_MIN_THRES, convert_fp8_to_embit, convert_str_to_fp8, get_configs_io_block
+"""Division and Transpose Operator"""
+"""Input uses full-precision/BF16"""
+"""Output uses per tensor quantization"""
+"""Output_t uses per tensor quantization and is transposed, but is flattened to 2D"""
+"""The input can be 2D or 3D, but the calculation is performed in 2D"""
+# Triton kernel: divide a 2D input by one scalar scale and write the result both
+# in its original layout and transposed. Each program instance handles one
+# (BLOCK_M, BLOCK_N) tile; BLOCK_SN is derived as BLOCK_N // QB by the heuristic.
+@triton.autotune(
+    configs=[] + get_configs_io_block(),  # triton.Config({'BLOCK_M': 1, 'BLOCK_N': 16}, num_stages=4, num_warps=1,)
+    # configs=[triton.Config({'BLOCK_M': 1, 'BLOCK_N': 16}, num_stages=4, num_warps=1,)], #
+    key=[
+        "N",
+    ],
+)
+@triton.heuristics(
+    {
+        "BLOCK_SN": lambda args: args["BLOCK_N"] // args["QB"],
+    }
+)
+@triton.jit
+def _fp8_division_transpose_kernel(
+    output_ptr,
+    output_t_ptr,  # output
+    input_ptr,
+    input_scale_ptr,  # input
+    noise_ptr,  # noise for stochastic
+    M,
+    N,
+    SN,
+    QB: tl.constexpr,
+    fp8_max,
+    e_bit,
+    m_bit,  # shape
+    input_stride_0,
+    input_stride_1,  # input stride
+    output_stride_0,
+    output_stride_1,  # output stride
+    output_t_stride_0,
+    output_t_stride_1,  # output stride
+    SCALE_MIN_THRES: tl.constexpr,  # We do not use it since we believe SCALE_MIN_THRES should be used in previous kernel when calculating scaling factor
+    STOCHASTIC: tl.constexpr,
+    ONLY_TRANSPOSED: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_SN: tl.constexpr,
+):  # CUDA block size
+    # Note: noise_ptr, SN and fp8_max are accepted but not read by the body below.
+    # Block PID
+    # The 1D program id is split into a (row-tile, column-tile) pair.
+    pid = tl.program_id(0)
+    NUM_BLOCK_N = tl.cdiv(N, BLOCK_N)
+    pid_dim0 = pid // NUM_BLOCK_N
+    pid_dim1 = pid % NUM_BLOCK_N
+    # pointers
+    input_block_ptr = tl.make_block_ptr(
+        base=input_ptr,
+        shape=(M, N),
+        strides=(input_stride_0, input_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    # NOTE(review): this load passes no boundary_check while both stores below do;
+    # confirm callers guarantee M and N are multiples of the tile sizes.
+    input = tl.load(input_block_ptr)
+    input = input.to(tl.float32)
+    # A single scale value is read, i.e. the same divisor is used for the whole tile.
+    scale_output = tl.load(input_scale_ptr)
+    scale_output = scale_output.to(tl.float32)
+    output = tl.reshape(input, (BLOCK_M, BLOCK_SN, QB))
+    # Quantize Scale calculation
+    # Quantize
+    output = tl.fdiv(output, scale_output)
+    output = tl.reshape(output, (BLOCK_M, BLOCK_N))
+    if STOCHASTIC:
+        # noise_block_ptr = tl.make_block_ptr(
+        #     base=noise_ptr,
+        #     shape=(M, N),
+        #     strides=(input_stride_0, input_stride_1),
+        #     offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_N),
+        #     block_shape=(BLOCK_M, BLOCK_N),
+        #     order=(1, 0)
+        # )
+        # noise = tl.load(noise_block_ptr)
+        # Noise is generated in-kernel from each element's input offset with a fixed
+        # seed of 0, instead of being read from noise_ptr.
+        offs_m = pid_dim0 * BLOCK_M + tl.arange(0, BLOCK_M)
+        offs_n = pid_dim1 * BLOCK_N + tl.arange(0, BLOCK_N)
+        noise_offset = offs_m[:, None] * input_stride_0 + offs_n[None, :] * input_stride_1
+        noise = tl.rand(0, noise_offset)
+        output = _stochastic_rounding(output, noise, e_bit, m_bit)
+    # Cast to the element type of the output buffer before storing.
+    output = output.to(output_ptr.type.element_ty)
+    # tl.device_print("3: ", output)
+    output_t = tl.trans(output)
+    # pointers
+    output_block_ptr = tl.make_block_ptr(
+        base=output_ptr,
+        shape=(M, N),
+        strides=(output_stride_0, output_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    # The transposed tile lands at the mirrored tile position of an (N, M) buffer.
+    output_t_block_ptr = tl.make_block_ptr(
+        base=output_t_ptr,
+        shape=(N, M),
+        strides=(output_t_stride_0, output_t_stride_1),
+        offsets=(pid_dim1 * BLOCK_N, pid_dim0 * BLOCK_M),
+        block_shape=(BLOCK_N, BLOCK_M),
+        order=(1, 0),
+    )
+    # The non-transposed store is skipped when only the transposed result is wanted.
+    if not ONLY_TRANSPOSED:
+        tl.store(output_block_ptr, output, boundary_check=(0, 1))
+    tl.store(output_t_block_ptr, output_t, boundary_check=(0, 1))
+def fp8_division_transpose(x, QB, fp8type, s_y=None, stochastic=False, only_transposed=False):
+    # Change batched 3D input to 2D
+    batched = False
+    if len(x.shape) == 3:
+        batched = True
+        BS = x.shape[0]
+        x = x.reshape(-1, x.shape[-1])
+    if stochastic:
+        # noise = torch.empty_like(x, dtype=torch.float32).uniform_(-0.5, 0.5)
+        noise = None
+    else:
+        noise = None
+    # defining the input and output tensor
+    M, N = x.shape
+    SN = N // QB
+    if isinstance(fp8type, str):
+        fp8type = convert_str_to_fp8[fp8type]
+    y = torch.empty_like(x, dtype=fp8type)
+    y_t = torch.empty((N, M), dtype=fp8type, device=x.device)
+    fp8MaxValue = FP8_MAX_VALUE[fp8type]  # E4M3 and E5M2 have different max value
+    e_bit, m_bit = convert_fp8_to_embit[fp8type]
+    if s_y is None:
+        # print("Warning: do not specify s_y in fp8_division_transpose")
+        s_y = (x.abs().max() + SCALE_MIN_THRES) / fp8MaxValue
+    grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
+    _fp8_division_transpose_kernel[grid](
+        y,
+        y_t,
+        x,
+        s_y,
+        noise,
+        M,
+        N,
+        SN,
+        QB,
+        fp8MaxValue,
+        e_bit,
+        m_bit,
+        x.stride(0),
+        x.stride(1),
+        y.stride(0),
+        y.stride(1),
+        y_t.stride(0),
+        y_t.stride(1),
+        SCALE_MIN_THRES=SCALE_MIN_THRES,
+        STOCHASTIC=stochastic,
+        ONLY_TRANSPOSED=only_transposed,
+    )
+    if not only_transposed:
+        # Recover 2D to 3D
+        if batched:
+            y = y.reshape(BS, -1, y.shape[-1])
+        return y, s_y, y_t  # y_t is expected to be 2D tensor
+    else:
+        return y_t, s_y

llava/model/coat/activation/real_quantization/_memory_io.py ADDED Viewed

	@@ -0,0 +1,180 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+import torch
+# 4 block
+import triton
+import triton.language as tl
+from triton.language.extra.cuda import libdevice
+CONST_BLOCK = 32
+# The kernel with 1 load operation and 4 store operation
+def get_configs_io_block():
+    configs = []
+    for nstages in [3, 4, 5, 6]:
+        for block_m in [32, 64, 128]:
+            for block_n in [32, 64, 128]:
+                for nwarps in [4, 8, 16, 32]:
+                    configs.append(
+                        triton.Config(
+                            {"BLOCK_M": block_m, "BLOCK_N": block_n},
+                            num_stages=nstages,
+                            num_warps=nwarps,
+                        )
+                    )
+    return configs
+# Benchmark kernel: each program loads one (BLOCK_M, BLOCK_N) tile, doubles it in
+# float32 and stores it back in the output buffer's element type.
+@triton.autotune(
+    configs=[] + get_configs_io_block(),
+    key=[
+        "N",
+    ],
+)
+@triton.jit
+def bench_memory_io_kernel_forward(
+    output_ptr,
+    input_ptr,
+    M,
+    N,
+    B: tl.constexpr,
+    input_stride_0,
+    input_stride_1,
+    output_stride_0,
+    output_stride_1,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    # Note: B is accepted but not read by the body below.
+    # Block PID
+    # The 1D program id is split into a (row-tile, column-tile) pair.
+    pid = tl.program_id(0)
+    NUM_BLOCK_M = tl.cdiv(M, BLOCK_M)  # computed but not used afterwards
+    NUM_BLOCK_N = tl.cdiv(N, BLOCK_N)
+    pid_dim0 = pid // NUM_BLOCK_N
+    pid_dim1 = pid % NUM_BLOCK_N
+    # pointers
+    input_block_ptr = tl.make_block_ptr(
+        base=input_ptr,
+        shape=(M, N),
+        strides=(input_stride_0, input_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    # NOTE(review): the load has no boundary_check while the store below does;
+    # confirm M and N are multiples of the tile sizes for every benchmarked shape.
+    input = tl.load(input_block_ptr)
+    input = input.to(tl.float32)
+    # Trivial arithmetic so the kernel does a load, one multiply and a store.
+    output = input * 2
+    # pointers
+    output_block_ptr = tl.make_block_ptr(
+        base=output_ptr,
+        shape=(M, N),
+        strides=(output_stride_0, output_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    # Cast back to the element type of the output buffer before storing.
+    output = output.to(output_ptr.type.element_ty)
+    tl.store(output_block_ptr, output, boundary_check=(0, 1))
+def bench_memory_io_forward(x, B):
+    # defining the input and output tensor
+    M, N = x.shape
+    y = torch.empty_like(x, dtype=x.dtype)
+    grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
+    bench_memory_io_kernel_forward[grid](
+        y,
+        x,
+        M,
+        N,
+        B,
+        x.stride(0),
+        x.stride(1),
+        y.stride(0),
+        y.stride(1),
+    )
+    return y
+configs = []
+for SL in [8192]:
+    configs.append(
+        triton.testing.Benchmark(  # test different matrix size influence
+            x_names=["CDIM"],
+            x_vals=[1024, 2048, 4096, 8192],
+            line_arg="dtype",
+            line_vals=[torch.int8, torch.float16, torch.float32],
+            line_names=["float8", "float16", "float32"],
+            styles=[("blue", "-"), ("green", "-"), ("red", "-")],
+            ylabel="time-cost",
+            plot_name=f"INT8GELU<BLSZ={CONST_BLOCK}><SL={SL}>",
+            args={"SL": SL, "B": CONST_BLOCK, "provider": "triton", "mode": "time-consuming"},
+        )
+    )
+@triton.testing.perf_report(configs)
+def bench_load_store(
+    SL, CDIM, B, provider, dtype, mode="forward"
+):  # I only use triton as the provider, and mode when benchmarking
+    # create data
+    x = torch.randn(SL, CDIM, dtype=torch.float32).cuda()
+    x = x.to(dtype)
+    quantiles = [0.5, 0.2, 0.8]
+    # utility functions
+    if provider == "triton":
+        def y_fwd():
+            bench_memory_io_forward(x, B)
+    if provider == "torch":
+        torch_gelu = torch.nn.GELU()
+        def y_fwd():
+            return torch_gelu(x)
+    # forward pass
+    if mode == "time-consuming":
+        convert_func = lambda ms: ms
+        ms, min_ms, max_ms = triton.testing.do_bench(y_fwd, quantiles=quantiles, rep=100)
+    # backward pass
+    if mode == "gbps":
+        convert_func = lambda ms: 2 * x.numel() * x.element_size() / ms * 1e-6
+        ms, min_ms, max_ms = triton.testing.do_bench(y_fwd, quantiles=quantiles, rep=100)
+    return convert_func(ms), convert_func(max_ms), convert_func(min_ms)
+# Script entry point: run the load/store benchmark sweep and print its table.
+if __name__ == "__main__":
+    # Fixed seed so the random inputs are the same on every run.
+    torch.manual_seed(0)
+    # Wide, non-scientific tensor printing for easier manual inspection.
+    torch.set_printoptions(precision=8, linewidth=1600, sci_mode=False, edgeitems=3)
+    bench_load_store.run(print_data=True)

llava/model/coat/activation/real_quantization/_quantize.py ADDED Viewed

	@@ -0,0 +1,176 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+import torch
+# 4 block
+import triton
+import triton.language as tl
+from triton.language.extra.cuda import libdevice
+from .common import FP8_MAX_VALUE, SCALE_MIN_THRES, convert_fp8_to_embit, convert_str_to_fp8, get_configs_io_block
+"""Quantize Operator"""
+"""Input uses 1 * 16 group quantization"""
+"""Output uses 1 * 16 group quantization"""
+"""The input can be 2D or 3D, but the calculation is performed in 2D"""
+# Triton kernel: group-wise quantization. Each program handles one
+# (BLOCK_M, BLOCK_N) tile, computes one scale per group of QB consecutive columns
+# and writes both the scaled values and the scales. BLOCK_SN = BLOCK_N // QB.
+@triton.autotune(
+    configs=[] + get_configs_io_block(),
+    key=[
+        "N",
+    ],
+)
+@triton.heuristics(
+    {
+        "BLOCK_SN": lambda args: args["BLOCK_N"] // args["QB"],
+    }
+)
+@triton.jit
+def _fp8_quantize_kernel(
+    output_ptr,
+    output_scale_ptr,  # output
+    input_ptr,  # input
+    M,
+    N,
+    SN,
+    QB: tl.constexpr,
+    fp8_max,  # shape
+    input_stride_0,
+    input_stride_1,  # input stride
+    output_stride_0,
+    output_stride_1,  # output stride
+    s_output_stride_0,
+    s_output_stride_1,  # scale of output stride
+    SCALE_MIN_THRES: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_SN: tl.constexpr,
+):  # CUDA block size
+    # Block PID
+    # The 1D program id is split into a (row-tile, column-tile) pair.
+    pid = tl.program_id(0)
+    NUM_BLOCK_N = tl.cdiv(N, BLOCK_N)
+    pid_dim0 = pid // NUM_BLOCK_N
+    pid_dim1 = pid % NUM_BLOCK_N
+    # pointers
+    input_block_ptr = tl.make_block_ptr(
+        base=input_ptr,
+        shape=(M, N),
+        strides=(input_stride_0, input_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    # NOTE(review): this load passes no boundary_check while both stores below do;
+    # confirm callers guarantee M and N are multiples of the tile sizes.
+    input = tl.load(input_block_ptr)
+    input = input.to(tl.float32)
+    # View the tile as (rows, groups, QB) so the reduction runs within each group.
+    output = tl.reshape(input, (BLOCK_M, BLOCK_SN, QB))
+    # Quantize Scale calculation
+    # scale = (max |value| in the group + SCALE_MIN_THRES) / fp8_max
+    abs_output = tl.abs(output)
+    max_val = tl.max(abs_output, axis=2) + SCALE_MIN_THRES
+    scale_output = max_val / fp8_max
+    scale_output = tl.reshape(scale_output, (BLOCK_M, BLOCK_SN, 1))
+    # Quantize
+    # The division uses the float32 scale; the scale is cast to its storage type
+    # only afterwards.
+    output = tl.fdiv(output, scale_output)
+    output = output.to(output_ptr.type.element_ty)
+    scale_output = scale_output.to(output_scale_ptr.type.element_ty)
+    scale_output = tl.reshape(scale_output, (BLOCK_M, BLOCK_SN))
+    output = tl.reshape(output, (BLOCK_M, BLOCK_N))
+    # debug
+    # gelu_output = input
+    # scale_output = scale_input
+    # pointers
+    output_block_ptr = tl.make_block_ptr(
+        base=output_ptr,
+        shape=(M, N),
+        strides=(output_stride_0, output_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    # Scales live in an (M, SN) buffer, one column per group.
+    scale_output_ptr = tl.make_block_ptr(
+        base=output_scale_ptr,
+        shape=(M, SN),
+        strides=(s_output_stride_0, s_output_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_SN),
+        block_shape=(BLOCK_M, BLOCK_SN),
+        order=(1, 0),
+    )
+    tl.store(output_block_ptr, output, boundary_check=(0, 1))
+    tl.store(scale_output_ptr, scale_output, boundary_check=(0, 1))
+def fp8_quantize(x, QB, fp8type):
+    # Change batched 3D input to 2D
+    batched = False
+    if len(x.shape) == 3:
+        batched = True
+        BS = x.shape[0]
+        x = x.reshape(-1, x.shape[-1])
+    # defining the input and output tensor
+    M, N = x.shape
+    SN = N // QB
+    if isinstance(fp8type, str):
+        fp8type = convert_str_to_fp8[fp8type]
+    y = torch.empty_like(x, dtype=fp8type)
+    s_y = torch.empty((M, SN), dtype=torch.bfloat16, device=x.device)
+    fp8MaxValue = FP8_MAX_VALUE[fp8type]  # E4M3 and E5M2 have different max value
+    grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
+    _fp8_quantize_kernel[grid](
+        y,
+        s_y,
+        x,
+        M,
+        N,
+        SN,
+        QB,
+        fp8MaxValue,
+        x.stride(0),
+        x.stride(1),
+        y.stride(0),
+        y.stride(1),
+        s_y.stride(0),
+        s_y.stride(1),
+        SCALE_MIN_THRES=SCALE_MIN_THRES,
+    )
+    # Recover 2D to 3D
+    if batched:
+        y = y.reshape(BS, -1, y.shape[-1])
+        s_y = s_y.reshape(BS, -1, s_y.shape[-1])
+    return y, s_y

llava/model/coat/activation/real_quantization/_quantize_pertensor.py ADDED Viewed

	@@ -0,0 +1,152 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+import torch
+# 4 block
+import triton
+import triton.language as tl
+from triton.language.extra.cuda import libdevice
+from ._division import fp8_division
+from .common import FP8_MAX_VALUE, SCALE_MIN_THRES, convert_str_to_fp8, get_configs_io_block
+"""Per Tensor Quantize Operator"""
+"""Input uses full precision"""
+"""Output uses per tensor quantization"""
+"""The input can be 2D or 3D, but the calculation is performed in 2D"""
+# Triton kernel: computes one scale per group of QB consecutive columns and
+# writes only those scales; no quantized values are produced here. Each program
+# handles one (BLOCK_M, BLOCK_N) tile, with BLOCK_SN = BLOCK_N // QB.
+@triton.autotune(
+    configs=[] + get_configs_io_block(),
+    key=[
+        "N",
+    ],
+)
+@triton.heuristics(
+    {
+        "BLOCK_SN": lambda args: args["BLOCK_N"] // args["QB"],
+    }
+)
+@triton.jit
+def _fp8_quantize_pertensor_kernel(
+    output_scale_ptr,  # output
+    input_ptr,  # input
+    M,
+    N,
+    SN,
+    QB: tl.constexpr,
+    fp8_max,  # shape
+    input_stride_0,
+    input_stride_1,  # input stride
+    s_output_stride_0,
+    s_output_stride_1,  # scale of output stride
+    SCALE_MIN_THRES: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_SN: tl.constexpr,
+):  # CUDA block size
+    # Block PID
+    # The 1D program id is split into a (row-tile, column-tile) pair.
+    pid = tl.program_id(0)
+    NUM_BLOCK_N = tl.cdiv(N, BLOCK_N)
+    pid_dim0 = pid // NUM_BLOCK_N
+    pid_dim1 = pid % NUM_BLOCK_N
+    # pointers
+    input_block_ptr = tl.make_block_ptr(
+        base=input_ptr,
+        shape=(M, N),
+        strides=(input_stride_0, input_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    # NOTE(review): this load passes no boundary_check while the store below does;
+    # confirm callers guarantee M and N are multiples of the tile sizes.
+    input = tl.load(input_block_ptr)
+    input = input.to(tl.float32)
+    # View the tile as (rows, groups, QB) so the reduction runs within each group.
+    output = tl.reshape(input, (BLOCK_M, BLOCK_SN, QB))
+    # Quantize Scale calculation
+    # scale = (max |value| in the group + SCALE_MIN_THRES) / fp8_max
+    abs_output = tl.abs(output)
+    max_val = tl.max(abs_output, axis=2) + SCALE_MIN_THRES
+    scale_output = max_val / fp8_max
+    scale_output = tl.reshape(scale_output, (BLOCK_M, BLOCK_SN, 1))
+    scale_output = scale_output.to(output_scale_ptr.type.element_ty)
+    scale_output = tl.reshape(scale_output, (BLOCK_M, BLOCK_SN))
+    # Scales live in an (M, SN) buffer, one column per group.
+    scale_output_ptr = tl.make_block_ptr(
+        base=output_scale_ptr,
+        shape=(M, SN),
+        strides=(s_output_stride_0, s_output_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_SN),
+        block_shape=(BLOCK_M, BLOCK_SN),
+        order=(1, 0),
+    )
+    tl.store(scale_output_ptr, scale_output, boundary_check=(0, 1))
+def fp8_quantize_pertensor(x, QB, fp8type, stochastic=False):
+    # Change batched 3D input to 2D
+    batched = False
+    if len(x.shape) == 3:
+        batched = True
+        BS = x.shape[0]
+        x = x.reshape(-1, x.shape[-1])
+    # defining the input and output tensor
+    M, N = x.shape
+    SN = N // QB
+    fp8type = convert_str_to_fp8[fp8type]
+    s_y = torch.empty((M, SN), dtype=torch.bfloat16, device=x.device)
+    fp8MaxValue = FP8_MAX_VALUE[fp8type]  # E4M3 and E5M2 have different max value
+    grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
+    _fp8_quantize_pertensor_kernel[grid](
+        s_y,
+        x,
+        M,
+        N,
+        SN,
+        QB,
+        fp8MaxValue,
+        x.stride(0),
+        x.stride(1),
+        s_y.stride(0),
+        s_y.stride(1),
+        SCALE_MIN_THRES=SCALE_MIN_THRES,
+    )
+    s_y_max = s_y.max()
+    y, s_y_max = fp8_division(x, QB, fp8type, s_y_max, stochastic=stochastic)  # reuse the floating point output y1
+    # Recover 2D to 3D
+    if batched:
+        y = y.reshape(BS, -1, y.shape[-1])
+        s_y = s_y.reshape(BS, -1, s_y.shape[-1])
+    return y, s_y_max, s_y

llava/model/coat/activation/real_quantization/_quantize_pertensor_transpose.py ADDED Viewed

	@@ -0,0 +1,155 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+import torch
+# 4 block
+import triton
+import triton.language as tl
+from triton.language.extra.cuda import libdevice
+from ._division_transpose import fp8_division_transpose
+from .common import FP8_MAX_VALUE, SCALE_MIN_THRES, convert_str_to_fp8, get_configs_io_block
+"""Per Tensor Quantize and Transpose Operator"""
+"""Input uses floating point tensor"""
+"""Output uses per-tensor quantization, returns a non-transpose version and a transpose version"""
+"""The input can be 2D or 3D, but the calculation is performed in 2D"""
+# Triton kernel: computes one scale per group of QB consecutive columns and
+# writes only those scales; no quantized or transposed values are produced here.
+# Each program handles one (BLOCK_M, BLOCK_N) tile, with BLOCK_SN = BLOCK_N // QB.
+@triton.autotune(
+    configs=[] + get_configs_io_block(),
+    key=[
+        "N",
+    ],
+)
+@triton.heuristics(
+    {
+        "BLOCK_SN": lambda args: args["BLOCK_N"] // args["QB"],
+    }
+)
+@triton.jit
+def _fp8_quantize_pertensor_transpose_kernel(
+    output_scale_ptr,  # output
+    input_ptr,  # input
+    M,
+    N,
+    SN,
+    QB: tl.constexpr,
+    fp8_max,  # shape
+    input_stride_0,
+    input_stride_1,  # input stride
+    s_output_stride_0,
+    s_output_stride_1,  # scale of output stride
+    SCALE_MIN_THRES: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_SN: tl.constexpr,
+):  # CUDA block size
+    # Block PID
+    # The 1D program id is split into a (row-tile, column-tile) pair.
+    pid = tl.program_id(0)
+    NUM_BLOCK_N = tl.cdiv(N, BLOCK_N)
+    pid_dim0 = pid // NUM_BLOCK_N
+    pid_dim1 = pid % NUM_BLOCK_N
+    # pointers
+    input_block_ptr = tl.make_block_ptr(
+        base=input_ptr,
+        shape=(M, N),
+        strides=(input_stride_0, input_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    # NOTE(review): this load passes no boundary_check while the store below does;
+    # confirm callers guarantee M and N are multiples of the tile sizes.
+    input = tl.load(input_block_ptr)
+    input = input.to(tl.float32)
+    # View the tile as (rows, groups, QB) so the reduction runs within each group.
+    output = tl.reshape(input, (BLOCK_M, BLOCK_SN, QB))
+    # Quantize Scale calculation
+    # scale = (max |value| in the group + SCALE_MIN_THRES) / fp8_max
+    abs_output = tl.abs(output)
+    max_val = tl.max(abs_output, axis=2) + SCALE_MIN_THRES
+    scale_output = max_val / fp8_max
+    scale_output = tl.reshape(scale_output, (BLOCK_M, BLOCK_SN, 1))
+    scale_output = scale_output.to(output_scale_ptr.type.element_ty)
+    scale_output = tl.reshape(scale_output, (BLOCK_M, BLOCK_SN))
+    # Scales live in an (M, SN) buffer, one column per group.
+    scale_output_ptr = tl.make_block_ptr(
+        base=output_scale_ptr,
+        shape=(M, SN),
+        strides=(s_output_stride_0, s_output_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_SN),
+        block_shape=(BLOCK_M, BLOCK_SN),
+        order=(1, 0),
+    )
+    tl.store(scale_output_ptr, scale_output, boundary_check=(0, 1))
+def fp8_quantize_pertensor_transpose(x, QB, fp8type, transpose_output_2d=False, stochastic=False):
+    # Change batched 3D input to 2D
+    batched = False
+    if len(x.shape) == 3:
+        batched = True
+        BS = x.shape[0]
+        x = x.reshape(-1, x.shape[-1])
+    # defining the input and output tensor
+    M, N = x.shape
+    SN = N // QB
+    fp8type = convert_str_to_fp8[fp8type]
+    s_y = torch.empty((M, SN), dtype=torch.bfloat16, device=x.device)
+    fp8MaxValue = FP8_MAX_VALUE[fp8type]  # E4M3 and E5M2 have different max value
+    grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
+    _fp8_quantize_pertensor_transpose_kernel[grid](
+        s_y,
+        x,
+        M,
+        N,
+        SN,
+        QB,
+        fp8MaxValue,
+        x.stride(0),
+        x.stride(1),
+        s_y.stride(0),
+        s_y.stride(1),
+        SCALE_MIN_THRES=SCALE_MIN_THRES,
+    )
+    s_y_max = s_y.max()
+    qy, s_y_max, qy_t = fp8_division_transpose(
+        x, QB, fp8type, s_y_max, stochastic=stochastic
+    )  # Stochastic Rounding happens here
+    # Recover 2D to 3D
+    if batched:
+        qy = qy.reshape(BS, -1, qy.shape[-1])
+        if not transpose_output_2d:
+            qy_t = qy_t.reshape(BS, -1, qy_t.shape[-1])
+    return qy, s_y_max, qy_t  # y_t is expected to be 2D tensor

llava/model/coat/activation/real_quantization/_transpose.py ADDED Viewed

	@@ -0,0 +1,121 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+import torch
+# 4 block
+import triton
+import triton.language as tl
+from triton.language.extra.cuda import libdevice
+from .common import get_configs_io_block
+"""Quantize Operator"""
+"""Input uses 1 * 16 group quantization"""
+"""Output uses 1 * 16 group quantization"""
+"""The input can be 2D or 3D, but the calculation is performed in 2D"""
+# Triton kernel: tile-wise 2D transpose. Each program loads one
+# (BLOCK_M, BLOCK_N) tile of the (M, N) input and stores its transpose at the
+# mirrored tile position of the (N, M) output; values are not converted.
+@triton.autotune(
+    configs=[] + get_configs_io_block(),
+    key=[
+        "N",
+    ],
+)
+@triton.jit
+def _fp8_transpose_kernel(
+    output_ptr,  # output
+    input_ptr,  # input
+    M,
+    N,  # shape
+    input_stride_0,
+    input_stride_1,  # input stride
+    output_stride_0,
+    output_stride_1,  # output stride
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):  # CUDA block size
+    # Block PID
+    # The 1D program id is split into a (row-tile, column-tile) pair.
+    pid = tl.program_id(0)
+    NUM_BLOCK_N = tl.cdiv(N, BLOCK_N)
+    pid_dim0 = pid // NUM_BLOCK_N
+    pid_dim1 = pid % NUM_BLOCK_N
+    # pointers
+    input_block_ptr = tl.make_block_ptr(
+        base=input_ptr,
+        shape=(M, N),
+        strides=(input_stride_0, input_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    # NOTE(review): this load passes no boundary_check while the store below does;
+    # confirm callers guarantee M and N are multiples of the tile sizes.
+    input = tl.load(input_block_ptr)
+    output = tl.trans(input)
+    # pointers
+    # Tile offsets and block shape are swapped relative to the input.
+    output_block_ptr = tl.make_block_ptr(
+        base=output_ptr,
+        shape=(N, M),
+        strides=(output_stride_0, output_stride_1),
+        offsets=(pid_dim1 * BLOCK_N, pid_dim0 * BLOCK_M),
+        block_shape=(BLOCK_N, BLOCK_M),
+        order=(1, 0),
+    )
+    tl.store(output_block_ptr, output, boundary_check=(0, 1))
+def fp8_transpose(x, transpose_output_2d=False):
+    # Change batched 3D input to 2D
+    batched = False
+    if len(x.shape) == 3:
+        batched = True
+        BS = x.shape[0]
+        x = x.reshape(-1, x.shape[-1])
+    # defining the input and output tensor
+    M, N = x.shape
+    y = torch.empty((N, M), dtype=x.dtype, device=x.device)
+    grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
+    _fp8_transpose_kernel[grid](
+        y,
+        x,
+        M,
+        N,
+        x.stride(0),
+        x.stride(1),
+        y.stride(0),
+        y.stride(1),
+    )
+    # Recover 2D to 3D
+    if batched and not transpose_output_2d:
+        y = y.reshape(BS, -1, y.shape[-1])
+    return y

llava/model/coat/activation/real_quantization/add_bwd.py ADDED Viewed

	@@ -0,0 +1,205 @@

+# Copyright (c) 2025 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license.
+# LICENSE is in incl_licenses directory.
+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+import torch
+# 4 block
+import triton
+import triton.language as tl
+from triton.language.extra.cuda import libdevice
+from ._division import fp8_division
+from .common import FP8_MAX_VALUE, SCALE_MIN_THRES, convert_str_to_fp8, get_configs_io_block
+"""Element-wise Add, useful for backward"""
+"""Input1 (Residual) uses full-precision/BF16"""
+"""Input2 (Backbone) uses full-precision/BF16"""
+"""Output1 uses full-precision/BF16"""
+"""Output2 uses per-tensor quantization"""
+"""The input can be 2D or 3D, but the calculation is performed in 2D"""
+@triton.autotune(
+    configs=[] + get_configs_io_block(),
+    key=[
+        "N",
+    ],
+)
+@triton.heuristics(
+    {
+        "BLOCK_SN": lambda args: args["BLOCK_N"] // args["QB"],
+    }
+)
+@triton.jit
+def _fp8_add_Ifp_Ifp_Ofp_Opt_kernel(
+    output1_ptr,  # output
+    output2_scale_ptr,
+    input1_ptr,  # input
+    input2_ptr,  # input
+    M,
+    N,
+    SN,
+    QB: tl.constexpr,
+    fp8_max,  # shape
+    input1_stride_0,
+    input1_stride_1,  # input1 stride
+    input2_stride_0,
+    input2_stride_1,  # input2 stride
+    output1_stride_0,
+    output1_stride_1,  # output stride
+    s_output2_stride_0,
+    s_output2_stride_1,  # scale of output stride
+    SCALE_MIN_THRES: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_SN: tl.constexpr,
+):  # CUDA block size
+    # Block PID
+    pid = tl.program_id(0)
+    NUM_BLOCK_N = tl.cdiv(N, BLOCK_N)
+    pid_dim0 = pid // NUM_BLOCK_N
+    pid_dim1 = pid % NUM_BLOCK_N
+    # --- The first input ---
+    input1_block_ptr = tl.make_block_ptr(
+        base=input1_ptr,
+        shape=(M, N),
+        strides=(input1_stride_0, input1_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    input1 = tl.load(input1_block_ptr)
+    input1 = input1.to(tl.float32)
+    input1 = tl.reshape(input1, (BLOCK_M, BLOCK_SN, QB))
+    # --- The second input ---
+    input2_block_ptr = tl.make_block_ptr(
+        base=input2_ptr,
+        shape=(M, N),
+        strides=(input2_stride_0, input2_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    input2 = tl.load(input2_block_ptr)
+    input2 = input2.to(tl.float32)
+    input2 = tl.reshape(input2, (BLOCK_M, BLOCK_SN, QB))
+    # Actual Calculation of Add
+    add_output = input1 + input2
+    # Quantize the grad 1 - Scale calculation
+    abs_add_output = tl.abs(add_output)
+    max_val = tl.max(abs_add_output, axis=2) + SCALE_MIN_THRES
+    scale_output2 = max_val / fp8_max
+    scale_output2 = tl.reshape(scale_output2, (BLOCK_M, BLOCK_SN, 1))
+    # save the fp add output
+    fp_add_output = add_output.to(output1_ptr.type.element_ty)
+    fp_add_output = tl.reshape(fp_add_output, (BLOCK_M, BLOCK_N))
+    # pointers
+    output1_block_ptr = tl.make_block_ptr(
+        base=output1_ptr,
+        shape=(M, N),
+        strides=(output1_stride_0, output1_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_N),
+        block_shape=(BLOCK_M, BLOCK_N),
+        order=(1, 0),
+    )
+    tl.store(output1_block_ptr, fp_add_output, boundary_check=(0, 1))
+    # Quantize
+    scale_output2 = scale_output2.to(output2_scale_ptr.type.element_ty)
+    scale_output2 = tl.reshape(scale_output2, (BLOCK_M, BLOCK_SN))
+    # pointers
+    scale_output2_ptr = tl.make_block_ptr(
+        base=output2_scale_ptr,
+        shape=(M, SN),
+        strides=(s_output2_stride_0, s_output2_stride_1),
+        offsets=(pid_dim0 * BLOCK_M, pid_dim1 * BLOCK_SN),
+        block_shape=(BLOCK_M, BLOCK_SN),
+        order=(1, 0),
+    )
+    tl.store(scale_output2_ptr, scale_output2, boundary_check=(0, 1))
+def fp8_add_Ifp_Ifp_Ofp_Opt(x1, x2, QB, fp8type, stochastic=False):  # suppose x1 is full precision or BF16
+    # Change batched 3D input to 2D
+    batched = False
+    if len(x1.shape) == 3:
+        assert len(x2.shape) == 3
+        batched = True
+        BS = x1.shape[0]
+        x1 = x1.reshape(-1, x1.shape[-1])
+        x2 = x2.reshape(-1, x2.shape[-1])
+    # defining the input and output tensor
+    M, N = x1.shape
+    SN = N // QB
+    assert x1.shape == x2.shape
+    if isinstance(fp8type, str):
+        fp8type = convert_str_to_fp8[fp8type]
+    y1 = torch.empty_like(x1, dtype=torch.bfloat16)
+    s_y2 = torch.empty((M, SN), dtype=torch.bfloat16, device=x2.device)
+    fp8MaxValue = FP8_MAX_VALUE[fp8type]  # E4M3 and E5M2 have different max value
+    grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
+    _fp8_add_Ifp_Ifp_Ofp_Opt_kernel[grid](
+        y1,
+        s_y2,
+        x1,
+        x2,
+        M,
+        N,
+        SN,
+        QB,
+        fp8MaxValue,
+        x1.stride(0),
+        x1.stride(1),
+        x2.stride(0),
+        x2.stride(1),
+        y1.stride(0),
+        y1.stride(1),
+        s_y2.stride(0),
+        s_y2.stride(1),
+        SCALE_MIN_THRES=SCALE_MIN_THRES,
+    )
+    s_y2_max = s_y2.max()
+    qy2, s_y2_max = fp8_division(y1, QB, fp8type, s_y2_max, stochastic=stochastic)  # reuse the floating point output y1
+    # Recover 2D to 3D
+    if batched:
+        y1 = y1.reshape(BS, -1, y1.shape[-1])
+        qy2 = qy2.reshape(BS, -1, qy2.shape[-1])
+        s_y2 = s_y2.reshape(BS, -1, s_y2.shape[-1])
+    return y1, (qy2, s_y2_max, s_y2)