Spaces: Running on Zero

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full changeset.
- .gitattributes +16 -0
- .gitignore +9 -0
- app.py +640 -0
- assets/bot.png +0 -0
- assets/user.png +0 -0
- data/10309844035.mp4 +3 -0
- data/13887487955.mp4 +3 -0
- data/4167294363.mp4 +3 -0
- data/4742652230.mp4 +3 -0
- data/4766274786.mp4 +3 -0
- data/5012237466.mp4 +3 -0
- data/5188348585.mp4 +3 -0
- data/9383140374.mp4 +3 -0
- data/DTInxNfWXVc_210.0_360.0.mp4 +3 -0
- data/RoripwjYFp8_210.0_360.0.mp4 +3 -0
- data/UFWQKrcbhjI_360.0_510.0.mp4 +3 -0
- data/Z3-IZ3HAmIA_60.0_210.0.mp4 +3 -0
- data/h6QKDqomIPk_210.0_360.0.mp4 +3 -0
- data/pA6Z-qYhSNg_60.0_210.0.mp4 +3 -0
- data/rrTIeJRVGjg_60.0_210.0.mp4 +3 -0
- data/yId2wIocTys_210.0_360.0.mp4 +3 -0
- requirements.txt +26 -0
- setup.cfg +16 -0
- videomind/constants.py +42 -0
- videomind/conversation.py +49 -0
- videomind/dataset/__init__.py +61 -0
- videomind/dataset/collator.py +40 -0
- videomind/dataset/hybrid.py +180 -0
- videomind/dataset/sub_classes/__init__.py +69 -0
- videomind/dataset/sub_classes/activitynet_captions.py +96 -0
- videomind/dataset/sub_classes/activitynet_rtl.py +68 -0
- videomind/dataset/sub_classes/cgbench.py +47 -0
- videomind/dataset/sub_classes/charades_sta.py +45 -0
- videomind/dataset/sub_classes/cosmo_cap.py +37 -0
- videomind/dataset/sub_classes/didemo.py +59 -0
- videomind/dataset/sub_classes/ego4d_naq.py +81 -0
- videomind/dataset/sub_classes/ego4d_nlq.py +41 -0
- videomind/dataset/sub_classes/ego_timeqa.py +93 -0
- videomind/dataset/sub_classes/hirest.py +150 -0
- videomind/dataset/sub_classes/internvit_vtime.py +45 -0
- videomind/dataset/sub_classes/longvideobench.py +53 -0
- videomind/dataset/sub_classes/lvbench.py +52 -0
- videomind/dataset/sub_classes/mlvu.py +55 -0
- videomind/dataset/sub_classes/mvbench.py +74 -0
- videomind/dataset/sub_classes/nextgqa.py +87 -0
- videomind/dataset/sub_classes/nextqa.py +63 -0
- videomind/dataset/sub_classes/qa_ego4d.py +98 -0
- videomind/dataset/sub_classes/queryd.py +49 -0
- videomind/dataset/sub_classes/qvhighlights.py +78 -0
- videomind/dataset/sub_classes/rextime.py +81 -0
.gitattributes
CHANGED
@@ -33,3 +33,19 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/10309844035.mp4 filter=lfs diff=lfs merge=lfs -text
+data/13887487955.mp4 filter=lfs diff=lfs merge=lfs -text
+data/4167294363.mp4 filter=lfs diff=lfs merge=lfs -text
+data/4742652230.mp4 filter=lfs diff=lfs merge=lfs -text
+data/4766274786.mp4 filter=lfs diff=lfs merge=lfs -text
+data/5012237466.mp4 filter=lfs diff=lfs merge=lfs -text
+data/5188348585.mp4 filter=lfs diff=lfs merge=lfs -text
+data/9383140374.mp4 filter=lfs diff=lfs merge=lfs -text
+data/DTInxNfWXVc_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
+data/RoripwjYFp8_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
+data/UFWQKrcbhjI_360.0_510.0.mp4 filter=lfs diff=lfs merge=lfs -text
+data/Z3-IZ3HAmIA_60.0_210.0.mp4 filter=lfs diff=lfs merge=lfs -text
+data/h6QKDqomIPk_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
+data/pA6Z-qYhSNg_60.0_210.0.mp4 filter=lfs diff=lfs merge=lfs -text
+data/rrTIeJRVGjg_60.0_210.0.mp4 filter=lfs diff=lfs merge=lfs -text
+data/yId2wIocTys_210.0_360.0.mp4 filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,9 @@
+# Byte-compiled / optimized / DLL files
+__pycache__
+*.egg-info
+*.py[cod]
+*$py.class
+
+# Temporary data
+.DS_Store
+._*
app.py
ADDED
@@ -0,0 +1,640 @@
+# Copyright (c) 2024 Ye Liu. Licensed under the BSD-3-Clause license.
+
+import html
+import json
+import os
+import random
+import time
+from functools import partial
+from threading import Thread
+
+import gradio as gr
+import nncore
+import torch
+from huggingface_hub import snapshot_download
+from transformers import TextIteratorStreamer
+
+from videomind.constants import GROUNDER_PROMPT, PLANNER_PROMPT, VERIFIER_PROMPT
+from videomind.dataset.utils import process_vision_info
+from videomind.model.builder import build_model
+from videomind.utils.io import get_duration
+from videomind.utils.parser import parse_query, parse_span
+
+BASE_MODEL = 'model_zoo/Qwen2-VL-2B-Instruct'
+BASE_MODEL_HF = 'Qwen/Qwen2-VL-2B-Instruct'
+
+MODEL = 'model_zoo/VideoMind-2B'
+MODEL_HF = 'yeliudev/VideoMind-2B'
+
+TITLE = 'VideoMind: A Chain-of-LoRA Agent for Long Video Reasoning'
+
+TITLE_MD = f'<h1 align="center">💡 {TITLE}</h1>'
+DESCRIPTION_MD = """VideoMind is a multi-modal agent framework that enhances video reasoning by emulating *human-like* processes, such as *breaking down tasks*, *localizing and verifying moments*, and *synthesizing answers*. This approach addresses the unique challenges of temporal-grounded reasoning in a progressive strategy. Please find more details at our <a href="https://videomind.github.io/" target="_blank">Project Page</a>, <a href="https://arxiv.org/abs/2503.13444" target="_blank">Tech Report</a> and <a href="https://github.com/yeliudev/VideoMind" target="_blank">GitHub Repo</a>."""  # noqa
+
+# yapf:disable
+EXAMPLES = [
+    ('data/4167294363.mp4', 'Why did the old man stand up?', ['pla', 'gnd', 'ver', 'ans']),
+    ('data/5012237466.mp4', 'How does the child in stripes react about the fountain?', ['pla', 'gnd', 'ver', 'ans']),
+    ('data/13887487955.mp4', 'What did the excavator do after it pushed the cement forward?', ['pla', 'gnd', 'ver', 'ans']),
+    ('data/5188348585.mp4', 'What did the person do before pouring the liquor?', ['pla', 'gnd', 'ver', 'ans']),
+    ('data/4766274786.mp4', 'What did the girl do after the baby lost the balloon?', ['pla', 'gnd', 'ver', 'ans']),
+    ('data/4742652230.mp4', 'Why is the girl pushing the boy only around the toy but not to other places?', ['pla', 'gnd', 'ver', 'ans']),
+    ('data/9383140374.mp4', 'How does the girl in pink control the movement of the claw?', ['pla', 'gnd', 'ver', 'ans']),
+    ('data/10309844035.mp4', 'Why are they holding up the phones?', ['pla', 'gnd', 'ver', 'ans']),
+    ('data/pA6Z-qYhSNg_60.0_210.0.mp4', 'Different types of meat products are being cut, shaped and prepared', ['gnd', 'ver']),
+    ('data/UFWQKrcbhjI_360.0_510.0.mp4', 'A man talks to the camera whilst walking along a roadside in a rural area', ['gnd', 'ver']),
+    ('data/RoripwjYFp8_210.0_360.0.mp4', 'A woman wearing glasses eating something at a street market', ['gnd', 'ver']),
+    ('data/h6QKDqomIPk_210.0_360.0.mp4', 'A toddler sits in his car seat, holding his yellow tablet', ['gnd', 'ver']),
+    ('data/Z3-IZ3HAmIA_60.0_210.0.mp4', 'A view from the window as the plane accelerates and takes off from the runway', ['gnd', 'ver']),
+    ('data/yId2wIocTys_210.0_360.0.mp4', "Temporally locate the visual content mentioned in the text query 'kids exercise in front of parked cars' within the video.", ['pla', 'gnd', 'ver']),
+    ('data/rrTIeJRVGjg_60.0_210.0.mp4', "Localize the moment that provides relevant context about 'man stands in front of a white building monologuing'.", ['pla', 'gnd', 'ver']),
+    ('data/DTInxNfWXVc_210.0_360.0.mp4', "Find the video segment that corresponds to the given textual query 'man with headphones talking'.", ['pla', 'gnd', 'ver']),
+]
+# yapf:enable
+
+CSS = """button .box { text-align: left }"""
+
+JS = """
+function init() {
+    var info = document.getElementById('role').querySelectorAll('[class^="svelte"]')[1]
+    info.innerHTML = info.innerHTML.replace(/&lt;/g, '<').replace(/&gt;/g, '>')
+}
+"""
+
+
+class CustomStreamer(TextIteratorStreamer):
+
+    def put(self, value):
+        if len(value.shape) > 1 and value.shape[0] > 1:
+            raise ValueError('TextStreamer only supports batch size 1')
+        elif len(value.shape) > 1:
+            value = value[0]
+
+        if self.skip_prompt and self.next_tokens_are_prompt:
+            self.next_tokens_are_prompt = False
+            return
+
+        self.token_cache.extend(value.tolist())
+
+        # force skipping eos token
+        if self.token_cache[-1] == self.tokenizer.eos_token_id:
+            self.token_cache = self.token_cache[:-1]
+
+        text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs)
+
+        # cache decoded text for future use
+        self.text_cache = text
+
+        if text.endswith('\n'):
+            printable_text = text[self.print_len:]
+            self.token_cache = []
+            self.print_len = 0
+        elif len(text) > 0 and self._is_chinese_char(ord(text[-1])):
+            printable_text = text[self.print_len:]
+            self.print_len += len(printable_text)
+        else:
+            printable_text = text[self.print_len:text.rfind(' ') + 1]
+            self.print_len += len(printable_text)
+
+        self.on_finalized_text(printable_text)
+
+
+def seconds_to_hms(seconds):
+    hours, remainder = divmod(round(seconds), 3600)
+    minutes, seconds = divmod(remainder, 60)
+    return f'{hours:02}:{minutes:02}:{seconds:02}'
+
+
+def enable_btns():
+    return (gr.Button(interactive=True), ) * 3
+
+
+def disable_btns():
+    return (gr.Button(interactive=False), ) * 3
+
+
+def update_placeholder(role):
+    placeholder = 'Ask a question about the video...' if 'ans' in role else 'Write a query to search for a moment...'
+    return gr.Textbox(placeholder=placeholder)
+
+
+def main(video, prompt, role, temperature, max_new_tokens, model, processor, streamer, device):
+    history = []
+
+    if not video:
+        gr.Warning('Please upload a video or click [Random] to sample one.')
+        return history
+
+    if not prompt:
+        gr.Warning('Please provide a prompt or click [Random] to sample one.')
+        return history
+
+    if 'gnd' not in role and 'ans' not in role:
+        gr.Warning('Please at least select Grounder or Answerer.')
+        return history
+
+    if 'ver' in role and 'gnd' not in role:
+        gr.Warning('Verifier cannot be used without Grounder.')
+        return history
+
+    if 'pla' in role and any(k not in role for k in ('gnd', 'ver', 'ans')):
+        gr.Warning('Planner can only be used when all other roles are selected.')
+        return history
+
+    history.append({'role': 'user', 'content': prompt})
+    yield history
+
+    duration = get_duration(video)
+
+    # do grounding and answering by default
+    do_grounding = True
+    do_answering = True
+
+    # initialize grounding query as prompt
+    query = prompt
+
+    if 'pla' in role:
+        text = PLANNER_PROMPT.format(prompt)
+
+        history.append({
+            'metadata': {
+                'title': '🗺️ Working as Planner...'
+            },
+            'role': 'assistant',
+            'content': f'##### Planner Prompt:\n\n{html.escape(text)}\n\n##### Planner Response:\n\n...'
+        })
+        yield history
+
+        start_time = time.perf_counter()
+
+        messages = [{
+            'role':
+            'user',
+            'content': [{
+                'type': 'video',
+                'video': video,
+                'num_threads': 1,
+                'min_pixels': 36 * 28 * 28,
+                'max_pixels': 64 * 28 * 28,
+                'max_frames': 100,
+                'fps': 1.0
+            }, {
+                'type': 'text',
+                'text': text
+            }]
+        }]
+
+        text = processor.apply_chat_template(messages, add_generation_prompt=True)
+
+        images, videos = process_vision_info(messages)
+        data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
+        data = data.to(device)
+
+        model.base_model.disable_adapter_layers()
+        model.base_model.enable_adapter_layers()
+        model.set_adapter('planner')
+
+        generation_kwargs = dict(
+            **data,
+            streamer=streamer,
+            do_sample=temperature > 0,
+            temperature=temperature if temperature > 0 else None,
+            top_p=None,
+            top_k=None,
+            repetition_penalty=None,
+            max_new_tokens=max_new_tokens)
+
+        t = Thread(target=model.generate, kwargs=generation_kwargs)
+        t.start()
+
+        skipped = False
+        for i, text in enumerate(streamer):
+            if text and not skipped:
+                history[-1]['content'] = history[-1]['content'].rstrip('.')
+                skipped = True
+            history[-1]['content'] += text
+            yield history
+
+        elapsed_time = round(time.perf_counter() - start_time, 1)
+        history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
+        yield history
+
+        try:
+            parsed = json.loads(streamer.text_cache)
+            action = parsed[0] if isinstance(parsed, list) else parsed
+            if action['type'].lower() == 'grounder' and action['value']:
+                query = action['value']
+            elif action['type'].lower() == 'answerer':
+                do_grounding = False
+                do_answering = True
+        except Exception:
+            pass
+
+        response = 'After browsing the video and the question, my plan to figure out the answer is as follows:\n'
+        step_idx = 1
+        if 'gnd' in role and do_grounding:
+            response += f'\n{step_idx}. Localize the relevant moment in this video using the query "<span style="color:red">{query}</span>".'
+            step_idx += 1
+        if 'ver' in role and do_grounding:
+            response += f'\n{step_idx}. Verify the grounded moments one-by-one and select the best candidate.'
+            step_idx += 1
+        if 'ans' in role and do_answering:
+            if step_idx > 1:
+                response += f'\n{step_idx}. Crop the video segment and zoom-in to higher resolution.'
+            else:
+                response += f'\n{step_idx}. Analyze the whole video directly without cropping.'
+
+        history.append({'role': 'assistant', 'content': ''})
+        for i, text in enumerate(response.split(' ')):
+            history[-1]['content'] += ' ' + text if i > 0 else text
+            yield history
+
+    if 'gnd' in role and do_grounding:
+        query = parse_query(query)
+
+        text = GROUNDER_PROMPT.format(query)
+
+        history.append({
+            'metadata': {
+                'title': '🔍 Working as Grounder...'
+            },
+            'role': 'assistant',
+            'content': f'##### Grounder Prompt:\n\n{html.escape(text)}\n\n##### Grounder Response:\n\n...'
+        })
+        yield history
+
+        start_time = time.perf_counter()
+
+        messages = [{
+            'role':
+            'user',
+            'content': [{
+                'type': 'video',
+                'video': video,
+                'num_threads': 1,
+                'min_pixels': 36 * 28 * 28,
+                'max_pixels': 64 * 28 * 28,
+                'max_frames': 150,
+                'fps': 1.0
+            }, {
+                'type': 'text',
+                'text': text
+            }]
+        }]
+
+        text = processor.apply_chat_template(messages, add_generation_prompt=True)
+        images, videos = process_vision_info(messages)
+        data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
+        data = data.to(device)
+
+        model.base_model.disable_adapter_layers()
+        model.base_model.enable_adapter_layers()
+        model.set_adapter('grounder')
+
+        generation_kwargs = dict(
+            **data,
+            streamer=streamer,
+            do_sample=temperature > 0,
+            temperature=temperature if temperature > 0 else None,
+            top_p=None,
+            top_k=None,
+            repetition_penalty=None,
+            max_new_tokens=max_new_tokens)
+
+        t = Thread(target=model.generate, kwargs=generation_kwargs)
+        t.start()
+
+        skipped = False
+        for i, text in enumerate(streamer):
+            if text and not skipped:
+                history[-1]['content'] = history[-1]['content'].rstrip('.')
+                skipped = True
+            history[-1]['content'] += text
+            yield history
+
+        elapsed_time = round(time.perf_counter() - start_time, 1)
+        history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
+        yield history
+
+        if len(model.reg) > 0:
+            # 1. extract timestamps and confidences
+            blob = model.reg[0].cpu().float()
+            pred, conf = blob[:, :2] * duration, blob[:, -1].tolist()
+
+            # 2. clamp timestamps
+            pred = pred.clamp(min=0, max=duration)
+
+            # 3. sort timestamps
+            inds = (pred[:, 1] - pred[:, 0] < 0).nonzero()[:, 0]
+            pred[inds] = pred[inds].roll(1)
+
+            # 4. convert timestamps to list
+            pred = pred.tolist()
+        else:
+            if 'ver' in role:
+                pred = [[i * duration / 6, (i + 2) * duration / 6] for i in range(5)]
+                conf = [0] * 5
+            else:
+                pred = [[0, duration]]
+                conf = [0]
+
+        response = 'The candidate moments and confidence scores are as follows:\n'
+        response += '\n| ID | Start Time | End Time | Confidence |'
+        response += '\n| :-: | :-: | :-: | :-: |'
+
+        # using top-5 predictions
+        for i, (p, c) in enumerate(zip(pred[:5], conf[:5])):
+            response += f'\n| {i} | {seconds_to_hms(p[0])} | {seconds_to_hms(p[1])} | {c:.2f} |'
+
+        response += f'\n\nTherefore, the target moment might happen from <span style="color:red">{seconds_to_hms(pred[0][0])}</span> to <span style="color:red">{seconds_to_hms(pred[0][1])}</span>.'
+
+        history.append({'role': 'assistant', 'content': ''})
+        for i, text in enumerate(response.split(' ')):
+            history[-1]['content'] += ' ' + text if i > 0 else text
+            yield history
+
+    if 'ver' in role and do_grounding:
+        text = VERIFIER_PROMPT.format(query)
+
+        history.append({
+            'metadata': {
+                'title': '📊 Working as Verifier...'
+            },
+            'role': 'assistant',
+            'content': f'##### Verifier Prompt:\n\n{html.escape(text)}\n\n##### Verifier Response:\n\n...'
+        })
+        yield history
+
+        start_time = time.perf_counter()
+
+        # using top-5 predictions
+        prob = []
+        for i, cand in enumerate(pred[:5]):
+            s0, e0 = parse_span(cand, duration, 2)
+            offset = (e0 - s0) / 2
+            s1, e1 = parse_span([s0 - offset, e0 + offset], duration)
+
+            # percentage of s0, e0 within s1, e1
+            s = (s0 - s1) / (e1 - s1)
+            e = (e0 - s1) / (e1 - s1)
+
+            messages = [{
+                'role':
+                'user',
+                'content': [{
+                    'type': 'video',
+                    'video': video,
+                    'num_threads': 1,
+                    'video_start': s1,
+                    'video_end': e1,
+                    'min_pixels': 36 * 28 * 28,
+                    'max_pixels': 64 * 28 * 28,
+                    'max_frames': 64,
+                    'fps': 2.0
+                }, {
+                    'type': 'text',
+                    'text': text
+                }]
+            }]
+
+            text = processor.apply_chat_template(messages, add_generation_prompt=True)
+            images, videos = process_vision_info(messages)
+            data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
+
+            # ===== insert segment start/end tokens =====
+            video_grid_thw = data['video_grid_thw'][0]
+            num_frames, window = int(video_grid_thw[0]), int(video_grid_thw[1] * video_grid_thw[2] / 4)
+            assert num_frames * window * 4 == data['pixel_values_videos'].size(0)
+
+            pos_s, pos_e = round(s * num_frames), round(e * num_frames)
+            pos_s, pos_e = min(max(0, pos_s), num_frames), min(max(0, pos_e), num_frames)
+            assert pos_s <= pos_e, (num_frames, s, e)
+
+            base_idx = torch.nonzero(data['input_ids'][0] == model.config.vision_start_token_id).item()
+            pos_s, pos_e = pos_s * window + base_idx + 1, pos_e * window + base_idx + 2
+
+            input_ids = data['input_ids'][0].tolist()
+            input_ids.insert(pos_s, model.config.seg_s_token_id)
+            input_ids.insert(pos_e, model.config.seg_e_token_id)
+            data['input_ids'] = torch.LongTensor([input_ids])
+            data['attention_mask'] = torch.ones_like(data['input_ids'])
+            # ===========================================
+
+            data = data.to(device)
+
+            model.base_model.disable_adapter_layers()
+            model.base_model.enable_adapter_layers()
+            model.set_adapter('verifier')
+
+            with torch.inference_mode():
+                logits = model(**data).logits[0, -1].softmax(dim=-1)
+
+            # NOTE: magic numbers here
+            # In Qwen2-VL vocab: 9454 -> Yes, 2753 -> No
+            score = (logits[9454] - logits[2753]).sigmoid().item()
+            prob.append(score)
+
+            if i == 0:
+                history[-1]['content'] = history[-1]['content'].rstrip('.')[:-1]
+
+            response = f'\nCandidate ID {i}: P(Yes) = {score:.2f}'
+            for j, text in enumerate(response.split(' ')):
+                history[-1]['content'] += ' ' + text if j > 0 else text
+                yield history
+
+        elapsed_time = round(time.perf_counter() - start_time, 1)
+        history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
+        yield history
+
+        ranks = torch.Tensor(prob).argsort(descending=True).tolist()
+
+        prob = [prob[idx] for idx in ranks]
+        pred = [pred[idx] for idx in ranks]
+        conf = [conf[idx] for idx in ranks]
+
+        response = 'After verification, the candidate moments are re-ranked as follows:\n'
+        response += '\n| ID | Start Time | End Time | Score |'
+        response += '\n| :-: | :-: | :-: | :-: |'
+
+        ids = list(range(len(ranks)))
+        for r, p, c in zip(ranks, pred, prob):
+            response += f'\n| {ids[r]} | {seconds_to_hms(p[0])} | {seconds_to_hms(p[1])} | {c:.2f} |'
+
+        response += f'\n\nTherefore, the target moment should be from <span style="color:red">{seconds_to_hms(pred[0][0])}</span> to <span style="color:red">{seconds_to_hms(pred[0][1])}</span>.'
+
+        history.append({'role': 'assistant', 'content': ''})
+        for i, text in enumerate(response.split(' ')):
+            history[-1]['content'] += ' ' + text if i > 0 else text
+            yield history
+
+    if 'ans' in role and do_answering:
+        text = f'{prompt} Please think step by step and provide your response.'
+
+        history.append({
+            'metadata': {
+                'title': '📝 Working as Answerer...'
+            },
+            'role': 'assistant',
+            'content': f'##### Answerer Prompt:\n\n{html.escape(text)}\n\n##### Answerer Response:\n\n...'
+        })
+        yield history
+
+        start_time = time.perf_counter()
+
+        # choose the potential best moment
+        selected = pred[0] if 'gnd' in role and do_grounding else [0, duration]
+        s, e = parse_span(selected, duration, 32)
+
+        messages = [{
+            'role':
+            'user',
+            'content': [{
+                'type': 'video',
+                'video': video,
+                'num_threads': 1,
+                'video_start': s,
+                'video_end': e,
+                'min_pixels': 128 * 28 * 28,
+                'max_pixels': 256 * 28 * 28,
+                'max_frames': 32,
+                'fps': 2.0
+            }, {
+                'type': 'text',
+                'text': text
+            }]
+        }]
+
+        text = processor.apply_chat_template(messages, add_generation_prompt=True)
+        images, videos = process_vision_info(messages)
+        data = processor(text=[text], images=images, videos=videos, return_tensors='pt')
+        data = data.to(device)
+
+        with model.disable_adapter():
+            generation_kwargs = dict(
+                **data,
+                streamer=streamer,
+                do_sample=temperature > 0,
+                temperature=temperature if temperature > 0 else None,
+                top_p=None,
+                top_k=None,
+                repetition_penalty=None,
+                max_new_tokens=max_new_tokens)
+
+            t = Thread(target=model.generate, kwargs=generation_kwargs)
+            t.start()
+
+            skipped = False
+            for i, text in enumerate(streamer):
+                if text and not skipped:
+                    history[-1]['content'] = history[-1]['content'].rstrip('.')
+                    skipped = True
+                history[-1]['content'] += text
+                yield history
+
+        elapsed_time = round(time.perf_counter() - start_time, 1)
+        history[-1]['metadata']['title'] += f' ({elapsed_time} seconds)'
+        yield history
+
+        if 'gnd' in role and do_grounding:
+            response = f'After zooming in and analyzing the target moment, I finalize my answer: <span style="color:green">{streamer.text_cache}</span>'
+        else:
+            response = f'After watching the whole video, my answer is: <span style="color:green">{streamer.text_cache}</span>'
+
+        history.append({'role': 'assistant', 'content': ''})
+        for i, text in enumerate(response.split(' ')):
+            history[-1]['content'] += ' ' + text if i > 0 else text
+            yield history
+
+
+if __name__ == '__main__':
+    if not nncore.is_dir(BASE_MODEL):
+        snapshot_download(BASE_MODEL_HF, local_dir=BASE_MODEL)
+
+    if not nncore.is_dir(MODEL):
+        snapshot_download(MODEL_HF, local_dir=MODEL)
+
+    print('Initializing role *grounder*')
+    model, processor = build_model(MODEL)
+
+    print('Initializing role *planner*')
+    model.load_adapter(nncore.join(MODEL, 'planner'), adapter_name='planner')
+
+    print('Initializing role *verifier*')
+    model.load_adapter(nncore.join(MODEL, 'verifier'), adapter_name='verifier')
+
+    streamer = CustomStreamer(processor.tokenizer, skip_prompt=True)
+
+    device = next(model.parameters()).device
+
+    main = partial(main, model=model, processor=processor, streamer=streamer, device=device)
+
+    path = os.path.dirname(os.path.realpath(__file__))
+
+    chat = gr.Chatbot(
+        type='messages',
+        height='70vh',
+        avatar_images=[f'{path}/assets/user.png', f'{path}/assets/bot.png'],
+        placeholder='A conversation with VideoMind',
+        label='VideoMind')
+
+    prompt = gr.Textbox(label='Text Prompt', placeholder='Ask a question about the video...')
+
+    with gr.Blocks(title=TITLE, css=CSS, js=JS) as demo:
+        gr.Markdown(TITLE_MD)
+        gr.Markdown(DESCRIPTION_MD)
+
+        with gr.Row():
+            with gr.Column(scale=3):
+                video = gr.Video()
+
+                with gr.Group():
+                    role = gr.CheckboxGroup(
+                        choices=[('🗺️ Planner', 'pla'), ('🔍 Grounder', 'gnd'), ('📊 Verifier', 'ver'),
+                                 ('📝 Answerer', 'ans')],
+                        value=['pla', 'gnd', 'ver', 'ans'],
+                        interactive=True,
+                        elem_id='role',
+                        label='Role(s) To Use',
+                        info='[Auto Planning]: Planner + Grounder + Verifier + Answerer<br>'
+                        '[Grounded Video Question-Answering]: Grounder + Verifier + Answerer<br>'
+                        '[Video Temporal Grounding]: Grounder + Verifier<br>'
+                        '[Direct Video Question-Answering]: Answerer<br>')
+                    role.change(update_placeholder, role, prompt)
+
+                with gr.Accordion(label='Hyperparameters', open=False):
+                    temperature = gr.Slider(
+                        0,
+                        1,
+                        value=0,
+                        step=0.1,
+                        interactive=True,
+                        label='Temperature',
+                        info='Higher value leads to more creativity and randomness (Default: 0)')
+                    max_new_tokens = gr.Slider(
+                        1,
+                        1024,
+                        value=256,
+                        interactive=True,
+                        label='Max Output Tokens',
+                        info='The maximum number of output tokens for each role (Default: 256)')
+
+                prompt.render()
+
+                with gr.Row():
+                    random_btn = gr.Button(value='🔮 Random')
+                    random_btn.click(lambda: random.choice(EXAMPLES), None, [video, prompt, role])
+
+                    reset_btn = gr.ClearButton([video, prompt, chat], value='🗑️ Reset')
+                    reset_btn.click(lambda: (['pla', 'gnd', 'ver', 'ans'], 0, 256), None,
+                                    [role, temperature, max_new_tokens])
+
+                    submit_btn = gr.Button(value='🚀 Submit', variant='primary')
+                    submit_ctx = submit_btn.click(disable_btns, None, [random_btn, reset_btn, submit_btn])
+                    submit_ctx = submit_ctx.then(main, [video, prompt, role, temperature, max_new_tokens], chat)
+                    submit_ctx.then(enable_btns, None, [random_btn, reset_btn, submit_btn])
+
+            with gr.Column(scale=5):
+                chat.render()
+
+    demo.queue()
+    demo.launch(server_name='0.0.0.0')
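The verifier role above scores each candidate moment by comparing the logits of two hard-coded token ids (9454 for 'Yes' and 2753 for 'No', per the in-code comment). Below is a minimal sanity-check sketch, not part of this commit, assuming `model` and `processor` come from `build_model` exactly as in the `__main__` block:

    from videomind.model.builder import build_model

    model, processor = build_model('model_zoo/VideoMind-2B')

    # Look up the ids that app.py hard-codes. convert_tokens_to_ids falls back to
    # the unk id if 'Yes'/'No' are not single tokens in this vocabulary, in which
    # case the ids would have to be re-derived before trusting the magic numbers.
    yes_id = processor.tokenizer.convert_tokens_to_ids('Yes')
    no_id = processor.tokenizer.convert_tokens_to_ids('No')
    print(yes_id, no_id)  # expected to match 9454 and 2753 for the Qwen2-VL vocab

After installing requirements.txt, launching the demo with `python app.py` should be enough, since the `__main__` block downloads both checkpoints and calls `demo.launch(server_name='0.0.0.0')`; the exact launch command is an assumption rather than something stated in this commit.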
assets/bot.png
ADDED

assets/user.png
ADDED

data/10309844035.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8996ff134787d6b769c2491b9079a02c05953465ad770f07a8d9138e2668d24f
+size 4041678

data/13887487955.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5fecab1076ee42b3804718f9f64bef06cbfafd6995ad5f5ee42ba6354721429
+size 5544739

data/4167294363.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d0e0a4a381836f68e16a816d87f241fed3e31ea321f544b921743d6c1c50666
+size 6611151

data/4742652230.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8733ab4b0716d13ea7a79fc4ddacaf9eede567db364f0ecddfa4582c2f237f82
+size 2200304

data/4766274786.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:afa38a9ce9e89f934293214d79755c89159664223b3ca366813fd5fe524ed013
+size 3395545

data/5012237466.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd1929aa93d037f809f402e9801047125dc9fe8060301e69ded9ba1f2d785cc8
+size 4822293

data/5188348585.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b225f448a546ba2f65958f18c6731a6dde9b1f437014e90036b22eb40e9ad0a5
+size 5051675

data/9383140374.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:30b6b3eb43f711bef194150d473a59850ff5d7fec0f5cc30e7526aa9e382303f
+size 2518081

data/DTInxNfWXVc_210.0_360.0.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a09eee0dc404688731fb768c120d3519605f2343376b9bd727a71b91379fd9a9
+size 4999970

data/RoripwjYFp8_210.0_360.0.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b39b15158dc20c0bc6f1758a9239c8f3eed20ba4a90953338eec2246fa8f1f0
+size 9287252

data/UFWQKrcbhjI_360.0_510.0.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8669153d9ffac4b5534c20fab8d795347f5babe588da9b8330e049d623ebb443
+size 14510618

data/Z3-IZ3HAmIA_60.0_210.0.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b3a342993ee61efc5f3b859cd9c1e0d360b3331eed9deb8466891e4bcacc554
+size 14397799

data/h6QKDqomIPk_210.0_360.0.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:103820de2b8a1a3935b39ed80d91cd08e546e5617310b3d1bb3dadb06b2ffb95
+size 13485144

data/pA6Z-qYhSNg_60.0_210.0.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c84660fd4ebd8c23a2a7364174b1e819fec8b0e1cb8b9d9cd86f9e429cbdf66c
+size 8658509

data/rrTIeJRVGjg_60.0_210.0.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efe6f48a49963bd4880ef5065840e05dd25e2aa975870140bcdaf4220bbd2827
+size 11410412

data/yId2wIocTys_210.0_360.0.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:447fcb1fd1f94ed6a88d56dd0f6f859646cb8c58ed8e3b7a82f374e2cfee1646
+size 14769130
requirements.txt
ADDED
@@ -0,0 +1,26 @@
+accelerate==1.2.1
+decord==0.6.0
+gradio==4.44.1
+pandas==2.2.3
+peft==0.14.0
+pysrt==1.1.2
+scikit-image==0.25.0
+scikit-learn==1.6.1
+sentencepiece==0.2.0
+termplotlib==0.3.9
+triton==3.0.0
+
+# our codebase contains necessary patches for 4.45.2
+transformers==4.45.2
+
+# https://github.com/microsoft/DeepSpeed/issues/6793
+deepspeed==0.15.4
+
+# https://github.com/pytorch/pytorch/issues/138386
+torch==2.4.1
+torchvision==0.19.1
+
+# torch-npu only supports torch 2.4.0
+# torch==2.4.0+cpu
+# torch-npu==2.4.0.post2
+# torchvision==0.19.0+cpu
setup.cfg
ADDED
@@ -0,0 +1,16 @@
+[yapf]
+column_limit = 120
+based_on_style = pep8
+blank_line_before_nested_class_or_def = true
+split_before_expression_after_opening_paren = true
+
+[isort]
+line_length = 120
+multi_line_output = 0
+known_third_party = decord,deepspeed,gradio,huggingface_hub,nncore,numpy,pandas,peft,PIL,pysrt,safetensors,tabulate,termplotlib,torch,torchvision,transformers
+no_lines_before = STDLIB,LOCALFOLDER
+default_section = FIRSTPARTY
+
+[flake8]
+max-line-length = 500
+extend-ignore = E741
videomind/constants.py
ADDED
@@ -0,0 +1,42 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+IGNORE_INDEX = -100
+
+REG_TOKEN = '<|reg|>'
+
+SEG_S_TOKEN = '<|seg_start|>'
+SEG_E_TOKEN = '<|seg_end|>'
+
+PLANNER_PROMPT = (
+    'You are acting as the planner now. '
+    'Given a question about the video, your task is to analyze the question and identify the best way to answer this question. '
+    'You have access to the following tools:\n\n'
+    'Grounder: Accepts a text query and localize the relevant video segment according to the query.\n'
+    'Verifier: A tool supporting grounder by verifying the reliability of its outputs.\n'
+    'Answerer: Answer a given question directly based on the whole video or a cropped video segment.\n\n'
+    'Your response must be a list in JSON format. '
+    'A valid plan for reasoning could be "grounder, verifier, answer", "grounder, verifier", or "answerer", depending on the given question. '
+    'Please see an example for the format below.\n\n'
+    '[{{"type": "grounder", "value": "<text query>"}}, {{"type": "verifier"}}, {{"type": "answerer"}}]\n\n'
+    'Note that only the grounder can accept an argument called "value", which is the text query used for grounding. '
+    "Now I give you the question: '{}'. "
+    'Please think carefully and respond with your plan in JSON directly.')
+
+GROUNDER_PROMPT = (
+    'You are acting as the grounder now. '
+    'Given a video and a text query, your goal is to temporally localize the video moment described by the query. '
+    'If the query is directly describing a moment, simply localize it according to its content. '
+    "Otherwise, if the moment is described as 'before/after a pivotal event', you need to determine the actual event it refers to. "
+    'The localized moment should only cover the target event. '
+    "Now I give you the query: '{}'. "
+    'Please think carefully and provide your response.')
+
+VERIFIER_PROMPT = (
+    'You are acting as the verifier now. '
+    'You will be presented a text query describing a moment that potentially happens in the given video. '
+    f'Your task is to identify whether the video segment between {SEG_S_TOKEN} and {SEG_E_TOKEN} perfectly covers the moment. '
+    f'If the described moment can be seen in the video, please focus on verifying whether the moment starts at {SEG_S_TOKEN} and ends at {SEG_E_TOKEN}. '
+    "Respond with 'Yes' if you think the moment boundaries are correct, otherwise 'No'. "
+    "If the described moment cannot be seen in the video, respond with 'No' directly. "
+    "Now I give you the query: '{}'. "
+    "Please think carefully and respond with 'Yes' or 'No' directly.")
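These templates are plain Python format strings: the doubled braces in PLANNER_PROMPT escape the literal JSON braces, and the single `{}` placeholder receives the question or query. A short illustration, mirroring the `.format()` calls made in app.py (the question comes from the demo's EXAMPLES list; the grounding query is a hypothetical placeholder):

    from videomind.constants import GROUNDER_PROMPT, PLANNER_PROMPT, VERIFIER_PROMPT

    question = 'Why did the old man stand up?'   # from the demo EXAMPLES list
    query = 'the old man stands up'              # hypothetical grounding query

    print(PLANNER_PROMPT.format(question))   # asks the planner for a JSON plan
    print(GROUNDER_PROMPT.format(query))     # asks the grounder to localize the moment
    print(VERIFIER_PROMPT.format(query))     # asks the verifier for a Yes/No judgement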
videomind/conversation.py
ADDED
@@ -0,0 +1,49 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+from dataclasses import dataclass
+from typing import List
+
+
+@dataclass
+class Conversation:
+    style: str
+    system: str
+    roles: List[str]
+    seps: List[str]
+    messages: List[str]
+
+    def append_message(self, role, msg):
+        self.messages.append([role, msg])
+
+    def clear(self):
+        self.messages = []
+
+    def get_prompt(self):
+        assert self.style in ('chatml', )
+
+        prompt = self.system + self.seps[0] if self.system is not None else ''
+
+        for i, (role, msg) in enumerate(self.messages):
+            prompt += role
+            sep = self.seps[i % 2]
+            if msg is not None:
+                prompt += msg
+                if not prompt.endswith(sep):
+                    prompt += sep
+
+        prompt = prompt.lstrip('\n')
+        return prompt
+
+
+def get_conv(conv_type):
+    if conv_type == 'chatml':
+        conv = Conversation(
+            style='chatml',
+            system='<|im_start|>system\nYou are a helpful assistant.',
+            roles=('\n<|im_start|>user\n', '\n<|im_start|>assistant\n'),
+            seps=('<|im_end|>', '<|im_end|>'),
+            messages=[])
+    else:
+        raise ValueError(f'unknown conversation type: {conv_type}')
+
+    return conv
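A usage sketch, not part of the commit, showing how the ChatML template composes a prompt; the trailing separator behavior depends on get_prompt, so the output shown in the comments is indicative only:

    from videomind.conversation import get_conv

    conv = get_conv('chatml')
    conv.append_message(conv.roles[0], 'How long is the video?')  # user turn
    conv.append_message(conv.roles[1], None)                      # assistant turn left open for generation
    print(conv.get_prompt())
    # <|im_start|>system
    # You are a helpful assistant.<|im_end|>
    # <|im_start|>user
    # How long is the video?<|im_end|>
    # <|im_start|>assistant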
videomind/dataset/__init__.py
ADDED
@@ -0,0 +1,61 @@
+from .collator import HybridDataCollator
+from .hybrid import HybridDataset
+from .sub_classes import (ActivitynetCaptionsBiasDataset, ActivitynetCaptionsDataset, ActivitynetRTLDataset,
+                          CGBenchDataset, CharadesSTADataset, CosMoCapDataset, DiDeMoDataset, Ego4DNaQDataset,
+                          Ego4DNLQDataset, EgoTimeQACropDataset, EgoTimeQADataset, EgoTimeQAGroundingDataset,
+                          HiRESTGroundingDataset, HiRESTStepBiasDataset, HiRESTStepDataset, InternVidVTimeDataset,
+                          LongVideoBenchDataset, LVBenchDataset, MLVUDataset, MVBenchDataset, NExTGQACropDataset,
+                          NExTGQADataset, NExTGQAGroundingDataset, NExTQADataset, QAEgo4DCropDataset, QAEgo4DDataset,
+                          QAEgo4DGroundingDataset, QuerYDDataset, QVHighlightsDataset, ReXTimeCropDataset,
+                          ReXTimeDataset, ReXTimeGroundingDataset, STARDataset, TACoSDataset, VideoMMEDataset,
+                          VideoXumDataset, VidMorpDataset, YouCook2BiasDataset, YouCook2Dataset)
+from .wrappers import AnsweringCropDataset, AnsweringDataset, GroundingDataset, PlanningDataset, VerifyingDataset
+
+__all__ = [
+    'HybridDataCollator',
+    'HybridDataset',
+    'ActivitynetCaptionsBiasDataset',
+    'ActivitynetCaptionsDataset',
+    'ActivitynetRTLDataset',
+    'CGBenchDataset',
+    'CharadesSTADataset',
+    'CosMoCapDataset',
+    'DiDeMoDataset',
+    'Ego4DNaQDataset',
+    'Ego4DNLQDataset',
+    'EgoTimeQACropDataset',
+    'EgoTimeQADataset',
+    'EgoTimeQAGroundingDataset',
+    'HiRESTGroundingDataset',
+    'HiRESTStepBiasDataset',
+    'HiRESTStepDataset',
+    'InternVidVTimeDataset',
+    'LongVideoBenchDataset',
+    'LVBenchDataset',
+    'MLVUDataset',
+    'MVBenchDataset',
+    'NExTGQACropDataset',
+    'NExTGQADataset',
+    'NExTGQAGroundingDataset',
+    'NExTQADataset',
+    'QAEgo4DCropDataset',
+    'QAEgo4DDataset',
+    'QAEgo4DGroundingDataset',
+    'QuerYDDataset',
+    'QVHighlightsDataset',
+    'ReXTimeCropDataset',
+    'ReXTimeDataset',
+    'ReXTimeGroundingDataset',
+    'STARDataset',
+    'TACoSDataset',
+    'VideoMMEDataset',
+    'VideoXumDataset',
+    'VidMorpDataset',
+    'YouCook2BiasDataset',
+    'YouCook2Dataset',
+    'AnsweringCropDataset',
+    'AnsweringDataset',
+    'GroundingDataset',
+    'PlanningDataset',
+    'VerifyingDataset',
+]
videomind/dataset/collator.py
ADDED
@@ -0,0 +1,40 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import warnings
+
+import torch
+from torch.nn.utils.rnn import pad_sequence
+
+from videomind.constants import IGNORE_INDEX
+
+
+class HybridDataCollator(object):
+
+    def __init__(self, tokenizer):
+        self.tokenizer = tokenizer
+
+    def __call__(self, batch):
+        input_ids = [d['input_ids'] for d in batch]
+        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
+
+        labels = [d['labels'] for d in batch]
+        labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
+
+        assert input_ids.size() == labels.size()
+
+        seq_len, max_len = input_ids.size(1), self.tokenizer.model_max_length
+        if seq_len > max_len:
+            warnings.warn(f'The length of input sequence is exceeding model max length: {seq_len} > 121,000')
+            input_ids, labels = input_ids[:, :max_len], labels[:, :max_len]
+
+        data = dict(input_ids=input_ids, labels=labels, attention_mask=input_ids != self.tokenizer.pad_token_id)
+
+        for key in ('pixel_values', 'pixel_values_videos', 'image_grid_thw', 'video_grid_thw'):
+            if key in batch[0]:
+                data[key] = torch.cat([d[key] for d in batch])
+
+        for key in ('timestamps', 'saliency', 'pos_clip'):
+            if key in batch[0]:
+                data[key] = [d[key] for d in batch]
+
+        return data
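A toy sketch, not part of the commit, of what the collator produces: input_ids are right-padded with the tokenizer's pad id, labels with IGNORE_INDEX, and the attention mask is derived from the padding. Any tokenizer with a pad token works; the Qwen2-VL one is used here only because app.py references it:

    import torch
    from transformers import AutoTokenizer

    from videomind.constants import IGNORE_INDEX
    from videomind.dataset.collator import HybridDataCollator

    tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2-VL-2B-Instruct')
    collator = HybridDataCollator(tokenizer)

    # two toy samples of unequal length; ids are arbitrary placeholders
    batch = [
        {'input_ids': torch.tensor([1, 2, 3]), 'labels': torch.tensor([IGNORE_INDEX, 2, 3])},
        {'input_ids': torch.tensor([4, 5]), 'labels': torch.tensor([4, 5])},
    ]
    out = collator(batch)
    print(out['input_ids'].shape)    # torch.Size([2, 3]) after padding
    print(out['attention_mask'][1])  # tensor([ True,  True, False])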
videomind/dataset/hybrid.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
|
| 2 |
+
|
| 3 |
+
import math
|
| 4 |
+
import random
|
| 5 |
+
from collections import defaultdict
|
| 6 |
+
from itertools import accumulate
|
| 7 |
+
|
| 8 |
+
import nncore
|
| 9 |
+
import numpy as np
|
| 10 |
+
import termplotlib as tpl
|
| 11 |
+
import torch
|
| 12 |
+
from tabulate import tabulate
|
| 13 |
+
from torch.utils.data import Dataset
|
| 14 |
+
|
| 15 |
+
from videomind.constants import IGNORE_INDEX
|
| 16 |
+
from videomind.dataset.utils import preprocess, process_vision_info
|
| 17 |
+
from videomind.utils.parser import parse_span
|
| 18 |
+
|
| 19 |
+
DATASETS = nncore.Registry('datasets')
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class HybridDataset(Dataset):

    def __init__(self, processor, model_config, model_args, data_args, training_args):
        super().__init__()

        datasets = []
        for key in data_args.datasets.split(','):
            datasets.append(DATASETS.get(key)(processor, model_args, data_args, training_args))

        data_types = [a['data_type'] for d in datasets for a in d.annos]

        cum_length = [0] + list(accumulate([len(d) for d in datasets]))
        idx_ranges = [[cum_length[i], cum_length[i + 1]] for i in range(len(cum_length) - 1)]

        if training_args.local_rank in (0, -1):
            raw_length = sum(d.raw_length for d in datasets)
            cur_length = idx_ranges[-1][-1]

            ratio = round(cur_length / raw_length * 100, 2)
            print(f'Number of samples: {raw_length} (original) -> {cur_length} (filtered) {ratio}%')

            data_type_cnt = ' '.join([f'{data_types.count(t)} ({t})' for t in list(set(data_types))])
            print(f'Data types: {data_type_cnt}')

            tab = defaultdict(int)
            for dataset in datasets:
                for anno in dataset.annos:
                    tab[anno.get('source', 'unknown')] += 1

            tab = [[k, v, round(v / cur_length, 3)] for k, v in tab.items()]
            print(tabulate(tab, headers=['Source', '#Samples', 'Ratio'], tablefmt='pretty', stralign='left'))

            d, _ = torch.Tensor([a['duration'] for d in datasets for a in d.annos if 'duration' in a]).sort()
            if d.size(0) > 0:
                n, r = min(d.size(0), 10), d.flip(0)
                print(f'Top-{n} max video durations: {[round(r[i].item(), 1) for i in range(n)]}')
                print(f'Top-{n} min video durations: {[round(d[i].item(), 1) for i in range(n)]}')
                print(f'Average video duration ({d.size(0)} samples): {round(d.mean().item(), 1)}s')

                print('Video duration histogram:')
                counts, edges = np.histogram(d)
                labels = [f'{edges[i]:.2f}s - {edges[i + 1]:.2f}s' for i in range(len(edges) - 1)]
                fig = tpl.figure()
                fig.barh(counts, labels)
                fig.show()

            d, _ = torch.Tensor([abs(b[0] - b[1]) for d in datasets for a in d.annos if 'span' in a
                                 for b in a['span']]).sort()
            if d.size(0) > 0:
                n, r = min(d.size(0), 10), d.flip(0)
                print(f'Top-{n} max span durations: {[round(r[i].item(), 1) for i in range(n)]}')
                print(f'Top-{n} min span durations: {[round(d[i].item(), 1) for i in range(n)]}')
                print(f'Average span duration ({d.size(0)} samples): {round(d.mean().item(), 1)}s')

                print('Span duration histogram:')
                counts, edges = np.histogram(d)
                labels = [f'{edges[i]:.2f}s - {edges[i + 1]:.2f}s' for i in range(len(edges) - 1)]
                fig = tpl.figure()
                fig.barh(counts, labels)
                fig.show()

        self.datasets = datasets
        self.data_types = data_types
        self.idx_ranges = idx_ranges
        self.processor = processor
        self.model_config = model_config
        self.model_args = model_args
        self.data_args = data_args
        self.training_args = training_args

    def __len__(self):
        return self.idx_ranges[-1][-1]

    def __getitem__(self, idx):
        for retry in range(self.data_args.max_retries + 1):
            try:
                return self.fetch_data(idx)
            except Exception as e:
                print(f'Error in loading {idx}: {type(e).__name__}({e})')
                idx = random.choice([i for i, t in enumerate(self.data_types) if t == self.data_types[idx]])

        raise RuntimeError(f'Data loading failed after {retry} retries')

    def map(self, *args, **kwargs):
        return self

    def fetch_data(self, idx):
        for (s, e), dataset in zip(self.idx_ranges, self.datasets):
            if s <= idx < e:
                meta = dataset[idx - s]
                break

        text = self.processor.apply_chat_template(meta['messages'])
        text = [text.strip()]

        images, videos = process_vision_info(meta['messages'], sanity_check=True)

        data = self.processor(text=text, images=images, videos=videos, return_tensors='pt')
        assert data['input_ids'].size(0) == 1

        data['input_ids'] = data['input_ids'][0]
        data['labels'] = preprocess(data['input_ids'], text[0], self.processor.tokenizer, self.model_args.conv_type)

        # insert segment start/end tokens
        if 'ss' in meta and 'se' in meta:
            video_grid_thw = data['video_grid_thw'][0]
            num_frames, window = int(video_grid_thw[0]), int(video_grid_thw[1] * video_grid_thw[2] / 4)
            assert num_frames * window * 4 == data['pixel_values_videos'].size(0)

            pos_s, pos_e = round(meta['ss'] * num_frames), round(meta['se'] * num_frames)
            pos_s, pos_e = min(max(0, pos_s), num_frames), min(max(0, pos_e), num_frames)
            assert pos_s <= pos_e, (num_frames, meta['ss'], meta['se'])

            base_idx = torch.nonzero(data['input_ids'] == self.model_config.vision_start_token_id).item()
            pos_s, pos_e = pos_s * window + base_idx + 1, pos_e * window + base_idx + 2

            input_ids = data['input_ids'].tolist()
            input_ids.insert(pos_s, self.model_config.seg_s_token_id)
            input_ids.insert(pos_e, self.model_config.seg_e_token_id)
            data['input_ids'] = torch.LongTensor(input_ids)

            labels = data['labels'].tolist()
            labels.insert(pos_s, IGNORE_INDEX)
            labels.insert(pos_e, IGNORE_INDEX)
            data['labels'] = torch.LongTensor(labels)

        if 'span' in meta:
            span, duration = meta['span'], meta['duration']

            pixel_values_videos, video_grid_thw = data['pixel_values_videos'], data['video_grid_thw']
            num_frames = int(video_grid_thw[0][0])

            assert video_grid_thw.size(0) == 1
            assert video_grid_thw.prod() == pixel_values_videos.size(0)

            # actual fps would be 1/2 of config (temporal patch size = 2)
            fps = num_frames / duration

            safe_span = [parse_span(b, duration, 1 / fps) for b in span]

            # num_reg_tokens -> num_bnds -> s & e
            timestamps = [[[s / duration, e / duration] for s, e in safe_span]]

            saliency, pos_inds = torch.zeros(num_frames), []
            for s, e in safe_span:
                span_ind = max(0, s * fps), min(e * fps, num_frames)
                pos_inds = list(range(math.ceil(span_ind[0]), math.ceil(span_ind[1])))
                assert len(pos_inds) > 0, f'empty pos_inds ({idx}): {fps} {num_frames} {duration} {span}'
                saliency[pos_inds] = 1

            assert saliency.any(), f'empty saliency ({idx}): {pos_inds} {fps} {num_frames} {duration} {span}'
            pos_clip = random.sample(saliency.nonzero()[:, 0].tolist(), 1)
            pos_clip = torch.LongTensor(pos_clip)

            data['timestamps'] = timestamps
            data['saliency'] = saliency
            data['pos_clip'] = pos_clip

        return data
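Note: the global-to-local index routing in HybridDataset relies only on idx_ranges holding cumulative [start, end) offsets per sub-dataset. Below is a minimal standalone sketch of the same routing step; the toy lengths and the route() helper are illustrative assumptions, not part of the diff.

from itertools import accumulate

# hypothetical sub-dataset lengths
lengths = [3, 5, 2]
cum = [0] + list(accumulate(lengths))
idx_ranges = [[cum[i], cum[i + 1]] for i in range(len(cum) - 1)]

def route(idx):
    # mirrors HybridDataset.fetch_data: find the sub-dataset owning a global index
    for ds, (s, e) in enumerate(idx_ranges):
        if s <= idx < e:
            return ds, idx - s
    raise IndexError(idx)

assert route(0) == (0, 0)
assert route(4) == (1, 1)
assert route(9) == (2, 1)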
videomind/dataset/sub_classes/__init__.py
ADDED
@@ -0,0 +1,69 @@
from .activitynet_captions import ActivitynetCaptionsBiasDataset, ActivitynetCaptionsDataset
from .activitynet_rtl import ActivitynetRTLDataset
from .cgbench import CGBenchDataset
from .charades_sta import CharadesSTADataset
from .cosmo_cap import CosMoCapDataset
from .didemo import DiDeMoDataset
from .ego4d_naq import Ego4DNaQDataset
from .ego4d_nlq import Ego4DNLQDataset
from .ego_timeqa import EgoTimeQACropDataset, EgoTimeQADataset, EgoTimeQAGroundingDataset
from .hirest import HiRESTGroundingDataset, HiRESTStepBiasDataset, HiRESTStepDataset
from .internvit_vtime import InternVidVTimeDataset
from .longvideobench import LongVideoBenchDataset
from .lvbench import LVBenchDataset
from .mlvu import MLVUDataset
from .mvbench import MVBenchDataset
from .nextgqa import NExTGQACropDataset, NExTGQADataset, NExTGQAGroundingDataset
from .nextqa import NExTQADataset
from .qa_ego4d import QAEgo4DCropDataset, QAEgo4DDataset, QAEgo4DGroundingDataset
from .queryd import QuerYDDataset
from .qvhighlights import QVHighlightsDataset
from .rextime import ReXTimeCropDataset, ReXTimeDataset, ReXTimeGroundingDataset
from .star import STARDataset
from .tacos import TACoSDataset
from .vid_morp import VidMorpDataset
from .videomme import VideoMMEDataset
from .videoxum import VideoXumDataset
from .youcook2 import YouCook2BiasDataset, YouCook2Dataset

__all__ = [
    'ActivitynetCaptionsBiasDataset',
    'ActivitynetCaptionsDataset',
    'ActivitynetRTLDataset',
    'CGBenchDataset',
    'CharadesSTADataset',
    'CosMoCapDataset',
    'DiDeMoDataset',
    'Ego4DNaQDataset',
    'Ego4DNLQDataset',
    'EgoTimeQACropDataset',
    'EgoTimeQADataset',
    'EgoTimeQAGroundingDataset',
    'HiRESTGroundingDataset',
    'HiRESTStepBiasDataset',
    'HiRESTStepDataset',
    'InternVidVTimeDataset',
    'LongVideoBenchDataset',
    'LVBenchDataset',
    'MLVUDataset',
    'MVBenchDataset',
    'NExTGQACropDataset',
    'NExTGQADataset',
    'NExTGQAGroundingDataset',
    'NExTQADataset',
    'QAEgo4DCropDataset',
    'QAEgo4DDataset',
    'QAEgo4DGroundingDataset',
    'QuerYDDataset',
    'QVHighlightsDataset',
    'ReXTimeCropDataset',
    'ReXTimeDataset',
    'ReXTimeGroundingDataset',
    'STARDataset',
    'TACoSDataset',
    'VidMorpDataset',
    'VideoMMEDataset',
    'VideoXumDataset',
    'YouCook2BiasDataset',
    'YouCook2Dataset',
]
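Note: each module imported above registers its classes into the shared DATASETS registry from videomind/dataset/hybrid.py, which is how HybridDataset resolves the comma-separated dataset keys via DATASETS.get(key). A hedged sketch of what one more entry could look like, following the same pattern as the subclasses in this diff; the file name, class, and data paths below are illustrative assumptions, not part of the repository.

# hypothetical file: videomind/dataset/sub_classes/my_dataset.py
import nncore

from videomind.dataset.hybrid import DATASETS
from videomind.dataset.wrappers import GroundingDataset
from videomind.utils.parser import parse_query


@DATASETS.register(name='my_dataset')  # key used in data_args.datasets
class MyDataset(GroundingDataset):

    ANNO_PATH = 'data/my_dataset/annos.jsonl'  # hypothetical path
    VIDEO_ROOT = 'data/my_dataset/videos'      # hypothetical path

    UNIT = 1.0

    @classmethod
    def load_annos(self, split='train'):
        annos = []
        for raw_anno in nncore.load(self.ANNO_PATH):
            annos.append(
                dict(
                    source='my_dataset',
                    data_type='grounding',
                    video_path=nncore.join(self.VIDEO_ROOT, raw_anno['vid'] + '.mp4'),
                    duration=raw_anno['duration'],
                    query=parse_query(raw_anno['query']),
                    span=[raw_anno['span']]))
        return annos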
videomind/dataset/sub_classes/activitynet_captions.py
ADDED
@@ -0,0 +1,96 @@
# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.

from collections import OrderedDict

import nncore

from videomind.dataset.hybrid import DATASETS
from videomind.dataset.wrappers import GroundingDataset
from videomind.utils.parser import parse_query


@DATASETS.register(name='activitynet_captions')
class ActivitynetCaptionsDataset(GroundingDataset):

    ANNO_PATH_TRAIN = 'data/activitynet_captions/train.json'
    ANNO_PATH_VALID = 'data/activitynet_captions/val_1.json'
    ANNO_PATH_TEST = 'data/activitynet_captions/val_2.json'

    VIDEO_ROOT = 'data/activitynet/videos_3fps_480_noaudio'
    DURATIONS = 'data/activitynet/durations.json'

    UNIT = 0.01

    @classmethod
    def load_annos(self, split='train'):
        if split == 'train':
            raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
        elif split == 'valid':
            raw_annos = nncore.load(self.ANNO_PATH_VALID, object_pairs_hook=OrderedDict)
        else:
            raw_annos = nncore.load(self.ANNO_PATH_TEST, object_pairs_hook=OrderedDict)

        durations = nncore.load(self.DURATIONS)

        annos = []
        for vid, raw_anno in raw_annos.items():
            for query, span in zip(raw_anno['sentences'], raw_anno['timestamps']):
                anno = dict(
                    source='activitynet_captions',
                    data_type='grounding',
                    video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
                    duration=durations[vid],
                    query=parse_query(query),
                    span=[span])

                annos.append(anno)

        return annos


@DATASETS.register(name='activitynet_captions_bias')
class ActivitynetCaptionsBiasDataset(ActivitynetCaptionsDataset):

    @classmethod
    def load_annos(self, split='train'):
        if split == 'train':
            raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
        elif split == 'valid':
            raw_annos = nncore.load(self.ANNO_PATH_VALID, object_pairs_hook=OrderedDict)
        else:
            raw_annos = nncore.load(self.ANNO_PATH_TEST, object_pairs_hook=OrderedDict)

        durations = nncore.load(self.DURATIONS)

        annos = []
        for vid, raw_anno in raw_annos.items():
            assert len(raw_anno['sentences']) == len(raw_anno['timestamps'])

            for i in range(len(raw_anno['sentences']) - 1):
                span_a = raw_anno['timestamps'][i]
                span_b = raw_anno['timestamps'][i + 1]

                if span_b[0] - span_a[1] < 3:
                    query_a = parse_query(f"The moment before {raw_anno['sentences'][i + 1]}")
                    query_b = parse_query(f"The moment after {raw_anno['sentences'][i]}")

                    anno_a = dict(
                        source='activitynet_captions_bias',
                        data_type='grounding',
                        video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
                        duration=durations[vid],
                        query=query_a,
                        span=[span_a])

                    anno_b = dict(
                        source='activitynet_captions_bias',
                        data_type='grounding',
                        video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
                        duration=durations[vid],
                        query=query_b,
                        span=[span_b])

                    annos.append(anno_a)
                    annos.append(anno_b)

        return annos
videomind/dataset/sub_classes/activitynet_rtl.py
ADDED
@@ -0,0 +1,68 @@
# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.

import re
from collections import OrderedDict

import nncore

from videomind.dataset.hybrid import DATASETS
from videomind.dataset.wrappers import GroundingDataset
from videomind.utils.parser import parse_query


@DATASETS.register(name='activitynet_rtl')
class ActivitynetRTLDataset(GroundingDataset):

    ANNO_PATH_TRAIN = 'data/activitynet_rtl/activitynet_train_gpt-4-0613_temp_6_f10009.json'
    ANNO_PATH_TEST = 'data/activitynet_rtl/annot_val_1_q229.json'

    VIDEO_ROOT = 'data/activitynet/videos_3fps_480_noaudio'

    UNIT = 0.01

    @classmethod
    def load_annos(self, split='train'):
        if split == 'train':
            raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)

            annos = []
            for vid, raw_anno in raw_annos.items():
                for meta in raw_anno['QA']:
                    match = re.findall(r'<(\d+(\.\d+)?)>', meta['a'])
                    span = [float(m[0]) for m in match[:2]]

                    # some samples do not have timestamps
                    if len(span) != 2:
                        continue

                    anno = dict(
                        source='activitynet_rtl',
                        data_type='grounding',
                        video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
                        duration=raw_anno['duration'],
                        query=parse_query(meta['q']),
                        span=[span])

                    annos.append(anno)
        else:
            raw_annos = nncore.load(self.ANNO_PATH_TEST, object_pairs_hook=OrderedDict)

            annos = []
            for raw_anno in raw_annos:
                vid = f"v_{raw_anno['vid']}"

                match = re.findall(r'<(\d+(\.\d+)?)>', raw_anno['answer'])
                span = [float(m[0]) for m in match[:2]]
                assert len(span) == 2

                anno = dict(
                    source='activitynet_rtl',
                    data_type='grounding',
                    video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
                    duration=raw_anno['duration'],
                    query=parse_query(raw_anno['question']),
                    span=[span])

                annos.append(anno)

        return annos
videomind/dataset/sub_classes/cgbench.py
ADDED
@@ -0,0 +1,47 @@
# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.

import nncore
from torch.utils.data import Dataset

from videomind.dataset.hybrid import DATASETS
from videomind.utils.parser import parse_query, parse_question


@DATASETS.register(name='cgbench')
class CGBenchDataset(Dataset):

    ANNO_PATH_TEST = 'data/cgbench/cgbench_mini.json'

    VIDEO_ROOT = 'data/cgbench/videos_3fps_480_noaudio'
    SUBTITLE_ROOT = 'data/cgbench/subtitles'

    UNIT = 0.001

    @classmethod
    def load_annos(self, split='test'):
        assert split == 'test'

        raw_annos = nncore.load(self.ANNO_PATH_TEST)

        annos = []
        for raw_anno in raw_annos:
            vid = raw_anno['video_uid']

            anno = dict(
                source='cgbench',
                data_type='multimodal',
                video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
                subtitle_path=nncore.join(self.SUBTITLE_ROOT, vid + '.srt'),
                duration=raw_anno['duration'],
                query=parse_query(raw_anno['question']),
                question=parse_question(raw_anno['question']),
                options=[o.capitalize() for o in raw_anno['choices']],
                answer=raw_anno['answer'].capitalize(),
                ans=raw_anno['right_answer'],
                span=raw_anno['clue_intervals'],
                task=raw_anno['sub_category'],
                domain=raw_anno['domain'])

            annos.append(anno)

        return annos
videomind/dataset/sub_classes/charades_sta.py
ADDED
@@ -0,0 +1,45 @@
# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.

import nncore

from videomind.dataset.hybrid import DATASETS
from videomind.dataset.wrappers import GroundingDataset
from videomind.utils.parser import parse_query


@DATASETS.register(name='charades_sta')
class CharadesSTADataset(GroundingDataset):

    ANNO_PATH_TRAIN = 'data/charades_sta/charades_sta_train.txt'
    ANNO_PATH_TEST = 'data/charades_sta/charades_sta_test.txt'

    VIDEO_ROOT = 'data/charades_sta/videos_3fps_480_noaudio'
    DURATIONS = 'data/charades_sta/durations.json'

    UNIT = 0.1

    @classmethod
    def load_annos(self, split='train'):
        if split == 'train':
            raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
        else:
            raw_annos = nncore.load(self.ANNO_PATH_TEST)

        durations = nncore.load(self.DURATIONS)

        annos = []
        for raw_anno in raw_annos:
            info, query = raw_anno.split('##')
            vid, s, e = info.split()

            anno = dict(
                source='charades_sta',
                data_type='grounding',
                video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
                duration=durations[vid],
                query=parse_query(query),
                span=[[float(s), float(e)]])

            annos.append(anno)

        return annos
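Note: the Charades-STA annotation files parsed above are plain text lines of the form "<vid> <start> <end>##<query>", which is what the split('##') / info.split() pair relies on. A minimal sketch of that parsing on a made-up line (video id and times are illustrative):

line = 'AO8RW 0.0 6.9##a person is putting a book on a shelf.'

info, query = line.split('##')
vid, s, e = info.split()
span = [float(s), float(e)]

assert vid == 'AO8RW' and span == [0.0, 6.9]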
videomind/dataset/sub_classes/cosmo_cap.py
ADDED
@@ -0,0 +1,37 @@
# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.

import nncore

from videomind.dataset.hybrid import DATASETS
from videomind.dataset.wrappers import GroundingDataset
from videomind.utils.parser import parse_query


@DATASETS.register(name='cosmo_cap')
class CosMoCapDataset(GroundingDataset):

    ANNO_PATH = 'data/cosmo_cap/anno_cosmo_cap.jsonl'

    VIDEO_ROOT = 'data/cosmo_cap/videos_3fps_480_noaudio'

    UNIT = 1.0

    @classmethod
    def load_annos(self, split='train'):
        assert split == 'train'

        raw_annos = nncore.load(self.ANNO_PATH)

        annos = []
        for raw_anno in raw_annos:
            anno = dict(
                source='cosmo_cap',
                data_type='grounding',
                video_path=nncore.join(self.VIDEO_ROOT, raw_anno['vid'] + '.mp4'),
                duration=raw_anno['duration'],
                query=parse_query(raw_anno['query']),
                span=[raw_anno['span']])

            annos.append(anno)

        return annos
videomind/dataset/sub_classes/didemo.py
ADDED
@@ -0,0 +1,59 @@
# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.

import random

import nncore
import numpy as np

from videomind.dataset.hybrid import DATASETS
from videomind.dataset.wrappers import GroundingDataset
from videomind.utils.parser import parse_query


@DATASETS.register(name='didemo')
class DiDeMoDataset(GroundingDataset):

    ANNO_PATH_TRAIN = 'data/didemo/train_data.json'
    ANNO_PATH_VALID = 'data/didemo/val_data.json'
    ANNO_PATH_TEST = 'data/didemo/test_data.json'

    VIDEO_ROOT = 'data/didemo/videos_3fps_480_noaudio'
    DURATIONS = 'data/didemo/durations.json'

    UNIT = 1.0

    @classmethod
    def load_annos(self, split='train'):
        if split == 'train':
            raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
        elif split == 'valid':
            raw_annos = nncore.load(self.ANNO_PATH_VALID)
        else:
            raw_annos = nncore.load(self.ANNO_PATH_TEST)

        durations = nncore.load(self.DURATIONS)

        annos = []
        for raw_anno in raw_annos:
            vid = raw_anno['video'].split('.')[0]

            # apply mean on multiple spans
            span = np.array(raw_anno['times']).mean(axis=0).tolist()
            span = [round(span[0] * 5), round((span[1] + 1) * 5)]

            # augment spans during training
            if split == 'train':
                offset = random.randint(-2, 2)
                span = [span[0] + offset, span[1] + offset]

            anno = dict(
                source='didemo',
                data_type='grounding',
                video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
                duration=durations[vid],
                query=parse_query(raw_anno['description']),
                span=[span])

            annos.append(anno)

        return annos
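Note: DiDeMo stores each moment as 5-second segment indices from several annotators; the loader above averages them and converts a [s, e] segment pair to seconds as [s * 5, (e + 1) * 5]. A small sketch of that conversion with made-up annotator segments:

import numpy as np

times = [[1, 2], [1, 2], [2, 3]]  # hypothetical per-annotator segment indices

span = np.array(times).mean(axis=0).tolist()
span = [round(span[0] * 5), round((span[1] + 1) * 5)]

print(span)  # [7, 17] -> roughly seconds 7 to 17 of the video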
videomind/dataset/sub_classes/ego4d_naq.py
ADDED
@@ -0,0 +1,81 @@
# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.

from collections import OrderedDict

import nncore

from videomind.dataset.hybrid import DATASETS
from videomind.dataset.wrappers import GroundingDataset
from videomind.utils.parser import parse_query


@DATASETS.register(name='ego4d_naq')
class Ego4DNaQDataset(GroundingDataset):

    ANNO_PATH_TRAIN = 'data/ego4d_naq/train.json'
    ANNO_PATH_VALID = 'data/ego4d_naq/val.json'
    ANNO_PATH_TEST = 'data/ego4d_naq/test.json'

    VIDEO_ROOT = 'data/ego4d/v2/videos_3fps_480_noaudio'

    UNIT = 0.001

    @classmethod
    def load_annos(self, split='train'):
        if split == 'train':
            raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
        elif split == 'valid':
            raw_annos = nncore.load(self.ANNO_PATH_VALID, object_pairs_hook=OrderedDict)
        else:
            raw_annos = nncore.load(self.ANNO_PATH_TEST, object_pairs_hook=OrderedDict)

        annos = []
        for vid, raw_anno in raw_annos.items():
            duration = raw_anno['num_frames'] / raw_anno['fps']

            # 300s: 254k samples (dropped 121k samples merged 156k samples)
            # 480s: 567k samples (dropped 249k samples merged 328k samples)
            if split == 'train' and (duration < 10 or duration > 600):
                continue

            meta = dict()
            for span, query in zip(raw_anno['exact_times'], raw_anno['sentences']):
                span = [round(span[0], 3), round(span[1], 3)]

                query = parse_query(query)

                # these annotations might be from nlq
                nlq_keys = ('who', 'what', 'when', 'in what', 'did', 'where', 'how', 'i what')
                if split == 'train' and any(query.startswith(k) for k in nlq_keys):
                    continue

                # bad samples
                if split == 'train' and '#unsure' in query:
                    continue

                # too short or too long samples
                num_words = len(query.split(' '))
                if split == 'train' and (num_words < 3 or num_words > 30):
                    continue

                if query not in meta:
                    meta[query] = []

                meta[query].append(span)

            for query, span in meta.items():
                # skip samples with multiple moments
                if len(span) > 1:
                    continue

                anno = dict(
                    source='ego4d_naq',
                    data_type='grounding',
                    video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
                    duration=duration,
                    query=query,
                    span=span)

                annos.append(anno)

        return annos
videomind/dataset/sub_classes/ego4d_nlq.py
ADDED
@@ -0,0 +1,41 @@
# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.

import nncore

from videomind.dataset.hybrid import DATASETS
from videomind.dataset.wrappers import GroundingDataset
from videomind.utils.parser import parse_query


@DATASETS.register(name='ego4d_nlq')
class Ego4DNLQDataset(GroundingDataset):

    ANNO_PATH_TRAIN = 'data/ego4d_nlq/nlq_train.jsonl'
    ANNO_PATH_VALID = 'data/ego4d_nlq/nlq_val.jsonl'

    VIDEO_ROOT = 'data/ego4d/v2/videos_3fps_480_noaudio'

    UNIT = 0.001

    @classmethod
    def load_annos(self, split='train'):
        if split == 'train':
            raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
        else:
            raw_annos = nncore.load(self.ANNO_PATH_VALID)

        annos = []
        for raw_anno in raw_annos:
            assert len(raw_anno['relevant_windows']) == 1

            anno = dict(
                source='ego4d_nlq',
                data_type='grounding',
                video_path=nncore.join(self.VIDEO_ROOT, raw_anno['vid'] + '.mp4'),
                duration=raw_anno['duration'],
                query=parse_query(raw_anno['query']),
                span=raw_anno['relevant_windows'])

            annos.append(anno)

        return annos
videomind/dataset/sub_classes/ego_timeqa.py
ADDED
@@ -0,0 +1,93 @@
# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.

import random

import nncore

from videomind.dataset.hybrid import DATASETS
from videomind.dataset.wrappers import AnsweringCropDataset, AnsweringDataset, GroundingDataset
from videomind.utils.parser import parse_query, parse_question


@DATASETS.register(name='ego_timeqa')
class EgoTimeQADataset(AnsweringDataset):

    ANNO_PATH_TRAIN = 'data/ego_timeqa/annotations.EgoTimeQA.json'

    VIDEO_ROOT = 'data/ego4d/v2/videos_3fps_480_noaudio'
    DURATIONS = 'data/ego4d/v2/durations.json'

    SOURCE = 'ego_timeqa'
    DATA_TYPE = 'multimodal'

    UNIT = 0.001

    @classmethod
    def load_annos(self, split='train'):
        assert split == 'train'

        raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
        durations = nncore.load(self.DURATIONS)

        annos = []
        for raw_anno in raw_annos:
            vid = raw_anno['video_id']

            duration = durations[vid]

            # 303k -> 284k (to be verified)
            if duration < 10 or duration > 600:
                continue

            span = [raw_anno['moment_start_frame'] / 30, raw_anno['moment_end_frame'] / 30]
            span = [round(span[0], 3), round(span[1], 3)]

            # this would remove many samples (284k -> 37k)
            # if span[1] - span[0] < 2:
            #     continue

            question = raw_anno['question'].replace(' l ', ' I ').capitalize()
            question = parse_question(question)
            query = parse_query(question)

            # too short or too long samples
            num_words = len(query.split(' '))
            if split == 'train' and (num_words < 3 or num_words > 30):
                continue

            answer = raw_anno['answer'].capitalize()

            assert len(raw_anno['wrong_answers']) == 3
            idx = random.randint(0, 3)
            ans = chr(ord('A') + idx)
            options = [o.capitalize() for o in raw_anno['wrong_answers']]
            options.insert(idx, answer)

            anno = dict(
                source=self.SOURCE,
                data_type=self.DATA_TYPE,
                video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
                duration=duration,
                query=query,
                question=question,
                options=options,
                answer=answer,
                ans=ans,
                span=[span])

            annos.append(anno)

        return annos


@DATASETS.register(name='ego_timeqa_crop')
class EgoTimeQACropDataset(AnsweringCropDataset, EgoTimeQADataset):

    SOURCE = 'ego_timeqa_crop'


@DATASETS.register(name='ego_timeqa_grounding')
class EgoTimeQAGroundingDataset(GroundingDataset, EgoTimeQADataset):

    SOURCE = 'ego_timeqa_grounding'
    DATA_TYPE = 'grounding'
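Note: EgoTimeQADataset builds its multiple-choice options by dropping the correct answer at a random position among the three wrong answers, and the letter ans is derived from that position. A standalone sketch of the same option-shuffling step; the answer strings are placeholders, not dataset content.

import random

answer = 'Opened the fridge'
wrong_answers = ['Closed the door', 'Picked up a cup', 'Washed the dishes']  # placeholders

idx = random.randint(0, 3)    # insertion position 0..3
ans = chr(ord('A') + idx)     # matching option letter
options = list(wrong_answers)
options.insert(idx, answer)

assert options[ord(ans) - ord('A')] == answer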
videomind/dataset/sub_classes/hirest.py
ADDED
@@ -0,0 +1,150 @@
# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.

from collections import OrderedDict

import nncore

from videomind.dataset.hybrid import DATASETS
from videomind.dataset.wrappers import GroundingDataset
from videomind.utils.parser import parse_query


@DATASETS.register(name='hirest_grounding')
class HiRESTGroundingDataset(GroundingDataset):

    ANNO_PATH_TRAIN = 'data/hirest/all_data_train.json'
    ANNO_PATH_VALID = 'data/hirest/all_data_val.json'

    VIDEO_ROOT = 'data/hirest/videos_3fps_480_noaudio'

    UNIT = 1.0

    @classmethod
    def load_annos(self, split='train'):
        if split == 'train':
            raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
        else:
            raw_annos = nncore.load(self.ANNO_PATH_VALID, object_pairs_hook=OrderedDict)

        all_videos = nncore.ls(self.VIDEO_ROOT, ext='.mp4')
        all_videos = set(v[:11] for v in all_videos)

        annos = []
        for query, videos in raw_annos.items():
            for video_name, raw_anno in videos.items():
                if not raw_anno['relevant'] or not raw_anno['clip']:
                    continue

                assert len(raw_anno['bounds']) == 2

                vid = video_name.split('.')[0]

                if vid not in all_videos:
                    continue

                anno = dict(
                    source='hirest_grounding',
                    data_type='grounding',
                    video_path=nncore.join(self.VIDEO_ROOT, video_name),
                    duration=raw_anno['v_duration'],
                    query=parse_query(query),
                    span=[raw_anno['bounds']])

                annos.append(anno)

        return annos


@DATASETS.register(name='hirest_step')
class HiRESTStepDataset(HiRESTGroundingDataset):

    @classmethod
    def load_annos(self, split='train'):
        if split == 'train':
            raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
        else:
            raw_annos = nncore.load(self.ANNO_PATH_VALID, object_pairs_hook=OrderedDict)

        all_videos = nncore.ls(self.VIDEO_ROOT, ext='.mp4')
        all_videos = set(v[:11] for v in all_videos)

        annos = []
        for query, videos in raw_annos.items():
            for video_name, raw_anno in videos.items():
                if not raw_anno['relevant'] or not raw_anno['clip'] or len(raw_anno['steps']) == 0:
                    continue

                vid = video_name.split('.')[0]

                if vid not in all_videos:
                    continue

                for step in raw_anno['steps']:
                    assert len(step['absolute_bounds']) == 2

                    anno = dict(
                        source='hirest_step',
                        data_type='grounding',
                        video_path=nncore.join(self.VIDEO_ROOT, video_name),
                        duration=raw_anno['v_duration'],
                        query=parse_query(step['heading']),
                        span=[step['absolute_bounds']])

                    annos.append(anno)

        return annos


@DATASETS.register(name='hirest_step_bias')
class HiRESTStepBiasDataset(HiRESTStepDataset):

    @classmethod
    def load_annos(self, split='train'):
        if split == 'train':
            raw_annos = nncore.load(self.ANNO_PATH_TRAIN, object_pairs_hook=OrderedDict)
        else:
            raw_annos = nncore.load(self.ANNO_PATH_VALID, object_pairs_hook=OrderedDict)

        all_videos = nncore.ls(self.VIDEO_ROOT, ext='.mp4')
        all_videos = set(v[:11] for v in all_videos)

        annos = []
        for query, videos in raw_annos.items():
            for video_name, raw_anno in videos.items():
                if not raw_anno['relevant'] or not raw_anno['clip'] or len(raw_anno['steps']) == 0:
                    continue

                vid = video_name.split('.')[0]

                if vid not in all_videos:
                    continue

                for i in range(len(raw_anno['steps']) - 1):
                    span_a = raw_anno['steps'][i]['absolute_bounds']
                    span_b = raw_anno['steps'][i + 1]['absolute_bounds']

                    assert len(span_a) == 2 and len(span_b) == 2 and span_a[1] == span_b[0]

                    query_a = parse_query(f"The moment before {raw_anno['steps'][i + 1]['heading']}")
                    query_b = parse_query(f"The moment after {raw_anno['steps'][i]['heading']}")

                    anno_a = dict(
                        source='hirest_step_bias',
                        data_type='grounding',
                        video_path=nncore.join(self.VIDEO_ROOT, video_name),
                        duration=raw_anno['v_duration'],
                        query=query_a,
                        span=[span_a])

                    anno_b = dict(
                        source='hirest_step_bias',
                        data_type='grounding',
                        video_path=nncore.join(self.VIDEO_ROOT, video_name),
                        duration=raw_anno['v_duration'],
                        query=query_b,
                        span=[span_b])

                    annos.append(anno_a)
                    annos.append(anno_b)

        return annos
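Note: HiRESTStepBiasDataset turns every pair of adjacent steps into two "temporal bias" samples, where the earlier span answers "The moment before <next step>" and the later span answers "The moment after <previous step>". A minimal sketch of that pairing with two made-up steps (headings and bounds are illustrative):

steps = [
    dict(heading='cut the onions', absolute_bounds=[10, 25]),
    dict(heading='heat the pan', absolute_bounds=[25, 40]),
]

samples = []
for i in range(len(steps) - 1):
    span_a = steps[i]['absolute_bounds']
    span_b = steps[i + 1]['absolute_bounds']
    samples.append((f"The moment before {steps[i + 1]['heading']}", [span_a]))
    samples.append((f"The moment after {steps[i]['heading']}", [span_b]))

print(samples)
# [('The moment before heat the pan', [[10, 25]]),
#  ('The moment after cut the onions', [[25, 40]])]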
videomind/dataset/sub_classes/internvit_vtime.py
ADDED
@@ -0,0 +1,45 @@
# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.

import nncore

from videomind.dataset.hybrid import DATASETS
from videomind.dataset.wrappers import GroundingDataset
from videomind.utils.parser import parse_query


@DATASETS.register(name='internvid_vtime')
class InternVidVTimeDataset(GroundingDataset):

    ANNO_PATH = 'data/internvid_vtime/anno_internvid_vtime_query_gpt4o_mini.jsonl'

    VIDEO_ROOT = 'data/internvid_vtime/videos_crop_3fps_480_noaudio'

    UNIT = 0.1

    @classmethod
    def load_annos(self, split='train'):
        assert split == 'train'

        raw_annos = nncore.load(self.ANNO_PATH)

        all_videos = nncore.ls(self.VIDEO_ROOT, ext='.mp4')
        all_videos = set(v[:11] for v in all_videos)

        annos = []
        for raw_anno in raw_annos:
            vid = raw_anno['vid']

            if vid not in all_videos:
                continue

            anno = dict(
                source='internvid_vtime',
                data_type='grounding',
                video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
                duration=raw_anno['duration'],
                query=parse_query(raw_anno['query']),
                span=[raw_anno['span']])

            annos.append(anno)

        return annos
videomind/dataset/sub_classes/longvideobench.py
ADDED
@@ -0,0 +1,53 @@
# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.

import nncore
from torch.utils.data import Dataset

from videomind.dataset.hybrid import DATASETS
from videomind.utils.parser import parse_query, parse_question


@DATASETS.register(name='longvideobench')
class LongVideoBenchDataset(Dataset):

    ANNO_PATH_VALID = 'data/longvideobench/lvb_val.json'
    ANNO_PATH_TEST = 'data/longvideobench/lvb_test_wo_gt.json'

    VIDEO_ROOT = 'data/longvideobench/videos_3fps_480_noaudio'

    @classmethod
    def load_annos(self, split='valid'):
        if split == 'valid':
            raw_annos = nncore.load(self.ANNO_PATH_VALID)
        else:
            print('WARNING: Test split does not have ground truth annotations')
            raw_annos = nncore.load(self.ANNO_PATH_TEST)

        annos = []
        for raw_anno in raw_annos:
            vid = raw_anno['video_id']

            if vid.startswith('@'):
                vid = vid[-19:]

            # videos might come from youtube or other sources
            assert len(vid) in (11, 19)

            anno = dict(
                source='longvideobench',
                data_type='multimodal',
                video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
                query=parse_query(raw_anno['question']),
                question=parse_question(raw_anno['question']),
                options=raw_anno['candidates'],
                task=str(raw_anno['duration_group']),
                level=raw_anno['level'],
                question_category=raw_anno['question_category'])

            if 'correct_choice' in raw_anno:
                anno['answer'] = raw_anno['candidates'][raw_anno['correct_choice']]
                anno['ans'] = chr(ord('A') + raw_anno['correct_choice'])

            annos.append(anno)

        return annos
videomind/dataset/sub_classes/lvbench.py
ADDED
@@ -0,0 +1,52 @@
# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.

import nncore
from torch.utils.data import Dataset

from videomind.dataset.hybrid import DATASETS
from videomind.utils.parser import parse_query, parse_question


@DATASETS.register(name='lvbench')
class LVBenchDataset(Dataset):

    ANNO_PATH = 'data/lvbench/LVBench/video_info.meta.jsonl'

    VIDEO_ROOT = 'data/lvbench/videos_3fps_480_noaudio'

    @classmethod
    def load_annos(self, split='test'):
        assert split == 'test'

        raw_annos = nncore.load(self.ANNO_PATH)

        annos = []
        for raw_anno in raw_annos:
            vid = raw_anno['key']

            for meta in raw_anno['qa']:
                tok = meta['question'].split('\n')

                assert len(tok) == 5
                assert all(any(o.startswith(k) for k in ('(A) ', '(B) ', '(C) ', '(D) ')) for o in tok[1:])

                options = [o[4:] for o in tok[1:]]
                ans = meta['answer']
                answer = options[ord(ans) - ord('A')]
                assert ans in 'ABCD'

                anno = dict(
                    source='lvbench',
                    data_type='multimodal',
                    video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
                    query=parse_query(tok[0]),
                    question=parse_question(tok[0]),
                    options=options,
                    answer=answer,
                    ans=ans,
                    task=meta['question_type'],
                    time_reference=meta['time_reference'])

                annos.append(anno)

        return annos
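Note: LVBench packs the question stem and four choices into a single newline-separated string, which the loader above splits and strips of the "(A) " style prefixes. A short sketch of that parsing with an invented question:

question = 'What does the person pick up?\n(A) A phone\n(B) A cup\n(C) A book\n(D) A key'  # invented

tok = question.split('\n')
options = [o[4:] for o in tok[1:]]   # drop the '(A) ' style prefix
ans = 'B'                            # letter given by the annotation
answer = options[ord(ans) - ord('A')]

assert answer == 'A cup'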
videomind/dataset/sub_classes/mlvu.py
ADDED
@@ -0,0 +1,55 @@
# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.

import nncore
from torch.utils.data import Dataset

from videomind.dataset.hybrid import DATASETS
from videomind.utils.parser import parse_query, parse_question


@DATASETS.register(name='mlvu')
class MLVUDataset(Dataset):

    TASK_TO_DIR_MAP = {
        'plotQA': '1_plotQA',
        'findNeedle': '2_needle',
        'ego': '3_ego',
        'count': '4_count',
        'order': '5_order',
        'anomaly_reco': '6_anomaly_reco',
        'topic_reasoning': '7_topic_reasoning'
    }

    DATA_ROOT = 'data/mlvu'

    @classmethod
    def load_annos(self, split='test'):
        assert split == 'test'

        paths = [nncore.join(self.DATA_ROOT, 'json', f'{n}.json') for n in self.TASK_TO_DIR_MAP.values()]

        raw_annos = nncore.flatten([nncore.load(p) for p in paths])

        annos = []
        for raw_anno in raw_annos:
            task = raw_anno['question_type']
            video_name = nncore.join(self.TASK_TO_DIR_MAP[task], raw_anno['video'])

            options = raw_anno['candidates']
            answer = raw_anno['answer']
            ans = chr(ord('A') + options.index(answer))

            anno = dict(
                source='mlvu',
                data_type='multimodal',
                video_path=nncore.join(self.DATA_ROOT, 'video', video_name),
                query=parse_query(raw_anno['question']),
                question=parse_question(raw_anno['question']),
                options=options,
                answer=answer,
                ans=ans,
                task=task)

            annos.append(anno)

        return annos
videomind/dataset/sub_classes/mvbench.py
ADDED
@@ -0,0 +1,74 @@
# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.

import nncore
from torch.utils.data import Dataset

from videomind.dataset.hybrid import DATASETS
from videomind.utils.parser import parse_query, parse_question


@DATASETS.register(name='mvbench')
class MVBenchDataset(Dataset):

    META_DATA = [('Episodic Reasoning', 'episodic_reasoning.json', 'tvqa/frames_fps3_hq', 'frame'),
                 ('Action Sequence', 'action_sequence.json', 'star/Charades_v1_480', 'video'),
                 ('Action Prediction', 'action_prediction.json', 'star/Charades_v1_480', 'video'),
                 ('Action Antonym', 'action_antonym.json', 'ssv2_video', 'video'),
                 ('Fine-grained Action', 'fine_grained_action.json', 'Moments_in_Time_Raw/videos', 'video'),
                 ('Unexpected Action', 'unexpected_action.json', 'FunQA_test/test', 'video'),
                 ('Object Existence', 'object_existence.json', 'clevrer/video_validation', 'video'),
                 ('Object Interaction', 'object_interaction.json', 'star/Charades_v1_480', 'video'),
                 ('Object Shuffle', 'object_shuffle.json', 'perception/videos', 'video'),
                 ('Moving Direction', 'moving_direction.json', 'clevrer/video_validation', 'video'),
                 ('Action Localization', 'action_localization.json', 'sta/sta_video', 'video'),
                 ('Scene Transition', 'scene_transition.json', 'scene_qa/video', 'video'),
                 ('Action Count', 'action_count.json', 'perception/videos', 'video'),
                 ('Moving Count', 'moving_count.json', 'clevrer/video_validation', 'video'),
                 ('Moving Attribute', 'moving_attribute.json', 'clevrer/video_validation', 'video'),
                 ('State Change', 'state_change.json', 'perception/videos', 'video'),
                 ('Fine-grained Pose', 'fine_grained_pose.json', 'nturgbd', 'video'),
                 ('Character Order', 'character_order.json', 'perception/videos', 'video'),
                 ('Egocentric Navigation', 'egocentric_navigation.json', 'vlnqa', 'video'),
                 ('Counterfactual Inference', 'counterfactual_inference.json', 'clevrer/video_validation', 'video')]

    DATA_ROOT = 'data/mvbench'

    MIN_LEN = 64

    @classmethod
    def load_annos(self, split='test', sample_frames=32):
        assert split == 'test'

        annos = []
        for meta in self.META_DATA:
            raw_annos = nncore.load(nncore.join(self.DATA_ROOT, 'json', meta[1]))

            for raw_anno in raw_annos:
                video_name = nncore.join(meta[2], raw_anno['video'])
                video_path = nncore.join(self.DATA_ROOT, 'video', video_name)

                if meta[3] == 'frame':
                    num_frames = len(nncore.ls(video_path, ext='.jpg'))
                    video_path = [
                        nncore.join(video_path, f'{i:0>5}.jpg')
                        for i in range(1, num_frames + 1, num_frames // (sample_frames - 1))
                    ][:sample_frames]

                options = raw_anno['candidates']
                answer = raw_anno['answer']
                ans = chr(ord('A') + options.index(answer))

                anno = dict(
                    source='mvbench',
                    data_type='multimodal',
                    video_path=video_path,
                    query=parse_query(raw_anno['question']),
                    question=parse_question(raw_anno['question']),
                    options=options,
                    answer=answer,
                    ans=ans,
                    task=meta[0])

                annos.append(anno)

        return annos
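Note: for frame-based MVBench tasks the loader lists the extracted JPEGs and subsamples roughly sample_frames of them with a fixed stride of num_frames // (sample_frames - 1), then truncates to sample_frames. The arithmetic can be checked in isolation; the frame count below is an arbitrary example, not taken from the benchmark.

num_frames, sample_frames = 100, 32  # arbitrary example values

indices = list(range(1, num_frames + 1, num_frames // (sample_frames - 1)))[:sample_frames]

print(len(indices), indices[:5])     # 32 frames sampled: 1, 4, 7, 10, 13, ...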
videomind/dataset/sub_classes/nextgqa.py
ADDED
@@ -0,0 +1,87 @@
# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.

import csv

import nncore

from videomind.dataset.hybrid import DATASETS
from videomind.dataset.wrappers import AnsweringCropDataset, AnsweringDataset, GroundingDataset
from videomind.utils.parser import parse_query, parse_question


@DATASETS.register(name='nextgqa')
class NExTGQADataset(AnsweringDataset):

    ANNO_PATH_VALID = 'data/nextgqa/val.csv'
    ANNO_PATH_TEST = 'data/nextgqa/test.csv'

    SPAN_PATH_VALID = 'data/nextgqa/gsub_val.json'
    SPAN_PATH_TEST = 'data/nextgqa/gsub_test.json'

    VIDEO_ID_MAP = 'data/nextgqa/map_vid_vidorID.json'
    VIDEO_ROOT = 'data/nextqa/videos'

    SOURCE = 'nextgqa'
    DATA_TYPE = 'multimodal'

    UNIT = 0.1

    @classmethod
    def load_annos(self, split='valid'):
        assert split in ('valid', 'test')

        if split == 'valid':
            anno_path = self.ANNO_PATH_VALID
            raw_spans = nncore.load(self.SPAN_PATH_VALID)
        else:
            anno_path = self.ANNO_PATH_TEST
            raw_spans = nncore.load(self.SPAN_PATH_TEST)

        with open(anno_path, mode='r') as f:
            reader = csv.DictReader(f)
            raw_annos = [d for d in reader]

        video_id_map = nncore.load(self.VIDEO_ID_MAP)

        annos = []
        for raw_anno in raw_annos:
            vid = raw_anno['video_id']
            qid = raw_anno['qid']

            video_id = video_id_map[vid]

            query = parse_query(raw_anno['question'].capitalize() + '?')
            question = parse_question(raw_anno['question'].capitalize() + '?')
            options = [raw_anno[k].capitalize() for k in ('a0', 'a1', 'a2', 'a3', 'a4')]
            answer = raw_anno['answer'].capitalize()
            ans = chr(ord('A') + options.index(answer))

            anno = dict(
                source=self.SOURCE,
                data_type=self.DATA_TYPE,
                video_path=nncore.join(self.VIDEO_ROOT, video_id + '.mp4'),
                duration=raw_spans[vid]['duration'],
                query=query,
                question=question,
                options=options,
                answer=answer,
                ans=ans,
                span=raw_spans[vid]['location'][qid],
                task=raw_anno['type'])

            annos.append(anno)

        return annos


@DATASETS.register(name='nextgqa_crop')
class NExTGQACropDataset(AnsweringCropDataset, NExTGQADataset):

    SOURCE = 'nextgqa_crop'


@DATASETS.register(name='nextgqa_grounding')
class NExTGQAGroundingDataset(GroundingDataset, NExTGQADataset):

    SOURCE = 'nextgqa_grounding'
    DATA_TYPE = 'grounding'
videomind/dataset/sub_classes/nextqa.py
ADDED
@@ -0,0 +1,63 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import csv
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import AnsweringDataset
+from videomind.utils.parser import parse_query, parse_question
+
+
+@DATASETS.register(name='nextqa')
+class NExTQADataset(AnsweringDataset):
+
+    ANNO_PATH_TRAIN = 'data/nextqa/train.csv'
+    ANNO_PATH_VALID = 'data/nextqa/val.csv'
+    ANNO_PATH_TEST = 'data/nextqa/test.csv'
+
+    VIDEO_ID_MAP = 'data/nextqa/map_vid_vidorID.json'
+    VIDEO_ROOT = 'data/nextqa/NExTVideo'
+
+    @classmethod
+    def load_annos(self, split='train'):
+        if split == 'train':
+            anno_path = self.ANNO_PATH_TRAIN
+        elif split == 'valid':
+            anno_path = self.ANNO_PATH_VALID
+        else:
+            anno_path = self.ANNO_PATH_TEST
+
+        with open(anno_path, mode='r') as f:
+            reader = csv.DictReader(f)
+            raw_annos = [d for d in reader]
+
+        video_id_map = nncore.load(self.VIDEO_ID_MAP)
+
+        annos = []
+        for raw_anno in raw_annos:
+            vid = raw_anno['video']
+            qid = raw_anno['qid']
+
+            video_id = video_id_map[vid]
+            query = parse_query(raw_anno['question'].capitalize() + '?')
+            question = parse_question(raw_anno['question'].capitalize() + '?')
+            options = [raw_anno[k].capitalize() for k in ('a0', 'a1', 'a2', 'a3', 'a4')]
+            ans = chr(ord('A') + int(raw_anno['answer']))
+            answer = options[int(raw_anno['answer'])]
+
+            anno = dict(
+                source='nextqa',
+                data_type='multimodal',
+                uid=f'{vid}_{qid}',
+                video_path=nncore.join(self.VIDEO_ROOT, video_id + '.mp4'),
+                query=query,
+                question=question,
+                options=options,
+                answer=answer,
+                ans=ans,
+                task=raw_anno['type'])
+
+            annos.append(anno)
+
+        return annos
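
The NExT-QA loader records the answer twice: as the option text (answer) and as a letter derived from the integer index (ans). A small worked example with a made-up CSV row (all field values are hypothetical):

# Made-up NExT-QA-style row, for illustration only.
row = {'video': '4010069381', 'qid': '3', 'question': 'why did the boy wave',
       'a0': 'greeting someone', 'a1': 'dancing', 'a2': 'shooing a fly',
       'a3': 'stretching', 'a4': 'asking for help', 'answer': '0', 'type': 'CW'}

options = [row[k].capitalize() for k in ('a0', 'a1', 'a2', 'a3', 'a4')]
ans = chr(ord('A') + int(row['answer']))    # index 0 -> 'A'
answer = options[int(row['answer'])]        # 'Greeting someone'
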
videomind/dataset/sub_classes/qa_ego4d.py
ADDED
@@ -0,0 +1,98 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import random
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import AnsweringCropDataset, AnsweringDataset, GroundingDataset
+from videomind.utils.parser import parse_query, parse_question
+
+
+@DATASETS.register(name='qa_ego4d')
+class QAEgo4DDataset(AnsweringDataset):
+
+    ANNO_PATH_TRAIN = 'data/qa_ego4d/annotations.QaEgo4D_train.json'
+    ANNO_PATH_VALID = 'data/qa_ego4d/annotations.QaEgo4D_val_options.json'
+    ANNO_PATH_TEST = 'data/qa_ego4d/annotations.QaEgo4D_test_options.json'
+
+    VIDEO_ROOT = 'data/ego4d/v1/videos_3fps_480_noaudio'
+    DURATIONS = 'data/ego4d/v1/durations.json'
+
+    SOURCE = 'qa_ego4d'
+    DATA_TYPE = 'multimodal'
+
+    UNIT = 0.001
+
+    @classmethod
+    def load_annos(self, split='train'):
+        if split == 'train':
+            raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+        elif split == 'valid':
+            raw_annos = nncore.load(self.ANNO_PATH_VALID)
+        else:
+            raw_annos = nncore.load(self.ANNO_PATH_TEST)
+
+        durations = nncore.load(self.DURATIONS)
+
+        annos = []
+        for raw_anno in raw_annos:
+            vid = raw_anno['video_id']
+
+            duration = durations[vid]
+
+            # too short or too long samples
+            if split == 'train' and (duration < 10 or duration > 600):
+                continue
+
+            span = [raw_anno['moment_start_frame'] / 30, raw_anno['moment_end_frame'] / 30]
+            span = [round(span[0], 3), round(span[1], 3)]
+
+            # skip samples with too short moments
+            # if split == 'train' and span[1] - span[0] < 2:
+            #     continue
+
+            answer = raw_anno['answer'].capitalize()
+
+            if 'options' in raw_anno:
+                options = [o.capitalize() for o in raw_anno['options']]
+                idx = options.index(answer)
+                ans = chr(ord('A') + idx)
+            else:
+                # NOTE: indeterministic evaluation
+                assert len(raw_anno['wrong_answers']) == 3
+                idx = random.randint(0, 3)
+                ans = chr(ord('A') + idx)
+                options = [o.capitalize() for o in raw_anno['wrong_answers']]
+                options.insert(idx, answer)
+
+            assert len(options) == 4, options
+
+            anno = dict(
+                source=self.SOURCE,
+                data_type=self.DATA_TYPE,
+                video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+                duration=duration,
+                query=parse_query(raw_anno['question'].capitalize()),
+                question=parse_question(raw_anno['question'].capitalize()),
+                options=options,
+                answer=answer,
+                ans=ans,
+                span=[span])
+
+            annos.append(anno)
+
+        return annos
+
+
+@DATASETS.register(name='qa_ego4d_crop')
+class QAEgo4DCropDataset(AnsweringCropDataset, QAEgo4DDataset):
+
+    SOURCE = 'qa_ego4d_crop'
+
+
+@DATASETS.register(name='qa_ego4d_grounding')
+class QAEgo4DGroundingDataset(GroundingDataset, QAEgo4DDataset):
+
+    SOURCE = 'qa_ego4d_grounding'
+    DATA_TYPE = 'grounding'
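
Two details of the QAEgo4D loader are worth noting. The span is obtained by dividing frame indices by 30, presumably because the moment annotations are frame-indexed at 30 fps. And when a sample has no precomputed 'options' field, the correct answer is inserted among the three wrong answers at a random position, so option order (and hence ans) can differ between runs, as the NOTE in the code says. A sketch of how one might make that step reproducible; the seeding is an assumption for illustration and is not part of this diff:

import random

random.seed(42)                               # hypothetical: fix the seed for repeatable option order
wrong = ['Made-up answer a', 'Made-up answer b', 'Made-up answer c']
idx = random.randint(0, 3)                    # slot where the correct answer is inserted
options = list(wrong)
options.insert(idx, 'Made-up correct answer')
ans = chr(ord('A') + idx)
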
videomind/dataset/sub_classes/queryd.py
ADDED
@@ -0,0 +1,49 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import GroundingDataset
+from videomind.utils.parser import parse_query
+
+
+@DATASETS.register(name='queryd')
+class QuerYDDataset(GroundingDataset):
+
+    VID_PATH = 'data/queryd/train_list.txt'
+    QUERY_PATH = 'data/queryd/raw_captions_combined_filtered-v2.pkl'
+    SPAN_PATH = 'data/queryd/times_captions_combined_filtered-v2.pkl'
+
+    VIDEO_ROOT = 'data/queryd/videos_3fps_480_noaudio'
+    DURATIONS = 'data/queryd/durations.json'
+
+    UNIT = 0.001
+
+    @classmethod
+    def load_annos(self, split='train'):
+        assert split == 'train'
+
+        vids = nncore.load(self.VID_PATH)
+        queries = nncore.load(self.QUERY_PATH)
+        spans = nncore.load(self.SPAN_PATH)
+        durations = nncore.load(self.DURATIONS)
+
+        annos = []
+        for vid in vids:
+            for query, span in zip(queries[vid], spans[vid]):
+                video_name = vid[6:]
+
+                if video_name not in durations:
+                    continue
+
+                anno = dict(
+                    source='queryd',
+                    data_type='grounding',
+                    video_path=nncore.join(self.VIDEO_ROOT, video_name + '.mp4'),
+                    duration=durations[video_name],
+                    query=parse_query(' '.join(query)),
+                    span=[span])
+
+                annos.append(anno)
+
+        return annos
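
The vid[6:] slice strips a fixed 6-character prefix from the raw QuerYD video id so that the remainder matches the filenames under VIDEO_ROOT; the exact prefix is not visible in this diff, but a 'video-' prefix would be consistent with the slice length:

vid = 'video-abc123def45'   # hypothetical QuerYD-style id; the 'video-' prefix is an assumption
video_name = vid[6:]        # -> 'abc123def45'
assert len('video-') == 6
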
videomind/dataset/sub_classes/qvhighlights.py
ADDED
@@ -0,0 +1,78 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import GroundingDataset
+from videomind.utils.parser import parse_query
+
+
+@DATASETS.register(name='qvhighlights')
+class QVHighlightsDataset(GroundingDataset):
+
+    ANNO_PATH_TRAIN = 'data/qvhighlights/highlight_train_release.jsonl'
+    ANNO_PATH_VALID = 'data/qvhighlights/highlight_val_release.jsonl'
+    ANNO_PATH_TEST = 'data/qvhighlights/highlight_test_release.jsonl'
+
+    VIDEO_ROOT = 'data/qvhighlights/videos_3fps_480_noaudio'
+
+    UNIT = 2.0
+
+    @classmethod
+    def load_annos(self, split='train'):
+        if split == 'train':
+            raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+        elif split == 'valid':
+            raw_annos = nncore.load(self.ANNO_PATH_VALID)
+        else:
+            print('WARNING: Test split does not have ground truth annotations')
+            raw_annos = nncore.load(self.ANNO_PATH_TEST)
+
+        annos = []
+        for raw_anno in raw_annos:
+            vid = raw_anno['vid']
+            qid = raw_anno['qid']
+
+            anno = dict(
+                source='qvhighlights',
+                data_type='grounding',
+                video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+                duration=raw_anno['duration'],
+                query=parse_query(raw_anno['query']),
+                span=raw_anno.get('relevant_windows'),
+                vid=vid,
+                qid=qid)
+
+            annos.append(anno)
+
+        return annos
+
+
+@DATASETS.register(name='qvhighlights_single')
+class QVHighlightsSingleDataset(QVHighlightsDataset):
+
+    @classmethod
+    def load_annos(self, split='train'):
+        assert split == 'train'
+
+        raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+
+        annos = []
+        for raw_anno in raw_annos:
+            # skip samples with multiple moments
+            if len(raw_anno['relevant_windows']) > 1:
+                continue
+
+            vid = raw_anno['vid']
+
+            anno = dict(
+                source='qvhighlights_single',
+                data_type='grounding',
+                video_path=nncore.join(self.VIDEO_ROOT, vid + '.mp4'),
+                duration=raw_anno['duration'],
+                query=parse_query(raw_anno['query']),
+                span=raw_anno.get('relevant_windows'))
+
+            annos.append(anno)
+
+        return annos
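
QVHighlights annotations may contain several relevant windows per query; QVHighlightsDataset keeps them all, while QVHighlightsSingleDataset drops any training sample with more than one moment. Illustration with a made-up JSONL record (all values are invented):

# Made-up QVHighlights-style record, for illustration only.
raw_anno = {'vid': 'j7rJstUXPf8_360.0_510.0', 'qid': 2905,
            'query': 'A man is cooking on a grill', 'duration': 150,
            'relevant_windows': [[10, 24], [56, 62]]}

keep_for_single = len(raw_anno['relevant_windows']) == 1   # False -> skipped by the single-moment variant
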
videomind/dataset/sub_classes/rextime.py
ADDED
@@ -0,0 +1,81 @@
+# Copyright (c) Ye Liu. Licensed under the BSD 3-Clause License.
+
+import nncore
+
+from videomind.dataset.hybrid import DATASETS
+from videomind.dataset.wrappers import AnsweringCropDataset, AnsweringDataset, GroundingDataset
+from videomind.utils.parser import parse_query, parse_question
+
+
+@DATASETS.register(name='rextime')
+class ReXTimeDataset(AnsweringDataset):
+
+    ANNO_PATH_TRAIN = 'data/rextime/rextime_train.json'
+    ANNO_PATH_VALID = 'data/rextime/rextime_val.json'
+    ANNO_PATH_TEST = 'data/rextime/rextime_test_release.json'
+
+    VIDEO_ROOT_ANET = 'data/activitynet/videos_3fps_480_noaudio'
+    VIDEO_ROOT_QVHL = 'data/qvhighlights/videos_3fps_480_noaudio'
+
+    DURATIONS_ANET = 'data/activitynet/durations.json'
+    DURATIONS_QVHL = 'data/qvhighlights/durations.json'
+
+    SOURCE = 'rextime'
+    DATA_TYPE = 'multimodal'
+
+    UNIT = 1.0
+    MIN_LEN = 64
+
+    @classmethod
+    def load_annos(self, split='train'):
+        if split == 'train':
+            raw_annos = nncore.load(self.ANNO_PATH_TRAIN)
+        elif split == 'valid':
+            raw_annos = nncore.load(self.ANNO_PATH_VALID)
+        else:
+            print('WARNING: Test split does not have ground truth annotations')
+            raw_annos = nncore.load(self.ANNO_PATH_TEST)
+
+        durations_anet = nncore.load(self.DURATIONS_ANET)
+        durations_qvhl = nncore.load(self.DURATIONS_QVHL)
+
+        annos = []
+        for raw_anno in raw_annos:
+            vid = raw_anno['vid']
+
+            if len(vid) == 13:
+                video_path = nncore.join(self.VIDEO_ROOT_ANET, vid + '.mp4')
+                duration = durations_anet[vid]
+            else:
+                video_path = nncore.join(self.VIDEO_ROOT_QVHL, vid + '.mp4')
+                duration = durations_qvhl[vid]
+
+            anno = dict(
+                source=self.SOURCE,
+                data_type=self.DATA_TYPE,
+                video_path=video_path,
+                duration=duration,
+                query=parse_query(raw_anno['question']),
+                question=parse_question(raw_anno['question']),
+                options=[o.capitalize() for o in raw_anno['options']],
+                answer=raw_anno['answer'].replace('From <s0> to <e0>, ', '').capitalize(),
+                ans=raw_anno['ans'],
+                span=[raw_anno['span']],
+                task=raw_anno['category'])
+
+            annos.append(anno)
+
+        return annos
+
+
+@DATASETS.register(name='rextime_crop')
+class ReXTimeCropDataset(AnsweringCropDataset, ReXTimeDataset):
+
+    SOURCE = 'rextime_crop'
+
+
+@DATASETS.register(name='rextime_grounding')
+class ReXTimeGroundingDataset(GroundingDataset, ReXTimeDataset):
+
+    SOURCE = 'rextime_grounding'
+    DATA_TYPE = 'grounding'
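
ReXTime mixes ActivityNet and QVHighlights videos and routes each sample by the length of its video id: ActivityNet ids are 'v_' plus an 11-character YouTube id, i.e. exactly 13 characters, which is consistent with the len(vid) == 13 check, while QVHighlights ids are longer. Hypothetical ids illustrating the check (both values are made up):

anet_vid = 'v_abcdefghijk'             # 'v_' + 11 characters -> len 13 -> VIDEO_ROOT_ANET
qvhl_vid = 'j7rJstUXPf8_360.0_510.0'   # QVHighlights-style id -> len != 13 -> VIDEO_ROOT_QVHL

print(len(anet_vid), len(qvhl_vid))    # 13 23
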