| { | |
| "TextEncoders": { | |
| "bert": { | |
| "config": "configs/config_bert.json", | |
| "d_model": 768, | |
| "fusion_layer": 9, | |
| "name": "bert_base", | |
| "pretrained": "bert-base-uncased" | |
| }, | |
| "bert_large": { | |
| "config": "configs/config_bert_large.json", | |
| "d_model": 1024, | |
| "fusion_layer": 19, | |
| "name": "bert_large", | |
| "pretrained": "bert-large-uncased" | |
| }, | |
| "med_bert": { | |
| "config": "configs/med_config.json", | |
| "d_model": 768, | |
| "name": "med_bert_base", | |
| "pretrained": "bert-base-uncased" | |
| }, | |
| "med_bert_large": { | |
| "config": "configs/med_large_config.json", | |
| "d_model": 1024, | |
| "name": "med_bert_large", | |
| "pretrained": "bert-large-uncased" | |
| } | |
| }, | |
| "VisionEncoders": {}, | |
| "architectures": [ | |
| "InternVideo2_Stage2" | |
| ], | |
| "auto_map": { | |
| "AutoConfig": "modeling_internvideo2.InternVideo2_Stage2_Config", | |
| "AutoModel": "modeling_internvideo2.InternVideo2_Stage2" | |
| }, | |
| "auto_resume": true, | |
| "available_corpus": { | |
| "anet_ret_train": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "is_paragraph_retrieval": true, | |
| "max_txt_l": 150, | |
| "media_type": "video" | |
| }, | |
| "anet_ret_val": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "is_paragraph_retrieval": true, | |
| "max_txt_l": 150, | |
| "media_type": "video" | |
| }, | |
| "audiocaps_ret_test": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "audio" | |
| }, | |
| "audiocaps_ret_train": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "audio" | |
| }, | |
| "cc12m": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "image" | |
| }, | |
| "cc3m": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "image" | |
| }, | |
| "cc3m_debug": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "image" | |
| }, | |
| "charades_mc_test": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "video" | |
| }, | |
| "clothov1_ret_test": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "audio" | |
| }, | |
| "clothov1_ret_train": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "audio" | |
| }, | |
| "clothov2_ret_test": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "audio" | |
| }, | |
| "clothov2_ret_train": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "audio" | |
| }, | |
| "coco": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "jump_filter": true, | |
| "media_type": "image" | |
| }, | |
| "data_25m": [ | |
| { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "video" | |
| }, | |
| { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "image" | |
| }, | |
| { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "jump_filter": true, | |
| "media_type": "image" | |
| }, | |
| { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "jump_filter": true, | |
| "media_type": "image" | |
| }, | |
| { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "image" | |
| }, | |
| { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "image" | |
| } | |
| ], | |
| "debug": [ | |
| { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "image" | |
| }, | |
| { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "video" | |
| } | |
| ], | |
| "didemo_ret_test": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "is_paragraph_retrieval": true, | |
| "max_txt_l": 64, | |
| "media_type": "video", | |
| "trimmed30": true | |
| }, | |
| "didemo_ret_train": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "is_paragraph_retrieval": true, | |
| "max_txt_l": 64, | |
| "media_type": "video", | |
| "trimmed30": true | |
| }, | |
| "didemo_ret_val": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "is_paragraph_retrieval": true, | |
| "max_txt_l": 64, | |
| "media_type": "video", | |
| "trimmed30": true | |
| }, | |
| "hmdb51_act_val": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "is_act_rec": true, | |
| "media_type": "video" | |
| }, | |
| "internvid_v1": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "jump_filter": true, | |
| "media_type": "video" | |
| }, | |
| "internvid_v2_avs_private": { | |
| "anno_path": "your_path", | |
| "caption_augmentation": { | |
| "caption_sample_type": "avs_all" | |
| }, | |
| "data_root": "", | |
| "jump_filter": true, | |
| "media_type": "audio_video", | |
| "read_audio_from_video": true, | |
| "read_clip_from_video": false, | |
| "zero_audio_padding_for_video": true | |
| }, | |
| "k400_act_val": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "is_act_rec": true, | |
| "media_type": "video" | |
| }, | |
| "k600_act_val": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "is_act_rec": true, | |
| "media_type": "video" | |
| }, | |
| "k700_act_val": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "is_act_rec": true, | |
| "media_type": "video" | |
| }, | |
| "laion_2b": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "jump_filter": true, | |
| "media_type": "image" | |
| }, | |
| "laion_coco": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "jump_filter": true, | |
| "media_type": "image" | |
| }, | |
| "laion_pop": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "jump_filter": true, | |
| "media_type": "image" | |
| }, | |
| "lsmdc_ret_test_1000": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "video" | |
| }, | |
| "lsmdc_ret_train": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "max_txt_l": 96, | |
| "media_type": "video" | |
| }, | |
| "lsmdc_ret_val": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "max_txt_l": 96, | |
| "media_type": "video" | |
| }, | |
| "mit_act_val": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "is_act_rec": true, | |
| "media_type": "video" | |
| }, | |
| "msrvtt_1k_test": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "video" | |
| }, | |
| "msrvtt_ret_test1k": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "video" | |
| }, | |
| "msrvtt_ret_train9k": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "video" | |
| }, | |
| "msvd_ret_test": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "max_txt_l": 64, | |
| "media_type": "video" | |
| }, | |
| "msvd_ret_train": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "has_multi_txt_gt": true, | |
| "max_txt_l": 64, | |
| "media_type": "video" | |
| }, | |
| "msvd_ret_val": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "max_txt_l": 64, | |
| "media_type": "video" | |
| }, | |
| "pretrain_example_data_1B": [ | |
| { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "image" | |
| }, | |
| { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "video" | |
| } | |
| ], | |
| "pretrain_example_data_6B": [ | |
| { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "image" | |
| }, | |
| { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "video" | |
| }, | |
| { | |
| "anno_path": "your_path", | |
| "caption_augmentation": { | |
| "caption_sample_type": "avs_all" | |
| }, | |
| "data_root": "", | |
| "jump_filter": true, | |
| "media_type": "audio_video", | |
| "read_audio_from_video": true, | |
| "read_clip_from_video": false, | |
| "zero_audio_padding_for_video": true | |
| } | |
| ], | |
| "sbu": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "image" | |
| }, | |
| "ssv2_mc_val": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "video" | |
| }, | |
| "ucf101_act_val": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "is_act_rec": true, | |
| "media_type": "video" | |
| }, | |
| "vatex_ch_ret_val": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "video" | |
| }, | |
| "vatex_en_ret_train": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "has_multi_txt_gt": true, | |
| "media_type": "video" | |
| }, | |
| "vatex_en_ret_val": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "video" | |
| }, | |
| "vg": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "jump_filter": true, | |
| "media_type": "image" | |
| }, | |
| "wavcaps_400k": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "audio" | |
| }, | |
| "webvid": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "video" | |
| }, | |
| "webvid_10m": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "video" | |
| }, | |
| "webvid_debug": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "media_type": "video" | |
| }, | |
| "webvid_fuse_10m": { | |
| "anno_path": "your_path", | |
| "data_root": "", | |
| "jump_filter": true, | |
| "media_type": "video" | |
| } | |
| }, | |
| "batch_size": 8, | |
| "batch_size_test": 4, | |
| "compile_model": false, | |
| "debug": false, | |
| "deep_fusion": false, | |
| "deepspeed": { | |
| "enable": true, | |
| "stage": 1 | |
| }, | |
| "device": "cuda", | |
| "dist_url": "env://", | |
| "evaluate": true, | |
| "evaluation": { | |
| "eval_frame_ensemble": "concat", | |
| "eval_offload": true, | |
| "eval_x_only": false, | |
| "k_test": 128 | |
| }, | |
| "gradient_checkpointing": true, | |
| "inputs": { | |
| "batch_size": { | |
| "image": 8, | |
| "video": 8 | |
| }, | |
| "batch_size_test": { | |
| "image": 4, | |
| "video": 4 | |
| }, | |
| "image_res": 224, | |
| "max_txt_l": { | |
| "image": 40, | |
| "video": 40 | |
| }, | |
| "video_input": { | |
| "num_frames": 4, | |
| "num_frames_test": 4, | |
| "random_aug": false, | |
| "sample_type": "rand", | |
| "sample_type_test": "middle" | |
| } | |
| }, | |
| "jump_evaluate": false, | |
| "log_freq": 100, | |
| "max_txt_l": 40, | |
| "mode": "pt", | |
| "model": { | |
| "embed_dim": 512, | |
| "find_unused_parameters": false, | |
| "model_cls": "InternVideo2_Stage2", | |
| "multimodal": { | |
| "enable": true | |
| }, | |
| "temp": 0.07, | |
| "text_encoder": { | |
| "config": "configs/config_bert_large.json", | |
| "d_model": 1024, | |
| "fusion_layer": 19, | |
| "name": "bert_large", | |
| "pretrained": "bert-large-uncased" | |
| }, | |
| "vision_encoder": { | |
| "checkpoint_num": 40, | |
| "clip_embed_dim": 768, | |
| "clip_input_resolution": 224, | |
| "clip_norm_type": "l2", | |
| "clip_return_layer": 6, | |
| "clip_student_return_interval": 1, | |
| "clip_teacher": null, | |
| "clip_teacher_embed_dim": 3200, | |
| "clip_teacher_final_dim": 768, | |
| "clip_teacher_return_interval": 1, | |
| "d_model": 1408, | |
| "image_mask_ratio": 0.5, | |
| "image_mask_type": "random", | |
| "img_size": 224, | |
| "keep_temporal": false, | |
| "name": "pretrain_internvideo2_6b_patch14_224", | |
| "num_frames": 4, | |
| "only_mask": true, | |
| "patch_size": 14, | |
| "pretrained": "", | |
| "sep_image_video_pos_embed": true, | |
| "tubelet_size": 1, | |
| "use_checkpoint": true, | |
| "use_flash_attn": false, | |
| "use_fused_mlp": false, | |
| "use_fused_rmsnorm": false, | |
| "video_mask_ratio": 0.8, | |
| "video_mask_type": "random" | |
| } | |
| }, | |
| "num_frames": 4, | |
| "num_frames_test": 4, | |
| "num_workers": 6, | |
| "origin_num_frames": 4, | |
| "output_dir": null, | |
| "resume": false, | |
| "save_latest": false, | |
| "seed": 42, | |
| "size_t": 224, | |
| "text_enc": "bert_large", | |
| "torch_dtype": "float32", | |
| "transformers_version": "4.37.2", | |
| "use_bf16": false, | |
| "use_flash_sdp": false, | |
| "use_half_precision": false, | |
| "use_mem_efficient_sdp": false | |
| } | |