SYSTEM = ''
accumulative_counts = 96
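# Note: accumulative_counts reappears below as strategy.gradient_accumulation_steps.
# With batch_size = 1 per device, the effective global batch size is roughly
# 1 x 96 x (number of GPUs); the actual world size is set by the slurm launch, not in this file.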
allava_cl_data_path = '/data/wenhao/projects/xtuner/data/ALLaVA-4V/ALLaVA-Caption-LAION-4V.jsonl'
allava_cl_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_cl_llama31',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
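# Every xtuner.dataset.CambrianDataset entry in this config follows the same pattern as the one
# above: samples mapped by cambrian_map_fn, images resolved against image_folder, a CLIP image
# processor at 378 px with square padding, and an offline_processed_text_folder that appears to
# hold pre-tokenized conversations so text is not re-tokenized at load time.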
allava_cl_image_folder = '/data/wenhao/projects/xtuner/data/ALLaVA-4V'
allava_cv_data_path = '/data/wenhao/projects/xtuner/data/ALLaVA-4V/ALLaVA-Caption-VFLAN-4V.jsonl'
allava_cv_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_cv_llama31',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
allava_data_root = '/data/wenhao/projects/xtuner/data/ALLaVA-4V'
allava_il_data_path = '/data/wenhao/projects/xtuner/data/ALLaVA-4V/ALLaVA-Instruct-LAION-4V.jsonl'
allava_il_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_il_llama31',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
allava_il_image_folder = '/data/wenhao/projects/xtuner/data/ALLaVA-4V'
allava_image_folder = '/data/wenhao/projects/xtuner/data/ALLaVA-4V'
allava_iv_data_path = '/data/wenhao/projects/xtuner/data/ALLaVA-4V/ALLaVA-Instruct-VFLAN-4V.jsonl'
allava_iv_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_iv_llama31',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
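# The four ALLaVA-4V splits above (Caption-LAION, Caption-VFLAN, Instruct-LAION, Instruct-VFLAN)
# differ only in their pre-token cache folder (pre_token_{cl,cv,il,iv}_llama31); the matching
# jsonl paths are kept in the allava_*_data_path variables.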
batch_size = 1
betas = (
    0.9,
    0.999,
)
cambrian_data_path = '/data/wenhao/projects/xtuner/data/Cambrian-10M/jsons/Cambrian7M_withsystemprompt.jsonl'
cambrian_data_root = '/data/wenhao/projects/xtuner/data/Cambrian-10M/'
cambrian_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/Cambrian-10M/',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/Cambrian-10M/pre_token_llama3',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
cambrian_image_folder = '/data/wenhao/projects/xtuner/data/Cambrian-10M/'
cambrian_processed_text_folder = '/data/wenhao/projects/xtuner/data/Cambrian-10M/pre_token_llama3'
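# cambrian_dataset (and dense_dataset below) are defined here but do not appear in the
# train_dataloader concat list further down, so this run seemingly trains without them.
# Also note the cache folder here is pre_token_llama3, versus pre_token_*_llama31 elsewhere.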
custom_hooks = [
    dict(
        tokenizer=dict(
            padding_side='right',
            pretrained_model_name_or_path='meta-llama/Meta-Llama-3-8B-Instruct',
            trust_remote_code=True,
            type='transformers.AutoTokenizer.from_pretrained'),
        type='xtuner.engine.DatasetInfoHook'),
    dict(
        evaluation_images='https://llava-vl.github.io/static/images/view.jpg',
        evaluation_inputs=[
            '请描述一下这张照片',
            'Please describe this picture',
        ],
        every_n_iters=100,
        image_processor=dict(
            crop_size=378,
            pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
            size=378,
            trust_remote_code=True,
            type='transformers.CLIPImageProcessor.from_pretrained'),
        prompt_template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        system='',
        tokenizer=dict(
            padding_side='right',
            pretrained_model_name_or_path='meta-llama/Meta-Llama-3-8B-Instruct',
            trust_remote_code=True,
            type='transformers.AutoTokenizer.from_pretrained'),
        type='xtuner.engine.EvaluateChatHook'),
]
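# DatasetInfoHook logs dataset statistics at startup; EvaluateChatHook generates a sample answer
# every 100 iterations on the reference image above, once with the Chinese prompt (it means
# "Please describe this picture") and once with the English one, using the same Llama-3 chat
# template and empty system prompt as training.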
dataloader_num_workers = 0
default_hooks = dict(
    checkpoint=dict(
        by_epoch=False,
        interval=200,
        max_keep_ckpts=2,
        type='mmengine.hooks.CheckpointHook'),
    logger=dict(interval=10, type='mmengine.hooks.LoggerHook'),
    param_scheduler=dict(type='mmengine.hooks.ParamSchedulerHook'),
    sampler_seed=dict(type='mmengine.hooks.DistSamplerSeedHook'),
    timer=dict(type='mmengine.hooks.IterTimerHook'))
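# Checkpoints are written every 200 iterations (by iteration, not epoch) with only the two most
# recent kept; training logs are emitted every 10 iterations.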
dense_data_path = '/data/wenhao/projects/xtuner/data/DenseFusion-1M/DenseFusion-1M/DenseFusion-1M-instruct.jsonl'
dense_data_root = '/data/wenhao/projects/xtuner/data/DenseFusion-1M/'
dense_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/DenseFusion-1M/1M_data',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/DenseFusion-1M/pre_token_llama3',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
dense_image_folder = '/data/wenhao/projects/xtuner/data/DenseFusion-1M/1M_data'
dense_processed_text_folder = '/data/wenhao/projects/xtuner/data/DenseFusion-1M/pre_token_llama3'
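# As with cambrian_dataset, dense_dataset (DenseFusion-1M) is defined but not referenced by the
# train_dataloader below.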
env_cfg = dict(
    cudnn_benchmark=False,
    dist_cfg=dict(backend='nccl'),
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0))
evaluation_freq = 100
evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg'
evaluation_inputs = [
    '请描述一下这张照片',
    'Please describe this picture',
]
evol_data_path = '/data/wenhao/projects/xtuner/data/ALLaVA-4V/allava_text/Evol-Instruct-GPT4-Turbo-143K.jsonl'
evol_data_root = '/data/wenhao/projects/xtuner/data/ALLaVA-4V/allava_text/'
evol_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/allava_text/images',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_evol_llama31',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
evol_image_folder = '/data/wenhao/projects/xtuner/data/ALLaVA-4V/allava_text/images'
face_data_path = '/data/wenhao/projects/xtuner/data/FaceCaption-15M/FaceCaption-100K.jsonl'
face_data_root = '/data/wenhao/projects/xtuner/data/FaceCaption-15M/'
face_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/FaceCaption-15M/full_data',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/FaceCaption-15M/pre_token_llama3',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
face_image_folder = '/data/wenhao/projects/xtuner/data/FaceCaption-15M/full_data'
face_processed_text_folder = '/data/wenhao/projects/xtuner/data/FaceCaption-15M/pre_token_llama3'
image_processor = dict(
    crop_size=378,
    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
    size=378,
    trust_remote_code=True,
    type='transformers.CLIPImageProcessor.from_pretrained')
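# This shared processor definition (a CLIP ViT-bigG image processor with size/crop 378 px) is the
# same dict repeated inside every dataset entry; together with pad_image_to_square=True each image
# is presumably padded to a square before being resized to 378x378.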
launcher = 'slurm'
llava_mix_data_path = '/data/wenhao/projects/xtuner/data/LLaVA-Instruct-150K/llava_v1_5_mix665k.jsonl'
llava_mix_data_root = '/data/wenhao/projects/xtuner/data/LLaVA-Instruct-150K/'
llava_mix_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/LLaVA-Instruct-150K/images',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/LLaVA-Instruct-150K/pre_token_llama31',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
llava_mix_image_folder = '/data/wenhao/projects/xtuner/data/LLaVA-Instruct-150K/images'
llavanext_data_path = '/data/wenhao/projects/xtuner/data/LLaVA-NeXT-Data/llava_next.jsonl'
llavanext_data_root = '/data/wenhao/projects/xtuner/data/LLaVA-NeXT-Data/'
llavanext_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/LLaVA-NeXT-Data/images',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/LLaVA-NeXT-Data/pre_token_llama31',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
llavanext_image_folder = '/data/wenhao/projects/xtuner/data/LLaVA-NeXT-Data/images'
llm_name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct'
load_from = 'work_dirs/new_image/iter_449600.pth'
log_level = 'INFO'
lr = 1e-05
m4_data_path = '/data/wenhao/projects/xtuner/data/M4-Instruct-Data/m4_instruct_image.jsonl'
m4_data_root = '/data/wenhao/projects/xtuner/data/M4-Instruct-Data/'
m4_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data/M4-Instruct-Data/',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/M4-Instruct-Data/pre_token_llama31',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
m4_image_folder = '/data/wenhao/projects/xtuner/data/M4-Instruct-Data/'
max_epochs = 1
max_length = 4096
max_norm = 1
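# Training hyper-parameters: a single epoch at lr 1e-5, sequences truncated to 4096 tokens,
# gradient-norm clipping at 1. load_from points at iter_449600.pth and resume=True below, so this
# run is expected to continue from that checkpoint rather than start fresh.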
model = dict(
    freeze_llm=False,
    freeze_visual_encoder=False,
    llm=dict(
        pretrained_model_name_or_path='meta-llama/Meta-Llama-3-8B-Instruct',
        torch_dtype='torch.float16',
        trust_remote_code=True,
        type='transformers.AutoModelForCausalLM.from_pretrained'),
    llm_lora=dict(
        bias='none',
        lora_alpha=256,
        lora_dropout=0.05,
        r=512,
        task_type='CAUSAL_LM',
        type='peft.LoraConfig'),
    pretrained_pth='/data/wenhao/projects/xtuner/work_dirs/final_new_v/projector',
    type='xtuner.model.PikaModel',
    visual_encoder=dict(
        pretrained_model_name_or_path='/data/wenhao/projects/xtuner/work_dirs/final_new_v/visual_encoder',
        type='xtuner.model.pika.PikaSigVidEncoder.from_pretrained',
        visual_token_merge_ratio=0.1))
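# PikaModel pairs Llama-3-8B-Instruct (adapted through the LoRA config: r=512, alpha=256,
# dropout 0.05) with a PikaSigVidEncoder visual tower that is left unfrozen
# (freeze_visual_encoder=False). pretrained_pth loads projector weights from an earlier run, and
# visual_token_merge_ratio=0.1 presumably keeps roughly 10% of the visual tokens after merging.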
optim_type = 'torch.optim.AdamW'
optim_wrapper = dict(
    optimizer=dict(
        betas=(
            0.9,
            0.999,
        ),
        lr=1e-05,
        type='torch.optim.AdamW',
        weight_decay=0),
    type='DeepSpeedOptimWrapper')
param_scheduler = [
    dict(
        begin=0,
        by_epoch=True,
        convert_to_iter_based=True,
        end=0.03,
        start_factor=1e-05,
        type='mmengine.optim.LinearLR'),
    dict(
        T_max=1,
        begin=0.03,
        by_epoch=True,
        convert_to_iter_based=True,
        eta_min=0.0,
        type='mmengine.optim.CosineAnnealingLR'),
]
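# Schedule: with by_epoch=True and convert_to_iter_based=True, begin/end are fractions of the
# single epoch, i.e. a linear warmup over roughly the first 3% of iterations (warmup_ratio=0.03)
# followed by cosine annealing down to 0 for the remainder.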
pretrained_pth = '/data/wenhao/projects/xtuner/work_dirs/final_new_v/projector'
prompt_template = 'xtuner.utils.PROMPT_TEMPLATE.llama3_chat'
randomness = dict(deterministic=False, seed=1416244085)
resume = True
runner_type = 'FlexibleRunner'
save_steps = 200
save_total_limit = 2
sharegpt4v_data_path = '/data/wenhao/projects/xtuner/data/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.jsonl'
sharegpt4v_data_root = '/data/wenhao/projects/xtuner/data/ShareGPT4V'
sharegpt4v_dataset = dict(
    dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
    image_folder='/data/wenhao/projects/xtuner/data',
    image_processor=dict(
        crop_size=378,
        pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
        size=378,
        trust_remote_code=True,
        type='transformers.CLIPImageProcessor.from_pretrained'),
    max_length=4096,
    offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ShareGPT4V/pre_token_sharegpt4v_llama31',
    pad_image_to_square=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    type='xtuner.dataset.CambrianDataset')
sharegpt4v_image_folder = '/data/wenhao/projects/xtuner/data'
size = 378
strategy = dict(
    config=dict(
        bf16=dict(enabled=True),
        fp16=dict(enabled=False, initial_scale_power=16),
        gradient_accumulation_steps='auto',
        gradient_clipping='auto',
        train_micro_batch_size_per_gpu='auto',
        zero_allow_untested_optimizer=True,
        zero_force_ds_cpu_optimizer=False,
        zero_optimization=dict(overlap_comm=True, stage=2)),
    exclude_frozen_parameters=True,
    gradient_accumulation_steps=96,
    gradient_clipping=1,
    sequence_parallel_size=1,
    train_micro_batch_size_per_gpu=1,
    type='xtuner.engine.DeepSpeedStrategy')
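# DeepSpeed ZeRO stage 2 with bf16; the 'auto' entries in the inner config appear to be filled
# from the outer keys (gradient_accumulation_steps=96, gradient_clipping=1, micro batch 1 per
# GPU). exclude_frozen_parameters=True should keep frozen weights out of the saved engine state.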
tokenizer = dict(
    padding_side='right',
    pretrained_model_name_or_path='meta-llama/Meta-Llama-3-8B-Instruct',
    trust_remote_code=True,
    type='transformers.AutoTokenizer.from_pretrained')
train_cfg = dict(by_epoch=True, max_epochs=1, val_interval=1)
train_dataloader = dict(
    batch_size=1,
    collate_fn=dict(type='xtuner.dataset.collate_fns.default_collate_fn'),
    dataset=dict(
        datasets=[
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data/M4-Instruct-Data/',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/M4-Instruct-Data/pre_token_llama31',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data/LLaVA-NeXT-Data/images',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/LLaVA-NeXT-Data/pre_token_llama31',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_cl_llama31',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_cv_llama31',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_il_llama31',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_iv_llama31',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data/LLaVA-Instruct-150K/images',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/LLaVA-Instruct-150K/pre_token_llama31',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/allava_text/images',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_evol_llama31',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ShareGPT4V/pre_token_sharegpt4v_llama31',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
            dict(
                dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
                image_folder='/data/wenhao/projects/xtuner/data/FaceCaption-15M/full_data',
                image_processor=dict(
                    crop_size=378,
                    pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                    size=378,
                    trust_remote_code=True,
                    type='transformers.CLIPImageProcessor.from_pretrained'),
                max_length=4096,
                offline_processed_text_folder='/data/wenhao/projects/xtuner/data/FaceCaption-15M/pre_token_llama3',
                pad_image_to_square=True,
                template_map_fn=dict(
                    template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                    type='xtuner.dataset.map_fns.template_map_fn_factory'),
                type='xtuner.dataset.CambrianDataset'),
        ],
        type='xtuner.dataset.ConcatDataset'),
    num_workers=0,
    sampler=dict(shuffle=True, type='mmengine.dataset.DefaultSampler'))
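# The training stream is a ConcatDataset over ten sources (M4-Instruct, LLaVA-NeXT, the four
# ALLaVA-4V splits, the LLaVA-1.5 665K mix, Evol-Instruct text, ShareGPT4V, FaceCaption-100K),
# shuffled by the default sampler, with one sample per GPU per step and no dataloader worker
# subprocesses (num_workers=0).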
train_dataset = dict(
    datasets=[
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data/M4-Instruct-Data/',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/M4-Instruct-Data/pre_token_llama31',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data/LLaVA-NeXT-Data/images',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/LLaVA-NeXT-Data/pre_token_llama31',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_cl_llama31',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_cv_llama31',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_il_llama31',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_iv_llama31',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data/LLaVA-Instruct-150K/images',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/LLaVA-Instruct-150K/pre_token_llama31',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/allava_text/images',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ALLaVA-4V/pre_token_evol_llama31',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/ShareGPT4V/pre_token_sharegpt4v_llama31',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
        dict(
            dataset_map_fn='xtuner.dataset.map_fns.cambrian_map_fn',
            image_folder='/data/wenhao/projects/xtuner/data/FaceCaption-15M/full_data',
            image_processor=dict(
                crop_size=378,
                pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
                size=378,
                trust_remote_code=True,
                type='transformers.CLIPImageProcessor.from_pretrained'),
            max_length=4096,
            offline_processed_text_folder='/data/wenhao/projects/xtuner/data/FaceCaption-15M/pre_token_llama3',
            pad_image_to_square=True,
            template_map_fn=dict(
                template='xtuner.utils.PROMPT_TEMPLATE.llama3_chat',
                type='xtuner.dataset.map_fns.template_map_fn_factory'),
            type='xtuner.dataset.CambrianDataset'),
    ],
    type='xtuner.dataset.ConcatDataset')
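# train_dataset repeats the same ConcatDataset definition that train_dataloader already embeds;
# this duplication appears to be an artifact of how mmengine dumps fully resolved config variables.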
visual_encoder_name_or_path = '/data/wenhao/projects/xtuner/work_dirs/final_new_v/visual_encoder'
visual_token_merge_ratio = 0.1
visualizer = None
warmup_ratio = 0.03
weight_decay = 0
work_dir = 'work_dirs/new_image'
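# For reference, a dumped config like this is normally launched through xtuner's CLI. A minimal
# sketch, assuming a slurm allocation (launcher='slurm') and that the file is saved under the
# hypothetical name finetune_llama3_pika.py:
#
#   srun -p <partition> --gres=gpu:8 --ntasks=8 --ntasks-per-node=8 \
#       xtuner train finetune_llama3_pika.py --launcher slurm
#
# The partition and GPU count are assumptions, not part of this dump; the DeepSpeed strategy is
# already embedded in the config above, so no separate deepspeed preset should be required.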