Spaces:

BenkHel
/

CumoThesis

Runtime error

App Files Files Community

CumoThesis / cumo /eval /mmmu_utils /data_utils.py

BenkHel

Upload 43 files

d1f015b verified 5 months ago

raw

history blame contribute delete

6.2 kB

	"""Utils for data load, save, and process (e.g., prompt construction)"""

	import ast
	import json
	import os
	import re

	import yaml


	# Maps each top-level domain name to the list of subject names grouped under it.
	DOMAIN_CAT2SUB_CAT = {
	    'Art and Design': [
	        'Art',
	        'Art_Theory',
	        'Design',
	        'Music',
	    ],
	    'Business': [
	        'Accounting',
	        'Economics',
	        'Finance',
	        'Manage',
	        'Marketing',
	    ],
	    'Science': [
	        'Biology',
	        'Chemistry',
	        'Geography',
	        'Math',
	        'Physics',
	    ],
	    'Health and Medicine': [
	        'Basic_Medical_Science',
	        'Clinical_Medicine',
	        'Diagnostics_and_Laboratory_Medicine',
	        'Pharmacy',
	        'Public_Health',
	    ],
	    'Humanities and Social Science': [
	        'History',
	        'Literature',
	        'Sociology',
	        'Psychology',
	    ],
	    'Tech and Engineering': [
	        'Agriculture',
	        'Architecture_and_Engineering',
	        'Computer_Science',
	        'Electronics',
	        'Energy_and_Power',
	        'Materials',
	        'Mechanical_Engineering',
	    ],
	}


	# Maps each abbreviated subject key to the full subject name used in
	# DOMAIN_CAT2SUB_CAT.
	CAT_SHORT2LONG = {
	    'acc': 'Accounting',
	    'agri': 'Agriculture',
	    'arch': 'Architecture_and_Engineering',
	    'art': 'Art',
	    'art_theory': 'Art_Theory',
	    'bas_med': 'Basic_Medical_Science',
	    'bio': 'Biology',
	    'chem': 'Chemistry',
	    'cli_med': 'Clinical_Medicine',
	    'cs': 'Computer_Science',
	    'design': 'Design',
	    'diag_med': 'Diagnostics_and_Laboratory_Medicine',
	    'econ': 'Economics',
	    'elec': 'Electronics',
	    'ep': 'Energy_and_Power',
	    'fin': 'Finance',
	    'geo': 'Geography',
	    'his': 'History',
	    'liter': 'Literature',
	    'manage': 'Manage',
	    'mark': 'Marketing',
	    'mate': 'Materials',
	    'math': 'Math',
	    'mech': 'Mechanical_Engineering',
	    'music': 'Music',
	    'phar': 'Pharmacy',
	    'phys': 'Physics',
	    'psy': 'Psychology',
	    'pub_health': 'Public_Health',
	    'socio': 'Sociology',
	}

	# DATA SAVING
	def save_json(filename, ds):
	    """Write ds to filename as JSON with a 4-space indent, replacing any existing file."""
	    # NOTE: an identical save_json is defined again later in this module.
	    with open(filename, 'w') as out_file:
	        json.dump(ds, fp=out_file, indent=4)


	def get_multi_choice_info(options):
	    """
	    Label the options of a multiple-choice question with consecutive letters.

	    The first option is labelled 'A', the second 'B', and so on.
	    Returns a tuple (index2ans, all_choices): a dict from letter to option
	    and the list of letters in order.
	    """
	    first_code = ord('A')
	    labelled = [(chr(first_code + offset), text) for offset, text in enumerate(options)]
	    index2ans = dict(labelled)
	    all_choices = [letter for letter, _ in labelled]
	    return index2ans, all_choices

	def load_yaml(file_path):
	    """
	    Parse the YAML file at file_path with yaml.safe_load.

	    Args:
	        file_path (str): Path of the YAML file to read.

	    Returns:
	        The parsed document, or None when the file is not valid YAML
	        (the parser error is printed in that case).
	    """
	    # Bind the result up front: without this, a YAMLError left the name
	    # unbound and the return statement raised UnboundLocalError, hiding the
	    # real parse problem.
	    yaml_dict = None
	    with open(file_path, 'r') as stream:
	        try:
	            yaml_dict = yaml.safe_load(stream)
	        except yaml.YAMLError as exc:
	            print(exc)
	    return yaml_dict


	def parse_img_path(text):
	    """Return the paths quoted inside every <img='...'> tag in text, in order of appearance."""
	    return [tag.group(1) for tag in re.finditer("<img='(.*?)'>", text)]

	def process_single_sample(data):
	    """
	    Reduce a raw sample to the fields id, question, options, answer, image and question_type.

	    'image' is None when the options reference more than one <img='...'>
	    path in total; otherwise it is data['image_1'].
	    """
	    question = data['question']
	    # Gather every image path referenced by any option.
	    option_image_paths = [
	        path
	        for option_text in data['options']
	        for path in parse_img_path(option_text)
	    ]
	    several_option_images = len(option_image_paths) > 1
	    return {
	        'id': data['id'],
	        'question': question,
	        'options': data['options'],
	        'answer': data['answer'],
	        # data['image_1'] is only read when it is actually needed.
	        'image': None if several_option_images else data['image_1'],
	        'question_type': data['question_type'],
	    }


	# DATA SAVING
	def save_json(filename, ds):
	    """Dump ds as 4-space-indented JSON into filename, overwriting it."""
	    # NOTE: this repeats the save_json defined earlier in this module; being
	    # the later definition, it is the one bound after import.
	    with open(filename, 'w') as handle:
	        json.dump(ds, handle, indent=4)

	def save_jsonl(filename, data):
	    """
	    Write data to a JSON Lines file, one single-key object per entry.

	    Each line maps the base name of an image path (directory part removed)
	    to its caption. Non-ASCII characters are written as-is in UTF-8.

	    Args:
	        filename (str): Destination path; an existing file is overwritten.
	        data (dict): Mapping from image path to caption.
	    """
	    with open(filename, 'w', encoding='utf-8') as out_file:
	        out_file.writelines(
	            json.dumps({os.path.basename(image_path): text}, ensure_ascii=False) + '\n'
	            for image_path, text in data.items()
	        )

	def save_args(args, path_dir):
	    """
	    Dump every attribute of args to a setting.txt file inside path_dir.

	    The file holds a start banner, one "name : value" line per attribute
	    (in attribute order), and an end banner without a trailing newline.

	    Args:
	        args: Object whose instance attributes are written, e.g. an
	            argparse.Namespace.
	        path_dir (str): Directory that receives setting.txt. A trailing
	            path separator is optional.
	    """
	    # Join instead of concatenating: plain concatenation wrote to
	    # "<path_dir>setting.txt" (a sibling of the directory) whenever the
	    # caller omitted the trailing separator.
	    settings_path = os.path.join(path_dir, 'setting.txt')
	    # Explicit encoding so non-ASCII values do not depend on the locale.
	    with open(settings_path, 'w', encoding='utf-8') as f:
	        f.write('------------------ start ------------------' + '\n')
	        for arg_name, value in vars(args).items():
	            f.write(f'{arg_name} : {value!s}\n')
	        f.write('------------------- end -------------------')



	# DATA PROCESSING
	def construct_prompt(sample, config):
	    """
	    Build the prompt text and bookkeeping fields for one sample.

	    Args:
	        sample (dict): Provides 'question', 'options', 'answer' and
	            'question_type'. 'options' is the string form of a Python list
	            literal; an already-parsed list or tuple is accepted as well.
	        config (dict): Provides 'task_instructions' and the template for
	            the sample's type: 'multi_choice_example_format' (formatted
	            with the question and the lettered option block) or
	            'short_ans_example_format' (formatted with the question only).

	    Returns:
	        dict: 'empty_prompt', 'final_input_prompt' and 'gt_content'; for
	        multiple-choice samples also 'index2ans', 'correct_choice' and
	        'all_choices'. All keys of sample are copied in last, so they take
	        precedence over generated keys of the same name. For
	        multiple-choice samples 'gt_content' is None when the answer letter
	        does not name one of the options.

	    Raises:
	        ValueError, SyntaxError: If 'options' is a string that is not a
	            plain Python literal.
	    """
	    question = sample['question']
	    raw_options = sample['options']
	    if isinstance(raw_options, (list, tuple)):
	        options = raw_options
	    else:
	        # NOTE(security): this used to be eval(), which executes arbitrary
	        # code embedded in dataset text. literal_eval accepts only Python
	        # literals and rejects everything else.
	        options = ast.literal_eval(raw_options)

	    if sample['question_type'] == 'multiple-choice':
	        example = ""
	        prediction_range = []
	        index2ans = {}
	        for offset, option in enumerate(options):
	            letter = chr(ord('A') + offset)
	            prediction_range.append(letter)
	            example += f"({letter}) {option}\n"
	            index2ans[letter] = option
	        empty_prompt = config['multi_choice_example_format'].format(question, example)
	        res_dict = {
	            'index2ans': index2ans,
	            'correct_choice': sample['answer'],
	            'all_choices': prediction_range,
	        }
	        # Look the letter up rather than indexing by ord() arithmetic: a
	        # placeholder answer such as '?' used to produce a negative index
	        # and silently return an unrelated option.
	        gt_content = index2ans.get(sample['answer'].upper())
	    else:
	        empty_prompt = config['short_ans_example_format'].format(question)
	        res_dict = {}
	        gt_content = sample['answer']

	    res_dict['empty_prompt'] = empty_prompt
	    if config['task_instructions']:
	        res_dict['final_input_prompt'] = config['task_instructions'].strip() + '\n\n' + empty_prompt
	    else:
	        res_dict['final_input_prompt'] = empty_prompt
	    res_dict['gt_content'] = gt_content

	    res_dict.update(sample)
	    return res_dict