"""Gradio app that collects benchmark questions and pushes them to the Hub.

Every submission is stored in its own folder under ``./data`` (uploaded
images plus two JSON descriptions of the item). A ``CommitScheduler``
periodically commits that folder to the ``taesiri/EdgeQuest`` dataset repo.
"""

import base64
import json
import os
import shutil
import uuid

import gradio as gr
from huggingface_hub import CommitScheduler, HfApi

# Folder watched by the CommitScheduler; one sub-folder is created per request.
DATA_FOLDER = "./data"
DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant"

# Fail fast (KeyError) when the token is not configured, as the original did.
HF_TOKEN = os.environ["HF_TOKEN"]

# The token is handed to the clients explicitly instead of calling a login
# method on the HfApi instance.
api = HfApi(token=HF_TOKEN)

scheduler = CommitScheduler(
    repo_id="taesiri/EdgeQuest",
    repo_type="dataset",
    folder_path=DATA_FOLDER,
    path_in_repo="data",
    every=1,
    token=HF_TOKEN,
)

# NOTE(review): the table rows below are empty in the source as received --
# confirm the intended list of required/optional fields.
INSTRUCTIONS_HTML = """
Welcome to the Hugging Face space for collecting questions for new benchmark datasets.
| Required Fields | Optional Fields |
|---|---|
|
|
While not all fields are mandatory, providing additional context through optional fields will help create a more comprehensive dataset. After submitting a question, you can clear up the form to submit another one.
"""


def _safe_str(val):
    """Return *val* unchanged, or an empty string when it is ``None``."""
    return val if val is not None else ""


def _parse_categories(raw):
    """Split a comma-separated tag string into a list of non-empty tags."""
    if not raw:
        return []
    stripped = (part.strip() for part in raw.split(","))
    # Skip blanks produced by stray or trailing commas ("a,,b,").
    return [tag for tag in stripped if tag]


def _save_images(labelled_images, request_folder):
    """Store every provided image in *request_folder* as ``<label>.png``.

    Returns a list of ``(label, saved_path)`` tuples for the images that were
    actually provided (``None`` entries are skipped).
    """
    saved = []
    for img_label, img_obj in labelled_images:
        if img_obj is None:
            continue
        target_path = os.path.join(request_folder, f"{img_label}.png")
        if isinstance(img_obj, str):
            # The component handed us a file path: copy the file as-is.
            shutil.copy2(img_obj, target_path)
        else:
            # Anything else is passed to Gradio's image writer
            # (presumably a numpy array -- TODO confirm).
            gr.processing_utils.save_image(img_obj, target_path)
        saved.append((img_label, target_path))
    return saved


def _build_image_entries(saved_images):
    """Build the ``image_url`` entries for both JSON flavours.

    Returns ``(url_entries, base64_entries)``: the first references each image
    by its path relative to the request folder, the second embeds the file
    content base64-encoded.
    """
    url_entries = []
    base64_entries = []
    for img_label, file_path in saved_images:
        rel_path = os.path.join(".", os.path.basename(file_path))
        url_entries.append(
            {
                "type": "image_url",
                "label": img_label,
                "image_url": {"url": {"data:image/png;path": rel_path}},
            }
        )

        with open(file_path, "rb") as f:
            img_b64_str = base64.b64encode(f.read()).decode("utf-8")
        base64_entries.append(
            {
                "type": "image_url",
                "label": img_label,
                "image_url": {"url": {"data:image/png;base64": img_b64_str}},
            }
        )
    return url_entries, base64_entries


def _entries_with_label(entries, fragment):
    """Return the entries whose ``label`` contains *fragment*."""
    return [entry for entry in entries if fragment in entry.get("label", "")]


def generate_json_files(
    system_message,
    # New fields
    name,
    email_address,
    institution,
    openreview_profile,
    question_categories,
    subquestion_1_text,
    subquestion_1_answer,
    subquestion_2_text,
    subquestion_2_answer,
    # Existing fields
    question,
    final_answer,
    rationale_text,
    # Question images
    image1,
    image2,
    image3,
    image4,
    # Rationale images
    rationale_image1,
    rationale_image2,
):
    """Persist one submission and return the paths of its two JSON files.

    For each request:
      1) Create a unique folder under ./data/
      2) Copy uploaded images (question + rationale) into that folder
      3) Produce two JSON files:
         - request_urls.json   (local file paths in content)
         - request_base64.json (base64-encoded images in content)
      4) Return paths to both files

    All writes happen while holding ``scheduler.lock`` so that the background
    commit never picks up a partially written submission.

    Args:
        system_message: System prompt; falls back to the default when empty.
        name, email_address, institution, openreview_profile: Author
            metadata, stored at the top level of both JSON documents.
        question_categories: Comma-separated tags; stored as a list.
        subquestion_1_text, subquestion_1_answer, subquestion_2_text,
            subquestion_2_answer: Optional sub-questions and their answers.
        question, final_answer, rationale_text: Main question and its answer.
        image1..image4: Question images (file path, other image object, or
            ``None``).
        rationale_image1, rationale_image2: Rationale images, same convention.

    Returns:
        Tuple ``(urls_json_path, base64_json_path)``.
    """
    if not system_message:
        system_message = DEFAULT_SYSTEM_MESSAGE

    # Normalise missing text fields to empty strings.
    name = _safe_str(name)
    email_address = _safe_str(email_address)
    institution = _safe_str(institution)
    openreview_profile = _safe_str(openreview_profile)
    question_categories = _parse_categories(question_categories)
    subquestion_1_text = _safe_str(subquestion_1_text)
    subquestion_1_answer = _safe_str(subquestion_1_answer)
    subquestion_2_text = _safe_str(subquestion_2_text)
    subquestion_2_answer = _safe_str(subquestion_2_answer)
    question = _safe_str(question)
    final_answer = _safe_str(final_answer)
    rationale_text = _safe_str(rationale_text)

    labelled_images = [
        ("question_image_1", image1),
        ("question_image_2", image2),
        ("question_image_3", image3),
        ("question_image_4", image4),
        ("rationale_image_1", rationale_image1),
        ("rationale_image_2", rationale_image2),
    ]

    subquestions = [
        {"text": subquestion_1_text, "answer": subquestion_1_answer},
        {"text": subquestion_2_text, "answer": subquestion_2_answer},
    ]
    custom_id = "request______1"

    request_id = str(uuid.uuid4())  # unique ID, also the folder name
    request_folder = os.path.join(DATA_FOLDER, request_id)
    urls_json_path = os.path.join(request_folder, "request_urls.json")
    base64_json_path = os.path.join(request_folder, "request_base64.json")

    with scheduler.lock:
        os.makedirs(DATA_FOLDER, exist_ok=True)
        os.makedirs(request_folder)

        saved_images = _save_images(labelled_images, request_folder)
        url_entries, base64_entries = _build_image_entries(saved_images)

        # A) URLs JSON
        item_urls = {
            "custom_id": custom_id,
            # Metadata at top level
            "name": name,
            "email_address": email_address,
            "institution": institution,
            "openreview_profile": openreview_profile,
            "question_categories": question_categories,
            "question": {
                "messages": [
                    {"role": "system", "content": system_message},
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "label": "question", "value": question}
                        ]
                        + _entries_with_label(url_entries, "question_image"),
                    },
                ],
            },
            "subquestions": subquestions,
            "answer": {
                "final_answer": final_answer,
                "rationale_text": rationale_text,
                "rationale_images": _entries_with_label(
                    url_entries, "rationale_image"
                ),
            },
        }

        # B) Base64 JSON
        item_base64 = {
            "custom_id": custom_id,
            # Metadata at top level
            "name": name,
            "email_address": email_address,
            "institution": institution,
            "openreview_profile": openreview_profile,
            # Question-related fields at top level
            "question_categories": question_categories,
            "subquestions": subquestions,
            "final_answer": final_answer,
            "rationale_text": rationale_text,
            "body": {
                "model": "MODEL_NAME",
                "messages": [
                    {"role": "system", "content": system_message},
                    {
                        "role": "user",
                        "content": [
                            {"type": "field", "label": "question", "value": question}
                        ]
                        + _entries_with_label(base64_entries, "question_image"),
                    },
                    {
                        "role": "assistant",
                        "content": [
                            {"type": "text", "text": rationale_text},
                            {"type": "text", "text": final_answer},
                            *_entries_with_label(base64_entries, "rationale_image"),
                        ],
                    },
                ],
            },
        }

        # Each file holds a single JSON document followed by a newline.
        with open(urls_json_path, "w", encoding="utf-8") as f:
            f.write(json.dumps(item_urls, ensure_ascii=False) + "\n")
        with open(base64_json_path, "w", encoding="utf-8") as f:
            f.write(json.dumps(item_base64, ensure_ascii=False) + "\n")

    return urls_json_path, base64_json_path


# Build the Gradio app
with gr.Blocks() as demo:
    gr.Markdown("# Dataset Builder")

    with gr.Accordion("Instructions", open=True):
        gr.HTML(INSTRUCTIONS_HTML)

    gr.Markdown("## Author Information")
    with gr.Row():
        name_input = gr.Textbox(label="Name", lines=1)
        email_address_input = gr.Textbox(label="Email Address", lines=1)
        institution_input = gr.Textbox(
            label="Institution or 'Independent'",
            lines=1,
            placeholder="e.g. MIT, Google, Independent, etc.",
        )
        openreview_profile_input = gr.Textbox(
            label="OpenReview Profile Name",
            lines=1,
            placeholder="Your OpenReview username or profile name",
        )

    gr.Markdown("## Question Information")

    # Question Images - Individual Tabs
    with gr.Tabs():
        with gr.Tab("Image 1"):
            image1 = gr.Image(label="Question Image 1", type="filepath")
        with gr.Tab("Image 2 (Optional)"):
            image2 = gr.Image(label="Question Image 2", type="filepath")
        with gr.Tab("Image 3 (Optional)"):
            image3 = gr.Image(label="Question Image 3", type="filepath")
        with gr.Tab("Image 4 (Optional)"):
            image4 = gr.Image(label="Question Image 4", type="filepath")

    question_input = gr.Textbox(
        label="Question", lines=15, placeholder="Type your question here..."
    )

    question_categories_input = gr.Textbox(
        label="Question Categories",
        lines=1,
        placeholder="Comma-separated tags, e.g. math, geometry",
    )

    # Answer Section
    gr.Markdown("## Answer ")

    final_answer_input = gr.Textbox(
        label="Final Answer",
        lines=1,
        placeholder="Enter the short/concise final answer...",
    )

    rationale_text_input = gr.Textbox(
        label="Rationale Text",
        lines=5,
        placeholder="Enter the reasoning or explanation for the answer...",
    )

    # Rationale Images - Individual Tabs
    with gr.Tabs():
        with gr.Tab("Rationale 1 (Optional)"):
            rationale_image1 = gr.Image(label="Rationale Image 1", type="filepath")
        with gr.Tab("Rationale 2 (Optional)"):
            rationale_image2 = gr.Image(label="Rationale Image 2", type="filepath")

    # Subquestions Section
    gr.Markdown("## Subquestions")
    with gr.Row():
        subquestion_1_text_input = gr.Textbox(
            label="Subquestion 1 Text", lines=2, placeholder="First sub-question..."
        )
        subquestion_1_answer_input = gr.Textbox(
            label="Subquestion 1 Answer",
            lines=2,
            placeholder="Answer to sub-question 1...",
        )

    with gr.Row():
        subquestion_2_text_input = gr.Textbox(
            label="Subquestion 2 Text", lines=2, placeholder="Second sub-question..."
        )
        subquestion_2_answer_input = gr.Textbox(
            label="Subquestion 2 Answer",
            lines=2,
            placeholder="Answer to sub-question 2...",
        )

    system_message_input = gr.Textbox(
        label="System Message",
        value="You are a helpful assistant",
        lines=2,
        placeholder="Enter the system message that defines the AI assistant's role and behavior...",
    )

    with gr.Row():
        submit_button = gr.Button("Submit")
        clear_button = gr.Button("Clear Form")

    with gr.Row():
        output_file_urls = gr.File(
            label="Download URLs JSON", interactive=False, visible=False
        )
        output_file_base64 = gr.File(
            label="Download Base64 JSON", interactive=False, visible=False
        )

    # Every form component, in the positional order expected by
    # generate_json_files / validate_and_generate. The clear handler resets
    # the same components in the same order.
    form_components = [
        system_message_input,
        name_input,
        email_address_input,
        institution_input,
        openreview_profile_input,
        question_categories_input,
        subquestion_1_text_input,
        subquestion_1_answer_input,
        subquestion_2_text_input,
        subquestion_2_answer_input,
        question_input,
        final_answer_input,
        rationale_text_input,
        image1,
        image2,
        image3,
        image4,
        rationale_image1,
        rationale_image2,
    ]

    # On Submit, we call generate_json_files with all relevant fields
    def validate_and_generate(
        sys_msg,
        nm,
        em,
        inst,
        orp,
        qcats,
        sq1t,
        sq1a,
        sq2t,
        sq2a,
        q,
        fa,
        rt,
        i1,
        i2,
        i3,
        i4,
        ri1,
        ri2,
    ):
        """Validate the required fields, then store the submission.

        Returns an update for the submit button: it stays enabled when
        validation fails and is disabled after a successful submission.
        """
        required_text_fields = [
            ("Name", nm),
            ("Email Address", em),
            ("Institution", inst),
            ("Question", q),
            ("Final Answer", fa),
        ]
        # A text field counts as missing when it is empty or whitespace-only.
        missing_fields = [
            label
            for label, value in required_text_fields
            if not value or not value.strip()
        ]
        if not i1:
            missing_fields.append("First Question Image")

        if missing_fields:
            warning_msg = f"Required fields missing: {', '.join(missing_fields)} ⛔️"
            # Show the warning and leave the submit button usable; the form
            # components are not outputs of this handler.
            gr.Warning(warning_msg, duration=5)
            return gr.Button(interactive=True)

        # Only after successful validation, generate the files.
        generate_json_files(
            sys_msg,
            nm,
            em,
            inst,
            orp,
            qcats,
            sq1t,
            sq1a,
            sq2t,
            sq2a,
            q,
            fa,
            rt,
            i1,
            i2,
            i3,
            i4,
            ri1,
            ri2,
        )

        gr.Info(
            "Dataset item created successfully! 🎉, Clear the form to submit a new one"
        )

        return gr.update(interactive=False)

    submit_button.click(
        fn=validate_and_generate,
        inputs=form_components,
        outputs=[submit_button],
    )

    # Clear button functionality
    def clear_form_fields(sys_msg, name, email, inst, openreview, *args):
        """Reset the form while preserving the author's personal information.

        Returns one value per output component of the clear handler.
        """
        preserved = [
            "You are a helpful assistant",  # Reset system message to default
            name,  # Preserve name
            email,  # Preserve email
            inst,  # Preserve institution
            openreview,  # Preserve OpenReview profile
        ]
        # 8 text fields (categories, 2x subquestion text/answer, question,
        # final answer, rationale) + 4 question images + 2 rationale images
        # + 2 output files.
        cleared = [None] * 16
        return [
            *preserved,
            *cleared,
            gr.update(interactive=True),  # Re-enable submit button
        ]

    clear_button.click(
        fn=clear_form_fields,
        inputs=[
            system_message_input,
            name_input,
            email_address_input,
            institution_input,
            openreview_profile_input,
        ],
        outputs=[
            *form_components,
            output_file_urls,
            output_file_base64,
            submit_button,
        ],
    )

demo.launch()