import ast
import re
import io
import os
import json
import copy
import shutil
import base64
import random
import requests
import gradio as gr
from datetime import datetime
from modelscope.pipelines import pipeline
from modelscope import snapshot_download
from modelscope.utils.constant import Tasks
from PIL import Image, ImageDraw, ImageFont
from PCAgent.api import inference_chat
from PCAgent.icon_localization import det
from PCAgent.text_localization_old import ocr
from PCAgent.chat import init_action_chat, init_memory_chat, add_response
from PCAgent.prompt_qwen import get_subtask_prompt, get_action_prompt, get_process_prompt, get_memory_prompt
from PCAgent.merge_strategy import merge_boxes_and_texts, merge_all_icon_boxes, merge_boxes_and_texts_new

vl_model_version = os.getenv('vl_model_version')
llm_model_version = os.getenv('llm_model_version')
API_url = os.getenv('API_url')
token = os.getenv('token')

# os.environ["OCR_ACCESS_KEY_ID"] = os.getenv('OCR_ACCESS_KEY_ID')
# os.environ["OCR_ACCESS_KEY_SECRET"] = os.getenv('OCR_ACCESS_KEY_SECRET')

ocr_detection = pipeline(Tasks.ocr_detection, model='damo/cv_resnet18_ocr-detection-line-level_damo')
ocr_recognition = pipeline(Tasks.ocr_recognition, model='damo/cv_convnextTiny_ocr-recognition-document_damo')
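# The detection pipeline locates text-line boxes and the recognition pipeline
# reads the text inside each box; both are handed to ocr() in
# get_perception_infos below.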
tff_file = os.environ.get('tff_file')
radius = 100

def download_file(url, save_path):
    response = requests.get(url, stream=True)  # stream the download
    response.raise_for_status()  # fail fast if the request did not succeed
    with open(save_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):  # write in chunks to avoid holding the whole file in memory
            file.write(chunk)

download_file(tff_file, "arial.ttf")
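# "arial.ttf" is the marker font later passed to draw_coordinates_boxes_on_image;
# its download URL is supplied via the tff_file environment variable.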
chatbot_css = """
<style>
.chat-container {
    display: flex;
    flex-direction: column;
    overflow-y: auto;
    max-height: 800px;
    margin: 10px;
}
.user-message, .bot-message {
    margin: 5px;
    padding: 10px;
    border-radius: 10px;
}
.user-message {
    text-align: right;
    background-color: #7B68EE;
    color: white;
    align-self: flex-end;
}
.bot-message {
    text-align: left;
    background-color: #ADD8E6;
    color: black;
    align-self: flex-start;
}
.user-image {
    text-align: right;
    align-self: flex-end;
    max-width: 150px;
    max-height: 300px;
}
.bot-image {
    text-align: left;
    align-self: flex-start;
    max-width: 200px;
    max-height: 400px;
}
</style>
"""

def cmyk_to_rgb(c, m, y, k):
    r = 255 * (1.0 - c / 255) * (1.0 - k / 255)
    g = 255 * (1.0 - m / 255) * (1.0 - k / 255)
    b = 255 * (1.0 - y / 255) * (1.0 - k / 255)
    return int(r), int(g), int(b)
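# Worked example: cmyk_to_rgb(0, 255, 255, 0) == (255, 0, 0), i.e. pure red.
# Note the 0-255 component range, matching the random 4-tuples generated in
# draw_coordinates_boxes_on_image rather than the usual 0-1 CMYK convention.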
def draw_coordinates_boxes_on_image(image_path, coordinates, output_image_path, font_path, no_text=0):
    image = Image.open(image_path)
    width, height = image.size
    draw = ImageDraw.Draw(image)
    total_boxes = len(coordinates)
    # One random CMYK-like 4-tuple per box, converted to RGB before drawing.
    colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
              for _ in range(total_boxes)]
    for i, coord in enumerate(coordinates):
        c, m, y, k = colors[i]
        color = cmyk_to_rgb(c, m, y, k)
        draw.rectangle(coord, outline=color, width=int(height * 0.0025))
        if no_text != 1:
            font = ImageFont.truetype(font_path, int(height * 0.012))
            text_x = coord[0] + int(height * 0.0025)
            text_y = max(0, coord[1] - int(height * 0.013))
            draw.text((text_x, text_y), str(i + 1), fill=color, font=font)
    image = image.convert('RGB')
    if os.path.exists(output_image_path):
        os.remove(output_image_path)
    image.save(output_image_path)
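# Illustrative call with hypothetical paths: number two boxes on a screenshot.
# draw_coordinates_boxes_on_image("shot.png", [[10, 10, 120, 40], [30, 60, 200, 90]],
#                                 "shot_som.png", "arial.ttf")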
def get_perception_infos(screenshot_file, screenshot_som_file, font_path):
    total_width, total_height = Image.open(screenshot_file).size

    # No partition: process the whole screenshot as a single sub-image.
    img_list = [screenshot_file]
    img_x_list = [0]
    img_y_list = [0]

    coordinates = []
    texts = []
    padding = total_height * 0.0025  # roughly 10 px at 4K height

    for i, img in enumerate(img_list):
        width, height = Image.open(img).size
        sub_text, sub_coordinates = ocr(img, ocr_detection, ocr_recognition)  # for api
        for coordinate in sub_coordinates:
            coordinate[0] = int(max(0, img_x_list[i] + coordinate[0] - padding))
            coordinate[2] = int(min(total_width, img_x_list[i] + coordinate[2] + padding))
            coordinate[1] = int(max(0, img_y_list[i] + coordinate[1] - padding))
            coordinate[3] = int(min(total_height, img_y_list[i] + coordinate[3] + padding))
        sub_text_merge, sub_coordinates_merge = merge_boxes_and_texts_new(sub_text, sub_coordinates)
        coordinates.extend(sub_coordinates_merge)
        texts.extend(sub_text_merge)
    merged_text, merged_text_coordinates = merge_boxes_and_texts(texts, coordinates)

    # Placeholder filter: currently keeps every merged text box unchanged.
    filtered_merged_text = []
    filtered_merged_text_coordinates = []
    for i in range(len(merged_text)):
        filtered_merged_text.append(merged_text[i])
        filtered_merged_text_coordinates.append(merged_text_coordinates[i])
    merged_text, merged_text_coordinates = filtered_merged_text, filtered_merged_text_coordinates
    coordinates = []
    for i, img in enumerate(img_list):
        width, height = Image.open(img).size
        sub_coordinates = det(img, "icon", groundingdino_model)
        for coordinate in sub_coordinates:
            coordinate[0] = int(max(0, img_x_list[i] + coordinate[0] - padding))
            coordinate[2] = int(min(total_width, img_x_list[i] + coordinate[2] + padding))
            coordinate[1] = int(max(0, img_y_list[i] + coordinate[1] - padding))
            coordinate[3] = int(min(total_height, img_y_list[i] + coordinate[3] + padding))
        sub_coordinates = merge_all_icon_boxes(sub_coordinates)
        coordinates.extend(sub_coordinates)
    merged_icon_coordinates = merge_all_icon_boxes(coordinates)

    rec_list = merged_text_coordinates + merged_icon_coordinates
    draw_coordinates_boxes_on_image(screenshot_file, copy.deepcopy(rec_list), screenshot_som_file, font_path)

    mark_number = 0
    perception_infos = []
    for i in range(len(merged_text_coordinates)):
        mark_number += 1
        perception_info = {"text": "mark number: " + str(mark_number) + " text: " + merged_text[i], "coordinates": merged_text_coordinates[i]}
        perception_infos.append(perception_info)
    for i in range(len(merged_icon_coordinates)):
        mark_number += 1
        perception_info = {"text": "mark number: " + str(mark_number) + " icon", "coordinates": merged_icon_coordinates[i]}
        perception_infos.append(perception_info)

    # Collapse each bounding box to its center point.
    for i in range(len(perception_infos)):
        perception_infos[i]['coordinates'] = [
            int((perception_infos[i]['coordinates'][0] + perception_infos[i]['coordinates'][2]) / 2),
            int((perception_infos[i]['coordinates'][1] + perception_infos[i]['coordinates'][3]) / 2)
        ]
    return perception_infos, total_width, total_height
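# Each returned perception entry pairs a human-readable label with a box-center
# point, e.g. {"text": "mark number: 3 text: File", "coordinates": [412, 88]};
# the action prompt consumes this list alongside the annotated (SOM) screenshot.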
# GroundingDINO supplies the icon detector used by det() in get_perception_infos.
groundingdino_dir = snapshot_download('AI-ModelScope/GroundingDINO', revision='v1.0.0')
groundingdino_model = pipeline('grounding-dino-task', model=groundingdino_dir)

def analyze_string(s):
    result = {
        'type': None,
        'format_keys': [],
        'dict_content': None
    }
    format_pattern = re.compile(r'\{(\w+)\}')
    # Matches dictionary-like literals such as {'key': 'value'}.
    dict_pattern = re.compile(
        r'\{(?:\s*[\'\"]\w+[\'\"]\s*:\s*[\'\"][^{}\'\"]+[\'\"]\s*,?)*\}'
    )
    dict_matches = dict_pattern.findall(s)
    dicts = []
    for match in dict_matches:
        try:
            parsed_dict = ast.literal_eval(match)
            if isinstance(parsed_dict, dict):
                dicts.append(parsed_dict)
        except (ValueError, SyntaxError):
            continue
    has_dict = len(dicts) > 0
    s_without_dicts = dict_pattern.sub('', s)
    format_keys = format_pattern.findall(s_without_dicts)
    has_format = len(format_keys) > 0
    has_format_and_dict = has_format and has_dict
    if has_format_and_dict:
        result['type'] = 4
    elif has_format:
        result['type'] = 2
    elif has_dict:
        result['type'] = 3
    else:
        result['type'] = 1
    if has_format:
        result['format_keys'] = format_keys
    if has_dict:
        result['dict_content'] = dicts[0]
    return result
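# Illustrative classification: analyze_string("Open {app} with {'mode': 'dark'}")
# returns {'type': 4, 'format_keys': ['app'], 'dict_content': {'mode': 'dark'}}.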
def is_good_string(s):
    # Regex to match the dictionary-like part {'key1': 'value1', ...}
    dict_pattern = r"\{('[^']+' *: *'[^']+' *(, *'[^']+' *: *'[^']+')*)?\}"
    # Regex to match the item-list part {item1, item2, ...} with no single quotes in items
    item_pattern = r"\{([a-zA-Z0-9_]+( *, *[a-zA-Z0-9_]+)*)?\}"
    # Find all parts of the string contained within braces
    parts = re.findall(r'\{.*?\}', s)
    for part in parts:
        # Check that the part matches either the dictionary pattern or the item pattern
        if not re.fullmatch(dict_pattern, part) and not re.fullmatch(item_pattern, part):
            return False
    return True
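# Examples: is_good_string("Fill {'name': 'Bob'} into {field_1}") is True, while
# is_good_string("Bad {mix 'of' styles}") is False, since that braced part
# matches neither pattern.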
screenshot_root = "screenshot"
if os.path.exists(screenshot_root):
    shutil.rmtree(screenshot_root)
os.mkdir(screenshot_root)

def image_to_base64(image):
    buffered = io.BytesIO()
    image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    img_html = f'<img src="data:image/png;base64,{img_str}" />'
    return img_html
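# Usage sketch: image_to_base64(Image.new("RGB", (4, 4))) returns an <img> tag
# whose src inlines the PNG bytes, letting screenshots be embedded in the HTML
# chat log without writing extra files.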
def chatbot(image, instruction, add_info, history, chat_log):
    if history == {}:
        output_for_save = []
        thought_history = []
        summary_history = []
        action_history = []
        summary = ""
        action = ""
        completed_requirements = ""
        memory = ""
        insight = ""
        error_flag = False
        user_msg = "<div class='user-message'>{}</div>".format(instruction)
        step_idx = 0
    else:
        output_for_save = history["output_for_save"]
        thought_history = history["thought_history"]
        summary_history = history["summary_history"]
        action_history = history["action_history"]
        summary = history["summary"]
        action = history["action"]
        completed_requirements = history["completed_requirements"]
        memory = history["memory"]
        insight = history["insight"]
        error_flag = history["error_flag"]
        user_msg = "<div class='user-message'>{}</div>".format("I have uploaded the screenshot. Please continue operating.")
        step_idx = history["history"]
    current_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    temp_file = f"temp_{current_time}"
    os.mkdir(temp_file)

    screenshot_file = os.path.join(screenshot_root, f"screenshot_{current_time}.png")
    image.save(screenshot_file, format="PNG")
    som_base, som_ext = os.path.splitext(screenshot_file)
    screenshot_som_file = som_base + "_som" + som_ext
    perception_infos, width, height = get_perception_infos(screenshot_file, screenshot_som_file, font_path="arial.ttf")
    shutil.rmtree(temp_file)
    os.mkdir(temp_file)

    output_for_save_this_step = {}
    prompt_action = get_action_prompt(instruction, perception_infos, width, height, thought_history, summary_history, action_history, [], summary, action, "", add_info, error_flag, completed_requirements, memory)
    chat_action = init_action_chat()
    chat_action = add_response("user", prompt_action, chat_action, [screenshot_som_file])
    output_action = inference_chat(chat_action, vl_model_version, API_url, token)
    output_for_save_this_step['action'] = output_action
    action_json = json.loads(output_action.split("```json")[-1].split("```")[0])
    thought = action_json['Thought']
    summary = action_json['Summary']
    action = action_json['Action']
    chat_action = add_response("assistant", output_action, chat_action)
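    # Illustrative shape of the parsed model output (values are examples, not a
    # fixed vocabulary): {"Thought": "...", "Summary": "...", "Action": "Tap (960, 540)"}.
    # The branches below dispatch on the Action text.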
| if "Double TapIdx" in action: | |
| bot_response = "Please double click (click x 2) the red circle and upload the current screenshot again." | |
| idx = action.split("(")[-1].split(")")[0] | |
| coordinate = perception_infos[idx]['coordinates'] | |
| x, y = int(coordinate[0]), int(coordinate[1]) | |
| draw = ImageDraw.Draw(image) | |
| draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20) | |
| elif "Double Tap" in action: | |
| bot_response = "Please double click (click x 2) the red circle and upload the current screenshot again." | |
| coordinate = action.split("(")[-1].split(")")[0].split(", ") | |
| x, y = int(coordinate[0]), int(coordinate[1]) | |
| draw = ImageDraw.Draw(image) | |
| draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20) | |
| elif "Triple TapIdx" in action: | |
| bot_response = "Please triple click (click x 3) the red circle and upload the current screenshot again." | |
| coordinate = action.split("(")[-1].split(")")[0].split(", ") | |
| x, y = int(coordinate[0]), int(coordinate[1]) | |
| draw = ImageDraw.Draw(image) | |
| draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20) | |
| elif "Triple Tap" in action: | |
| bot_response = "Please triple click (click x 3) the red circle and upload the current screenshot again." | |
| idx = action.split("(")[-1].split(")")[0] | |
| coordinate = perception_infos[idx]['coordinates'] | |
| x, y = int(coordinate[0]), int(coordinate[1]) | |
| draw = ImageDraw.Draw(image) | |
| draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20) | |
| elif "TapIdx" in action: | |
| bot_response = "Please click (click x 1) the red circle and upload the current screenshot again." | |
| idx = action.split("(")[-1].split(")")[0] | |
| coordinate = perception_infos[idx]['coordinates'] | |
| x, y = int(coordinate[0]), int(coordinate[1]) | |
| draw = ImageDraw.Draw(image) | |
| draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20) | |
| elif "Tap" in action: | |
| bot_response = "Please click (click x 1) the red circle and upload the current screenshot again." | |
| coordinate = action.split("(")[-1].split(")")[0].split(", ") | |
| x, y = int(coordinate[0]), int(coordinate[1]) | |
| draw = ImageDraw.Draw(image) | |
| draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20) | |
| elif "Shortcut" in action: | |
| keys = action.split("(")[-1].split(")")[0].split(", ") | |
| key1, key2 = keys[0].lower(), keys[1].lower() | |
| bot_response = f"Please press {key1}+{key2} and upload the current screenshot again." | |
| elif "Press" in action: | |
| key = action.split("(")[-1].split(")")[0] | |
| bot_response = f"Please press {key} and upload the current screenshot again." | |
| elif "Open App" in action: | |
| app = action.split("(")[-1].split(")")[0] | |
| bot_response = f"Please open {app} app and upload the current screenshot again." | |
| elif "Type" in action: | |
| coordinate = action.split("(")[1].split(")")[0].split(", ") | |
| x, y = int(coordinate[0]), int(coordinate[1]) | |
| if "[text]" not in action: | |
| # for claude | |
| if '[' not in action or ']' not in action: | |
| # text = action.split('),')[-1].strip() | |
| text = action.split('),')[-1].strip().split("(")[1].split(")")[0].replace("text: ", '').replace("'", "") | |
| else: | |
| text = action.split("[")[-1].split("]")[0] | |
| else: | |
| text = action.split(" \"")[-1].split("\"")[0] | |
| draw = ImageDraw.Draw(image) | |
| draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20) | |
| bot_response = f"Please type \"{text}\" in the red circle and upload the current screenshot again." | |
| elif "Select (" in action: | |
| content = action.split("(")[1].split(")")[0] | |
| bot_response = f"Please select the text content \"{content}\" and upload the current screenshot again." | |
| elif "Replace (" in action: | |
| coordinate = action.split("(")[1].split(")")[0].split(", ") | |
| x, y = int(coordinate[0]), int(coordinate[1]) | |
| if "[text]" not in action: | |
| # for claude | |
| if '[' not in action or ']' not in action: | |
| # text = action.split('),')[-1].strip() | |
| text = action.split('),')[-1].strip().split("(")[1].split(")")[0].replace("text: ", '') | |
| else: | |
| if "] with " in action: | |
| text = action.split("] with ")[-1] | |
| text = text.replace("\"", '').replace("'", '').strip('.') | |
| else: | |
| text = action.split("[")[-1].split("]")[0] | |
| else: | |
| text = action.split(" \"")[-1].split("\"")[0] | |
| draw = ImageDraw.Draw(image) | |
| draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20) | |
| bot_response = f"Please replace the text in the red circle by \"{text}\" and upload the current screenshot again." | |
| elif "Append (" in action: | |
| coordinate = action.split("(")[1].split(")")[0].split(", ") | |
| x, y = int(coordinate[0]), int(coordinate[1]) | |
| if "[text]" not in action: | |
| if '[' not in action or ']' not in action: | |
| text = action.split('),')[-1].strip() | |
| else: | |
| text = action.split("[")[-1].split("]")[0] | |
| else: | |
| text = action.split(" \"")[-1].split("\"")[0] | |
| draw = ImageDraw.Draw(image) | |
| draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20) | |
| bot_response = f"Please insert the text \"{text}\" in the red circle and upload the current screenshot again." | |
| elif "Stop" in action: | |
| output_for_save.append(output_for_save_this_step) | |
| bot_response = f"Answer: {output_for_save}, task completed" | |
| prompt_memory = get_memory_prompt(insight) | |
| chat_action = add_response("user", prompt_memory, chat_action) | |
| output_memory = inference_chat(chat_action, vl_model_version, API_url, token) | |
| chat_action = add_response("assistant", output_memory, chat_action) | |
| output_memory = output_memory.split("### Important content ###")[-1].split("\n\n")[0].strip() + "\n" | |
| if "None" not in output_memory and output_memory not in memory: | |
| memory += output_memory | |
    bot_text1 = "<div class='bot-message'>{}</div>".format("### Decision ###")
    bot_thought = "<div class='bot-message'>{}</div>".format("Thought: " + thought)
    bot_action = "<div class='bot-message'>{}</div>".format("Action: " + action)
    bot_operation = "<div class='bot-message'>{}</div>".format("Operation: " + summary)
    bot_text2 = "<div class='bot-message'>{}</div>".format("### Memory ###")
    if len(memory) > 0:
        bot_memory = "<div class='bot-message'>{}</div>".format(memory)
    else:
        bot_memory = "<div class='bot-message'>{}</div>".format("None")
    bot_response = "<div class='bot-message'>{}</div>".format(bot_response)
    if image is not None:
        bot_img_html = image_to_base64(image)
        bot_response = "<div class='bot-image'>{}</div>".format(bot_img_html) + bot_response
    chat_log.append(user_msg)
    shutil.rmtree(temp_file)
    # os.remove(screenshot_file)
    # os.remove(screenshot_som_file)
    thought_history.append(thought)
    summary_history.append(summary)
    action_history.append(action)

    prompt_planning = get_process_prompt(instruction, thought_history, summary_history, action_history, completed_requirements, add_info)
    chat_planning = init_memory_chat()
    chat_planning = add_response("user", prompt_planning, chat_planning)
    output_planning = inference_chat(chat_planning, llm_model_version, API_url, token)
    output_for_save_this_step['planning'] = output_planning
    chat_planning = add_response("assistant", output_planning, chat_planning)
    completed_requirements = output_planning.split("### Completed contents ###")[-1].replace("\n", " ").strip()
    bot_text3 = "<div class='bot-message'>{}</div>".format("### Planning ###")
    output_planning = "<div class='bot-message'>{}</div>".format(output_planning)
| history["thought_history"] = thought_history | |
| history["summary_history"] = summary_history | |
| history["action_history"] = action_history | |
| history["summary"] = summary | |
| history["action"] = action | |
| history["memory"] = memory, | |
| history["memory_switch"] = True, | |
| history["insight"] = insight | |
| history["error_flag"] = error_flag | |
| history["completed_requirements"] = completed_requirements | |
| history["output_for_save"] = output_for_save | |
| history["history"] = step_idx + 1 | |
    chat_log.append(bot_text3)
    chat_log.append(output_planning)
    chat_log.append(bot_text1)
    chat_log.append(bot_thought)
    chat_log.append(bot_action)
    chat_log.append(bot_operation)
    chat_log.append(bot_text2)
    chat_log.append(bot_memory)
    chat_log.append(bot_response)
    chat_html = "<div class='chat-container'>{}</div>".format("".join(chat_log))
    return chatbot_css + chat_html, history, chat_log
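# One chatbot() call covers a single step of the interaction loop: the user
# uploads a screenshot, the VL action agent proposes the next operation, the
# planning agent updates the completed contents, and the user is asked to
# perform the operation and upload a fresh screenshot.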
def lock_input(instruction):
    return gr.update(value=instruction, interactive=False), gr.update(value=None)


def reset_demo():
    return gr.update(value="", interactive=True), gr.update(value=None, interactive=True), "<div class='chat-container'></div>", {}, []

tos_markdown = ("""<div style="display:flex; gap: 0.25rem;" align="center">
<a href='https://github.com/X-PLUG/MobileAgent'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
<a href="https://arxiv.org/abs/2502.14282"><img src="https://img.shields.io/badge/Arxiv-2502.14282-red"></a>
<a href='https://github.com/X-PLUG/MobileAgent/stargazers'><img src='https://img.shields.io/github/stars/X-PLUG/MobileAgent.svg?style=social'></a>
</div>
If you like our project, please give us a star ✨ on GitHub for the latest updates.

**Terms of use**
1. Input your instruction in "Instruction", for example "Turn on the dark mode".
2. You can input helpful operation knowledge in "Knowledge".
3. Click "Submit" to get the operation. Perform it on your PC, then upload a screenshot of the result.
4. We show two examples below, each with three screenshots. Click and submit them from top to bottom to try the demo.""")
title_markdown = ("""# PC-Agent: A Hierarchical Multi-Agent Collaboration Framework for Complex Task Automation on PC""")

instruction_input = gr.Textbox(label="Instruction", placeholder="Input your instruction")
knowledge_input = gr.Textbox(label="Knowledge", placeholder="Input your knowledge")
image_input = gr.Image(label="Screenshot", type="pil", height=350, width=700)

with gr.Blocks() as demo:
    history_state = gr.State(value={})
    history_output = gr.State(value=[])
    with gr.Row():
        gr.Markdown(title_markdown)
    with gr.Row():
        with gr.Column(scale=5):
            gr.Markdown(tos_markdown)
            image_input.render()
            gr.Examples(examples=[
                ["./example/1-1.jpg", "Search for Alibaba's stock price in Chrome", "The Chrome search bar is in the middle of the screen and has \"在Google 中搜索,或输入网址\" written on it."],
                ["./example/1-2.jpg", "Search for Alibaba's stock price in Chrome", "The Chrome search bar is in the middle of the screen and has \"在Google 中搜索,或输入网址\" written on it."],
                ["./example/1-3.jpg", "Search for Alibaba's stock price in Chrome", "The Chrome search bar is in the middle of the screen and has \"在Google 中搜索,或输入网址\" written on it."],
            ], inputs=[image_input, instruction_input, knowledge_input])
        with gr.Column(scale=6):
            instruction_input.render()
            knowledge_input.render()
            with gr.Row():
                start_button = gr.Button("Submit")
                clear_button = gr.Button("Clear")
            output_component = gr.HTML(label="Chat history", value="<div class='chat-container'></div>")
    start_button.click(
        fn=chatbot,
        inputs=[image_input, instruction_input, knowledge_input, history_state, history_output],
        outputs=[output_component, history_state, history_output]
    )
    clear_button.click(
        fn=reset_demo,
        inputs=[],
        outputs=[instruction_input, knowledge_input, output_component, history_state, history_output]
    )

demo.queue().launch(share=True)