Spaces:
Runtime error
Runtime error
import base64
import concurrent
import concurrent.futures
import os
from io import BytesIO

import dashscope
from dashscope import MultiModalConversation
from PIL import Image

from MobileAgent.api import inference_chat
from MobileAgent.chat import init_action_chat, init_reflect_chat, init_memory_chat, add_response, add_response_two_image
from MobileAgent.prompt_no_input import get_action_prompt, get_reflect_prompt, get_memory_prompt, get_process_prompt
# Endpoint and bearer token for the chat-completion API used by inference_chat.
# Both are read from environment variables once, at import time.
# NOTE(review): os.environ.get returns None when a variable is unset — confirm
# the deployment always provides 'url' and 'token'.
API_url = os.environ.get('url')
token = os.environ.get('token')
def base64_to_pil(base64_string):
    """Decode a base64-encoded image (optionally a data URI) into a PIL Image.

    Accepts either a raw base64 payload or a full ``data:image/...;base64,...``
    URI; in the latter case only the part after the last comma is decoded.
    """
    if base64_string.startswith('data:image'):
        # Drop the "data:image/...;base64," prefix, keeping the payload only.
        _, _, base64_string = base64_string.rpartition(',')
    raw_bytes = base64.b64decode(base64_string)
    return Image.open(BytesIO(raw_bytes))
def process_image(image, query):
    """Ask the qwen-vl-plus multimodal model to describe a local image file.

    Args:
        image: Path to an image file on local disk.
        query: Text prompt sent alongside the image.

    Returns:
        The model's text answer, or the fallback string "This is an icon."
        when the response does not have the expected structure.
    """
    # API key is (re)read per call; cheap, and tolerates env changes at runtime.
    dashscope.api_key = os.environ.get('qwen')
    messages = [{
        'role': 'user',
        'content': [
            {'image': "file://" + image},
            {'text': query},
        ],
    }]
    response = MultiModalConversation.call(model="qwen-vl-plus", messages=messages)
    try:
        # Successful responses carry the text at output.choices[0].message.content[0].text.
        return response['output']['choices'][0]['message']['content'][0]["text"]
    except (KeyError, IndexError, TypeError):
        # Fix: was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit. Catch only the lookup errors a malformed/error response raises.
        return "This is an icon."
# Ensure working directories exist. makedirs(..., exist_ok=True) replaces the
# original exists()+mkdir() pair, which was racy (TOCTOU) and would crash if
# another process created the directory between the check and the mkdir.
os.makedirs("screenshot", exist_ok=True)
os.makedirs("temp", exist_ok=True)
def _handle_caption(json_data):
    # Caption every submitted icon image in parallel via process_image.
    query = json_data["query"]
    local_images = []
    for image in json_data["images"]:
        pil_image = base64_to_pil(image["image_file"])
        image_path = "temp/" + image["image_name"]
        pil_image.save(image_path, "PNG")
        local_images.append(image_path)
    icon_map = {}
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_image, img, query): i
                   for i, img in enumerate(local_images)}
        for future in concurrent.futures.as_completed(futures):
            # Keys are 1-based to match the caller's icon numbering.
            icon_map[futures[future] + 1] = future.result()
    return {"icon_map": icon_map}


def _handle_planning(json_data):
    # Build the progress/planning prompt and query the text model.
    prompt_planning = get_process_prompt(
        json_data["instruction"], json_data["thought_history"],
        json_data["summary_history"], json_data["action_history"],
        json_data["completed_requirements"], json_data["add_info"])
    chat_planning = init_memory_chat()
    chat_planning = add_response("user", prompt_planning, chat_planning)
    output_planning = inference_chat(chat_planning, 'gpt-4-turbo', API_url, token)
    return {"planning": output_planning}


def _handle_decision(json_data):
    # Decide the next action from the current screenshot, then (optionally)
    # extract memory from the same chat.
    image_path = "screenshot/screenshot_local.png"
    base64_to_pil(json_data["screenshot_file"]).save(image_path, "PNG")
    prompt_action = get_action_prompt(
        json_data["instruction"], json_data["perception_infos"],
        json_data["width"], json_data["height"],
        json_data["summary_history"], json_data["action_history"],
        json_data["summary"], json_data["action"], json_data["add_info"],
        json_data["error_flag"], json_data["completed_requirements"],
        json_data["memory"])
    chat_action = init_action_chat()
    chat_action = add_response("user", prompt_action, chat_action, image_path)
    output_action = inference_chat(chat_action, 'gpt-4o', API_url, token)
    if output_action == "No token":
        # Upstream signals auth/quota failure with this sentinel; pass it through.
        return {"decision": "No token", "memory": None}
    chat_action = add_response("assistant", output_action, chat_action)
    output_memory = None
    if json_data["memory_switch"]:
        prompt_memory = get_memory_prompt(json_data["insight"])
        chat_action = add_response("user", prompt_memory, chat_action)
        output_memory = inference_chat(chat_action, 'gpt-4o', API_url, token)
    return {"decision": output_action, "memory": output_memory}


def _handle_reflection(json_data):
    # Compare the previous and current screenshots to judge the last action.
    image_path = "screenshot/screenshot_local.png"
    base64_to_pil(json_data["screenshot_file"]).save(image_path, "PNG")
    last_image_path = "screenshot/last_screenshot_local.png"
    base64_to_pil(json_data["last_screenshot_file"]).save(last_image_path, "PNG")
    prompt_reflect = get_reflect_prompt(
        json_data["instruction"], json_data["last_perception_infos"],
        json_data["perception_infos"], json_data["width"], json_data["height"],
        json_data["summary"], json_data["action"], json_data["add_info"])
    chat_reflect = init_reflect_chat()
    chat_reflect = add_response_two_image(
        "user", prompt_reflect, chat_reflect, [last_image_path, image_path])
    output_reflect = inference_chat(chat_reflect, 'gpt-4o', API_url, token)
    return {"reflection": output_reflect}


def mobile_agent_infer(json_data):
    """Dispatch one inference request by its "task" field.

    Supported tasks: "caption" (parallel icon captioning), "planning"
    (progress summarization), "decision" (next-action selection, optionally
    followed by memory extraction), and "reflection" (before/after screenshot
    comparison).

    Args:
        json_data: Request dict; must contain "task" plus the task-specific
            keys consumed by the matching handler.

    Returns:
        A dict keyed by the task name (e.g. {"planning": ...}), or
        {"error": ...} for an unknown task.
    """
    handlers = {
        "caption": _handle_caption,
        "planning": _handle_planning,
        "decision": _handle_decision,
        "reflection": _handle_reflection,
    }
    handler = handlers.get(json_data["task"])
    if handler is None:
        return {"error": "The task must be in \"caption\", \"planning\", \"decision\" and \"reflection\"."}
    return handler(json_data)