Spaces:
Sleeping
Sleeping
| import os | |
| import sys | |
| sys.path.append(os.getcwd()) | |
| from cllm.agents.base import Action | |
# Query strings of the built-in requests. Each active constant is used
# verbatim as a key of BUILTIN_PLANS, so the text must not be altered.
BUILTIN_SEG_BY_POINTS = "Segment the given image based on the prompt points."
BUILTIN_SEG_BY_MASK = "Segment the given image based on the prompt mask."
# BUILTIN_REMOVE_BY_MASK = "Remove the object based on the given mask."
BUILTIN_IMAGE_TO_EDGE = "Generate the edge from the given image."
BUILTIN_GENERATE_SIMILAR_IMAGE = "Generate a new image similar to the input image"
# BUILTIN_GENERATE_SIMILAR_IMAGE2 = "Generate a similar image from the given image 2"
# BUILTIN_GENERATE_SIMILAR_IMAGE3 = "Image to image. 3"
BUILTIN_GENERATE_SIMILAR_IMAGE4 = "Generate a new image similar to image 4"
BUILTIN_GENERATE_IMAGE_HED = "Generate a new image based on HED result from input image"
BUILTIN_GENERATE_IMAGE_DEPTH = (
    "Generate a new image based on depth map from input image"
)
BUILTIN_GENERATE_IMAGE_OCR = "Please extract the text from the image"
BUILTIN_TEXT_EDGE_TO_IMAGE = "Generate an image based on the given edge map."
BUILTIN_GENERATE_IMAGE = "Generate a new image that shows a woman is skiing"
BUILTIN_IMAGE_TO_VIDEO = "Generate a video from the image"
BUILTIN_COUNT_OBJECTS = "Provide me with the count of bears in the input image"
BUILTIN_VIDEO_TO_WEBPAGE = "Generate a web page for input video"
BUILTIN_TEXT_TO_MUSIC = (
    "Please generate a piece of music based on given prompt. "
    "Here is the prompt: An 80s driving pop song with heavy drums "
    "and synth pads in the background"
)
BUILTIN_IMAGE_ERASING1 = "Erase the wine glass from the photo"
BUILTIN_IMAGE_ERASING2 = "Erase the cats in the photo"
BUILTIN_IMAGE_CROPPING = "Crop the cats from the photo"
BUILTIN_IMAGE_SEG = "give me the mask of elephant."
BUILTIN_IMAGE_HIGHLIGHT = "highlight the elephant."
BUILTIN_TEXT_SPEECH = "translate text into speech"
BUILTIN_DUBBING = "dub this video with the given audio"
BUILTIN_COUNT_OBJECTS2 = "Count the horse in the image."
BUILTIN_IMAGE_TO_VIDEO2 = (
    "Generate an image that shows a serene and beautiful landscape "
    "with a calm lake reflecting the blue sky and white clouds. "
    "Then generate a video to introduce this image."
)
BUILTIN_IMAGE_TO_VIDEO3 = (
    "Create a visual and auditory representation of a peaceful and scenic landscape. "
    "The image should depict a serene and beautiful landscape "
    "with a calm lake reflecting the blue sky. "
    "The music should match the image. "
    "Finally, combine the image and the music into a video "
    "that showcases the beauty of nature."
)
# Defined once; the original assigned this same value twice in a row.
BUILTIN_VIDEO_CLS = "Recognize the action in the video"
BUILTIN_AUDIO_CLS = "Recognize the event in this audio"
BUILTIN_IMAGE2MUSIC = "Generate a piece of music for this image"
BUILTIN_VIDEO2MUSIC = (
    "Generate a piece of music for this video and dub the video with generated music"
)
# Hand-written plans for the built-in queries.
#
# Each key is one of the BUILTIN_* query strings. Each value is a list of
# action groups, and each group is a list of Action objects given in order.
# Within a plan, a name listed in one action's ``outputs`` is reused as an
# ``inputs`` value by later actions, so producer and consumer names must
# match exactly.
# NOTE(review): the meaning of the "<TOOL-GENERATED>-..." versus
# "<GENERATED>-N" name prefixes is defined by whatever consumes these plans,
# which is not visible here - confirm before renaming any of them.
BUILTIN_PLANS = {
    # BUILTIN_REMOVE_BY_MASK: [
    #     [
    #         Action(
    #             tool_name="image_inpainting",
    #             inputs={"image": "image", "mask": "image.mask"},
    #             outputs=["<GENERATED>-0"],
    #         )
    #     ]
    # ],
    BUILTIN_IMAGE_TO_EDGE: [
        [
            Action(
                tool_name="image_to_edge",
                inputs={"image": "image"},
                outputs=["<GENERATED>-0"],
            )
        ]
    ],
    BUILTIN_TEXT_EDGE_TO_IMAGE: [
        [
            Action(
                tool_name="image_captioning",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-prompt"],
            ),
            Action(
                tool_name="edge_text_to_image",
                inputs={
                    "edge": "image.edge",
                    "text": "<TOOL-GENERATED>-prompt",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_GENERATE_SIMILAR_IMAGE: [
        [
            Action(
                tool_name="image_to_edge",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-edge"],
            ),
            Action(
                tool_name="image_captioning",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-prompt"],
            ),
            Action(
                tool_name="edge_text_to_image",
                inputs={
                    "edge": "<TOOL-GENERATED>-edge",
                    "text": "<TOOL-GENERATED>-prompt",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    # BUILTIN_GENERATE_SIMILAR_IMAGE2: [
    #     [
    #         Action(
    #             tool_name="image_captioning",
    #             inputs={"image": "image"},
    #             outputs=["<TOOL-GENERATED>-prompt"],
    #         ),
    #         Action(
    #             tool_name="text_to_image",
    #             inputs={"text": "<TOOL-GENERATED>-prompt"},
    #             outputs=["<GENERATED>-0"],
    #         ),
    #     ]
    # ],
    # BUILTIN_GENERATE_SIMILAR_IMAGE3: [
    #     [
    #         Action(
    #             tool_name="image_to_image",
    #             inputs={"image": "image"},
    #             outputs=["<GENERATED>-0"],
    #         ),
    #     ]
    # ],
    BUILTIN_GENERATE_IMAGE_HED: [
        [
            Action(
                tool_name="image_to_hed",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-image_to_hed-hed-0"],
            ),
            Action(
                tool_name="hed_text_to_image",
                inputs={
                    "text": "beautiful mountains and sunset",
                    "hed": "<TOOL-GENERATED>-image_to_hed-hed-0",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_GENERATE_IMAGE_DEPTH: [
        [
            Action(
                tool_name="image_captioning",
                inputs={
                    "image": "image",
                },
                outputs=["<TOOL-GENERATED>-image_captioning-text-0"],
            ),
            Action(
                tool_name="image_to_depth",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-image_to_depth-depth-0"],
            ),
            Action(
                tool_name="depth_text_to_image",
                inputs={
                    "text": "<TOOL-GENERATED>-image_captioning-text-0",
                    "depth": "<TOOL-GENERATED>-image_to_depth-depth-0",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_GENERATE_IMAGE_OCR: [
        [
            Action(
                tool_name="optical_character_recognition",
                inputs={"image": "image"},
                outputs=["<GENERATED>-0"],
            )
        ]
    ],
    BUILTIN_COUNT_OBJECTS: [
        [
            Action(
                tool_name="object_detection",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-object_detection-bbox-0"],
            ),
            Action(
                tool_name="select_bbox",
                inputs={
                    "bbox_list": "<TOOL-GENERATED>-object_detection-bbox-0",
                    "condition": "bear",
                },
                outputs=["<TOOL-GENERATED>-select_bbox-bbox-0"],
            ),
            Action(
                tool_name="count_objects",
                inputs={"bbox_list": "<TOOL-GENERATED>-select_bbox-bbox-0"},
                outputs=["<GENERATED>-0"],
            ),
        ],
        [
            Action(
                tool_name="image_question_answering",
                inputs={
                    "text": "Provide me with the count of bears in the input image",
                    "image": "image",
                },
                outputs=["<GENERATED>-1"],
            )
        ],
    ],
    BUILTIN_VIDEO_TO_WEBPAGE: [
        [
            Action(
                tool_name="video_captioning",
                inputs={"video": "video"},
                outputs=["<TOOL-GENERATED>-text-0"],
            ),
            Action(
                tool_name="text_to_music",
                inputs={"text": "<TOOL-GENERATED>-text-0"},
                outputs=["<TOOL-GENERATED>-text_to_music-audio-0"],
            ),
            Action(
                tool_name="dub_video",
                inputs={
                    "video": "video",
                    "audio": "<TOOL-GENERATED>-text_to_music-audio-0",
                },
                outputs=["<TOOL-GENERATED>-dub_video-video-0"],
            ),
            Action(
                tool_name="title_generation",
                inputs={"text": "<TOOL-GENERATED>-text-0"},
                outputs=["<TOOL-GENERATED>-text-1"],
            ),
            Action(
                tool_name="text_to_tags",
                inputs={"text": "<TOOL-GENERATED>-text-0"},
                outputs=["<TOOL-GENERATED>-tags-0"],
            ),
            Action(
                tool_name="video_to_webpage",
                inputs={
                    "video": "<TOOL-GENERATED>-dub_video-video-0",
                    "title": "<TOOL-GENERATED>-text-1",
                    "tags": "<TOOL-GENERATED>-tags-0",
                    "description": "<TOOL-GENERATED>-text-0",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_TEXT_TO_MUSIC: [
        [
            Action(
                tool_name="text_to_music",
                inputs={
                    "text": "An 80s driving pop song with heavy drums and synth pads in the background"
                },
                outputs=["<GENERATED>-audio-0"],
            )
        ]
    ],
    BUILTIN_IMAGE_ERASING1: [
        [
            Action(
                tool_name="image_instance_segmentation",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-image_instance_segmentation-mask-0"],
            ),
            Action(
                tool_name="select_mask",
                inputs={
                    "mask_list": "<TOOL-GENERATED>-image_instance_segmentation-mask-0",
                    "condition": "wine glass",
                },
                # Must be the exact name image_inpainting reads below; the
                # previous "...-mask-1" was never consumed by any step.
                outputs=["<TOOL-GENERATED>-select_mask-mask-0"],
            ),
            Action(
                tool_name="image_inpainting",
                inputs={
                    "image": "image",
                    "mask": "<TOOL-GENERATED>-select_mask-mask-0",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_IMAGE_ERASING2: [
        [
            Action(
                tool_name="image_instance_segmentation",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-image_instance_segmentation-mask-0"],
            ),
            Action(
                tool_name="select_mask",
                inputs={
                    "mask_list": "<TOOL-GENERATED>-image_instance_segmentation-mask-0",
                    "condition": "cat",
                },
                outputs=["<TOOL-GENERATED>-select_mask-mask-0"],
            ),
            Action(
                tool_name="image_inpainting",
                inputs={
                    "image": "image",
                    "mask": "<TOOL-GENERATED>-select_mask-mask-0",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_IMAGE_CROPPING: [
        [
            Action(
                tool_name="object_detection",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-object_detection-bbox-0"],
            ),
            Action(
                tool_name="select_bbox",
                inputs={
                    "bbox_list": "<TOOL-GENERATED>-object_detection-bbox-0",
                    "condition": "cat",
                },
                outputs=["<TOOL-GENERATED>-select_bbox-bbox-0"],
            ),
            Action(
                tool_name="image_cropping",
                inputs={
                    "image": "image",
                    "object": "<TOOL-GENERATED>-select_bbox-bbox-0",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_IMAGE_SEG: [
        [
            Action(
                tool_name="image_instance_segmentation",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-image_instance_segmentation-mask-0"],
            ),
            Action(
                tool_name="select_mask",
                inputs={
                    "mask_list": "<TOOL-GENERATED>-image_instance_segmentation-mask-0",
                    "condition": "elephant",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_IMAGE_HIGHLIGHT: [
        [
            Action(
                tool_name="object_detection",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-object_detection-bbox-0"],
            ),
            Action(
                tool_name="select_bbox",
                inputs={
                    "bbox_list": "<TOOL-GENERATED>-object_detection-bbox-0",
                    "condition": "elephant",
                },
                outputs=["<TOOL-GENERATED>-select_bbox-bbox-0"],
            ),
            Action(
                tool_name="highlight_object_on_image",
                inputs={
                    "image": "image",
                    "bbox": "<TOOL-GENERATED>-select_bbox-bbox-0",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_TEXT_SPEECH: [
        [
            Action(
                tool_name="text_to_speech",
                inputs={
                    "text": "Hope is the thing with feathers That perches in the soul, And sings the tune without the words, And never stops at all"
                },
                outputs=["<GENERATED>-0"],
            )
        ]
    ],
    BUILTIN_DUBBING: [
        [
            Action(
                tool_name="dub_video",
                inputs={"video": "video", "audio": "audio"},
                outputs=["<GENERATED>-0"],
            )
        ]
    ],
    BUILTIN_GENERATE_SIMILAR_IMAGE4: [
        [
            Action(
                tool_name="segment_anything",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-seg"],
            ),
            Action(
                tool_name="image_captioning",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-prompt"],
            ),
            Action(
                tool_name="segmentation_text_to_image",
                inputs={
                    "segmentation": "<TOOL-GENERATED>-seg",
                    "text": "<TOOL-GENERATED>-prompt",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_GENERATE_IMAGE: [
        [
            Action(
                tool_name="text_to_image",
                inputs={"text": "a woman is skiing"},
                outputs=["<GENERATED>-0"],
            )
        ]
    ],
    BUILTIN_IMAGE_TO_VIDEO: [
        [
            Action(
                tool_name="image_to_video",
                inputs={"image": "image"},
                outputs=["<GENERATED>-0"],
            )
        ]
    ],
    BUILTIN_COUNT_OBJECTS2: [
        [
            Action(
                tool_name="object_detection",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-object_detection-bbox-0"],
            ),
            Action(
                tool_name="select_bbox",
                inputs={
                    "bbox_list": "<TOOL-GENERATED>-object_detection-bbox-0",
                    "condition": "horse",
                },
                outputs=["<TOOL-GENERATED>-select_bbox-bbox-0"],
            ),
            Action(
                tool_name="count_objects",
                inputs={"bbox_list": "<TOOL-GENERATED>-select_bbox-bbox-0"},
                outputs=["<GENERATED>-0"],
            ),
        ],
        [
            Action(
                tool_name="image_question_answering",
                inputs={
                    "text": "Provide me with the count of horses in the input image",
                    "image": "image",
                },
                outputs=["<GENERATED>-1"],
            )
        ],
    ],
    BUILTIN_IMAGE_TO_VIDEO2: [
        [
            Action(
                tool_name="text_to_image",
                inputs={
                    "text": "A serene and beautiful landscape with a calm lake reflecting the blue sky and white clouds."
                },
                outputs=["<GENERATED>-0"],
            ),
        ],
        [
            Action(
                tool_name="image_captioning",
                inputs={"image": "<GENERATED>-0"},
                outputs=["<TOOL-GENERATED>-text-0"],
            ),
            Action(
                tool_name="text_to_speech",
                inputs={"text": "<TOOL-GENERATED>-text-0"},
                outputs=["<TOOL-GENERATED>-text_to_speech-audio-0"],
            ),
            Action(
                tool_name="image_audio_to_video",
                inputs={
                    "image": "<GENERATED>-0",
                    "audio": "<TOOL-GENERATED>-text_to_speech-audio-0",
                },
                outputs=["<GENERATED>-1"],
            ),
        ],
    ],
    BUILTIN_IMAGE_TO_VIDEO3: [
        [
            Action(
                tool_name="text_to_image",
                inputs={
                    "text": "A serene and beautiful landscape with a calm lake reflecting the blue sky."
                },
                outputs=["<GENERATED>-0"],
            ),
        ],
        [
            Action(
                tool_name="image_captioning",
                inputs={"image": "<GENERATED>-0"},
                outputs=["<TOOL-GENERATED>-text-0"],
            ),
            Action(
                tool_name="text_to_music",
                inputs={"text": "<TOOL-GENERATED>-text-0"},
                outputs=["<GENERATED>-1"],
            ),
        ],
        [
            Action(
                tool_name="image_to_video",
                inputs={
                    "image": "<GENERATED>-0",
                },
                outputs=["<TOOL-GENERATED>-image_to_video-video-0"],
            ),
            Action(
                tool_name="dub_video",
                inputs={
                    "video": "<TOOL-GENERATED>-image_to_video-video-0",
                    "audio": "<GENERATED>-1",
                },
                outputs=["<GENERATED>-2"],
            ),
        ],
    ],
    BUILTIN_VIDEO_CLS: [
        [
            Action(
                tool_name="video_classification",
                inputs={"video": "video"},
                outputs=["<GENERATED>-0"],
            )
        ]
    ],
    BUILTIN_AUDIO_CLS: [
        [
            Action(
                tool_name="audio_classification",
                inputs={"audio": "audio"},
                outputs=["<GENERATED>-0"],
            )
        ]
    ],
    BUILTIN_IMAGE2MUSIC: [
        [
            Action(
                tool_name="image_captioning",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-text-0"],
            ),
            Action(
                tool_name="text_to_music",
                inputs={"text": "<TOOL-GENERATED>-text-0"},
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_VIDEO2MUSIC: [
        [
            Action(
                tool_name="video_captioning",
                inputs={"video": "video"},
                outputs=["<TOOL-GENERATED>-text-0"],
            ),
            Action(
                tool_name="text_to_music",
                inputs={"text": "<TOOL-GENERATED>-text-0"},
                outputs=["<GENERATED>-0"],
            ),
        ],
        [
            Action(
                tool_name="dub_video",
                inputs={
                    "video": "video",
                    "audio": "<GENERATED>-0",
                },
                outputs=["<GENERATED>-1"],
            ),
        ],
    ],
    BUILTIN_SEG_BY_POINTS: [
        [
            Action(
                tool_name="image_segmentation_by_points",
                inputs={"image": "image", "prompt_points": "prompt_points"},
                outputs=["<GENERATED>-0"],
            )
        ]
    ],
    # BUILTIN_SEG_BY_MASK: [
    #     [
    #         Action(
    #             tool_name='image_segmentation_by_mask',
    #             inputs={'image': 'image', 'prompt_mask': 'prompt_mask'},
    #             outputs=['<GENERATED>-0'],
    #         )
    #     ]
    # ],
}
def load_builtin_plans(path):
    """Load plans from a JSON file and rebuild their actions as ``Action`` objects.

    The file must contain a JSON object mapping each query string to a list
    of action groups. Every action is an object with the keys ``tool_name``,
    ``inputs`` and ``outputs``.

    Args:
        path: Location of the JSON file to read.

    Returns:
        A dict mapping each query to a one-element list that holds the list
        of ``Action`` objects rebuilt from the query's first action group.

    Raises:
        IndexError: If a query maps to an empty list of groups.
        KeyError: If an action lacks ``tool_name``, ``inputs`` or ``outputs``.
    """
    import json

    # The context manager closes the handle as soon as parsing is done; the
    # encoding is pinned so the result does not depend on the locale.
    with open(path, "r", encoding="utf-8") as plan_file:
        plans = json.load(plan_file)

    processed_plan = {}
    for query, actions in plans.items():
        # NOTE(review): only the first action group is converted; any further
        # groups in the file are ignored - confirm this is intended.
        first_group = [
            Action(
                tool_name=ac["tool_name"],
                inputs=ac["inputs"],
                outputs=ac["outputs"],
            )
            for ac in actions[0]
        ]
        processed_plan[query] = [first_group]
    return processed_plan