Commit d8936c7 · change styles
Parent(s): 6a710e2

Files changed:
- .gradio/certificate.pem +31 -0
- app.py +141 -181
- assets/logo.png +0 -0
- assets/logo.svg +1 -0
- infer.py +5 -1
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
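
Note: the PEM added above decodes to the ISRG Root X1 root certificate (the Let's Encrypt root CA), which Gradio appears to stash under .gradio/ as a CA bundle for its own outbound HTTPS requests. A minimal sketch to confirm what was committed, assuming the third-party "cryptography" package (not a dependency of this Space):

# Inspect the committed PEM; assumes `pip install cryptography`.
from cryptography import x509

with open(".gradio/certificate.pem", "rb") as f:
    cert = x509.load_pem_x509_certificate(f.read())

# Expected subject for this file:
# CN=ISRG Root X1,O=Internet Security Research Group,C=US
print(cert.subject.rfc4514_string())
print(cert.not_valid_after)  # ISRG Root X1 is valid until 2035-06-04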
app.py CHANGED
@@ -9,7 +9,8 @@ import gradio as gr
 from infer import SeedVLInfer, ConversationModeI18N, ConversationModeCN
 from visualizer import draw_boxes_points_with_labels
 
-infer = SeedVLInfer(model_id=os.getenv('MODEL_ID'))
+infer = SeedVLInfer(model_id=os.getenv('MODEL_ID'),
+                    api_key=os.getenv('API_KEY'))
 
 label_translations = {
     "gr_chatinterface_ofl": {
@@ -59,42 +60,48 @@ label_translations = {
     }
 }
 
+
 def add_escape(text: str):
     return text.replace('<', '\<').replace('>', '\>')
 
+
 def remove_escape(text: str):
     return text.replace('\<', '<').replace('\>', '>')
 
+
 def plot_boxes_points_detections(image_path, message):
     detection_pattern = r'\[\s*{.*?}\s*\]'
-    detection_matches = re.finditer(detection_pattern, message)
+    detection_matches = re.finditer(detection_pattern,
+                                    message,
+                                    flags=re.DOTALL)
     bboxes, categories = [], []
     for match in detection_matches:
         matched_str = match.group(0)
         detections = json.loads(matched_str)
         for detection in detections:
             cat, bbox_str = detection['category'], detection['bbox']
-            bbox_str = bbox_str.replace('<bbox>', '').replace('</bbox>', '')
+            bbox_str = bbox_str.replace('<bbox>',
+                                        '').replace('</bbox>',
+                                                    '').replace('</bbox', '')
             bbox = list(map(float, bbox_str.split(' ')))
             bboxes.append(bbox)
            categories.append(cat)
     if not bboxes:
         box_pattern = r'<bbox>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</bbox>'
         box_matches = re.finditer(box_pattern, message)
-        bboxes = [
-            float(match.group(
+        bboxes = [[
+            float(match.group(1)),
+            float(match.group(2)),
+            float(match.group(3)),
+            float(match.group(4))
+        ] for match in box_matches]
+
     points = []
     if not bboxes:
         point_pattern = r'<point>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</point>'
         point_matches = re.finditer(point_pattern, message)
-        points = [
-            for match in point_matches
-        ]
+        points = [[float(match.group(1)),
+                   float(match.group(2))] for match in point_matches]
 
     if not bboxes and not points:
         return
@@ -110,19 +117,26 @@ def plot_boxes_points_detections(image_path, message):
     if points.size:
         points[:, 0] *= w
         points[:, 1] *= h
-    output_image = draw_boxes_points_with_labels(image, bboxes, points,
+    output_image = draw_boxes_points_with_labels(image, bboxes, points,
+                                                 categories)
     return output_image
 
-def general_chat(inputs: dict, gr_history: list, infer_history: list,
-                 if_thinking: bool, temperature: float):
+
+def general_chat(inputs: dict,
+                 gr_history: list,
+                 infer_history: list,
+                 if_thinking: bool,
+                 temperature: float,
+                 online: bool = False):
     if 'text' in inputs:
         inputs['text'] = remove_escape(inputs['text'])
     mode = ConversationModeI18N.D if if_thinking else ConversationModeI18N.G
-    for response_text, infer_history, finished in infer(
+    for response_text, infer_history, finished in infer(
+            inputs=inputs,
+            history=infer_history,
+            mode=mode,
+            temperature=temperature,
+            online=online):
         if if_thinking:
             reasoning_text, response_text = response_text.split('</think>')
             reasoning_text = reasoning_text.lstrip('<think>')
@@ -141,13 +155,16 @@ def general_chat(inputs: dict, gr_history: list, infer_history: list,
                 "role": "assistant",
                 "content": add_escape(response_text)
             }]
-            if finished and len(inputs.get(
+            if finished and len(inputs.get(
+                    'files', [])) == 1 and not inputs['files'][0].endswith('.mp4'):
                 image_path = inputs['files'][0]
                 response_text = infer_history[-1]['content']
                 try:
                     if if_thinking:
-                        reasoning_text, response_text = response_text.split(
+                        reasoning_text, response_text = response_text.split(
+                            '</think>')
+                    output_image = plot_boxes_points_detections(
+                        image_path, response_text)
                     if output_image is not None:
                         response_message.append({
                             "role": "assistant",
@@ -157,6 +174,7 @@ def general_chat(inputs: dict, gr_history: list, infer_history: list,
             print(e)
     yield response_message, infer_history
 
+
 def online_record_chat(text: str, gr_history: list, gr_webcam_images: list,
                        gr_counter: int, infer_history: list, if_thinking: bool,
                        temperature: float):
@@ -166,74 +184,28 @@ def online_record_chat(text: str, gr_history: list, gr_webcam_images: list,
     inputs = {'text': text, 'files': [webp for webp, _ in gr_webcam_images]}
     yield f'received {len(gr_webcam_images)} new frames, processing...', gr_counter + len(
         gr_webcam_images), infer_history
-    for response_message, infer_history in general_chat(
+    for response_message, infer_history in general_chat(inputs,
+                                                        gr_history,
+                                                        infer_history,
+                                                        if_thinking,
+                                                        temperature,
+                                                        online=True):
         yield response_message, gr.skip(), infer_history
 
-with gr.Blocks() as demo:
+
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Row():
         with gr.Column():
-            gr_title = gr.Markdown('
-            gr.Markdown(
-                """
-                <div style="display:flex; flex-direction:column; gap:10px;">
-                <a
-                    href="https://github.com/ByteDance-Seed/Seed1.5-VL"
-                    target="_blank"
-                    style="
-                        display: inline-flex;
-                        align-items: center;
-                        gap: 8px;
-                        white-space: nowrap;
-                        text-decoration: none;
-                    "
-                >
-                    <img
-                        src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/github/github-original.svg"
-                        alt="GitHub"
-                        width="24"
-                    >
-                    Seed1.5-VL Cookbook
-                </a>
-                </div>
-                """
-            )
-            gr.Markdown(
-                """
-                <div style="display:flex; flex-direction:column; gap:10px;">
-                <a
-                    href="https://huggingface.co/papers/2505.07062"
-                    target="_blank"
-                    style="
-                        display: inline-flex;
-                        align-items: center;
-                        gap: 8px;
-                        white-space: nowrap;
-                        text-decoration: none;
-                    "
-                >
-                    <img
-                        src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
-                        alt="Paper"
-                        width="24"
-                    >
-                    Seed1.5-VL Paper
-                </a>
-                </div>
-                """,
-            )
-            gr.Markdown('')
-            gr.Markdown('')
-            gr.Markdown('')
-
-            gr_lang_selector = gr.Dropdown(choices=["English", "中文"],
-                                           value="English",
-                                           label="🌐 English Interface/中文界面",
-                                           interactive=True,
-                                           min_width=400,
-                                           scale=0)
+            gr_title = gr.Markdown('<h1>Seed1.5-VL</h1>')
+            gr_desc = gr.Markdown('<h3>Advancing Multimodal Understanding and Reasoning.</h3>')
 
+            gr_lang_selector = gr.Dropdown(choices=["English", "中文"],
+                                           value="English",
+                                           label="🌐 English Interface/中文界面",
+                                           interactive=True,
+                                           min_width=400,
+                                           scale=0)
+
     with gr.Tabs():
         with gr.Tab("Offline") as gr_tab_ofl:
             gr_infer_history = gr.State([])
@@ -262,15 +234,16 @@ with gr.Blocks() as demo:
                 ],
                 additional_outputs=[gr_infer_history],
             )
+
             def add_escape_fn(inputs: dict):
                 if inputs and 'text' in inputs:
                     inputs['text'] = add_escape(inputs['text'])
                 return inputs
+
             gr_chatinterface_ofl.textbox.submit(
                 fn=add_escape_fn,
                 inputs=[gr_chatinterface_ofl.saved_input],
-                outputs=[gr_chatinterface_ofl.saved_input]
-            )
+                outputs=[gr_chatinterface_ofl.saved_input])
             gr.on(triggers=[gr_chatinterface_ofl.chatbot.clear],
                   fn=lambda: [],
                   outputs=[gr_infer_history])
@@ -280,8 +253,8 @@ with gr.Blocks() as demo:
                 label=label_translations['gr_thinking']['English'],
             )
             gr_thinking_ofl.change(lambda x: x,
+                                   inputs=gr_thinking_ofl,
+                                   outputs=gr_thinking_hidden)
             gr_temperature_ofl = gr.Slider(
                 minimum=0.0,
                 maximum=2.0,
@@ -290,101 +263,84 @@ with gr.Blocks() as demo:
                 label=label_translations['gr_temperature']['English'],
                 interactive=True)
             gr_temperature_ofl.change(lambda x: x,
+                                      inputs=gr_temperature_ofl,
+                                      outputs=gr_temperature_hidden)
-            gr_clear_button_ofl = gr.Button(
+            gr_clear_button_ofl = gr.Button(
+                value=label_translations['gr_clear_button']['English'])
+
             def clear_history_fn():
                 return None, [], [], [], []
+
             gr_clear_button_ofl.click(
-                fn=clear_history_fn,
+                fn=clear_history_fn,
                 outputs=[
-                    gr_chatinterface_ofl.conversation_id,
-                    gr_chatinterface_ofl.saved_conversations,
+                    gr_chatinterface_ofl.conversation_id,
+                    gr_chatinterface_ofl.saved_conversations,
                     gr_chatinterface_ofl.chatbot,
-                    gr_chatinterface_ofl.chatbot_state,
-                ]
-            )
+                    gr_chatinterface_ofl.chatbot_state, gr_infer_history
+                ])
             with gr.Column(visible=True) as gr_examples_en:
                 gr.Examples(
-                    label=
-                    "text": "Please point out all the hats on people's heads in the image, output concatenated point coordinates like <point>x y</point><point>x y</point>",
-                    "files": ["examples/000000001000.jpeg"]
-                    },
-                    {
-                    "text": """Please detect all plate, photo, kid, cup in the image, and output all objects in the JSON format, which is a list of dict like [{"category": category, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, {"category": category, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]""",
-                    "files": ["examples/000000018380.jpeg"]
-                    }
-                    ],
+                    label=
+                    '7 Examples: text, image, video, multiple images/videos, visual puzzle, points grounding, open-vocabulary detection.',
+                    examples=[{
+                        "text": "Who are you?",
+                        "files": []
+                    }, {
+                        "text": "Introduce this.",
+                        "files": ["examples/bancopy.jpg"]
+                    }, {
+                        "text":
+                        """Find Curry's "Good Night" celebration time.""",
+                        "files": ["examples/I7pTpMjqNRM_1080p_small.mp4"]
+                    }, {
+                        "text":
+                        "Share your feelings.",
+                        "files":
+                        ["examples/newyork.jpg", "examples/beijing.jpg"]
+                    }, {
+                        "text": "Look and answer.",
+                        "files": ["examples/puzzle.jpg"]
+                    }, {
+                        "text":
+                        "Please point out all the hats on people's heads in the image, output concatenated point coordinates like <point>x y</point><point>x y</point>",
+                        "files": ["examples/000000001000.jpeg"]
+                    }, {
+                        "text":
+                        """Please detect all plate, photo, kid, cup in the image, and output all objects in the JSON format, which is a list of dict like [{"category": category, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, {"category": category, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]""",
+                        "files": ["examples/000000018380.jpeg"]
+                    }],
                     inputs=[gr_chatinterface_ofl.textbox],
                 )
             with gr.Column(visible=False) as gr_examples_cn:
                 gr.Examples(
                     label='七个示例:文本,图像,视频,多个图像/视频,视觉解谜,坐标定位,开放式物体检测。',
-                    examples=[
-                    {
-                    "text": "请点出图像中所有戴在头上的帽子, 输出串联的点坐标<point>x y</point><point>x y</point>",
-                    "files": ["examples/000000001000.jpeg"]
-                    },
-                    {
-                    "text": """请检测图像中所有的盘子、照片、小孩和杯子。请以JSON格式输出一个由字典组成的列表,就像:[{"category": 类别, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, {"category": 类别, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]""",
-                    "files": ["examples/000000018380.jpeg"]
-                    }
-                    ],
+                    examples=[{
+                        "text": "你是谁?",
+                        "files": []
+                    }, {
+                        "text": "介绍一下。",
+                        "files": ["examples/bancopy.jpg"]
+                    }, {
+                        "text": "找到库里的“晚安”庆祝时间段。",
+                        "files": ["examples/I7pTpMjqNRM_1080p_small.mp4"]
+                    }, {
+                        "text":
+                        "你有什么感想?",
+                        "files":
+                        ["examples/newyork.jpg", "examples/beijing.jpg"]
+                    }, {
+                        "text": "看图回答。",
+                        "files": ["examples/puzzle.jpg"]
+                    }, {
+                        "text":
+                        "请点出图像中所有戴在头上的帽子, 输出串联的点坐标<point>x y</point><point>x y</point>",
+                        "files": ["examples/000000001000.jpeg"]
+                    }, {
+                        "text":
+                        """请检测图像中所有的盘子、照片、小孩和杯子。请以JSON格式输出一个由字典组成的列表,就像:[{"category": 类别, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, {"category": 类别, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]""",
+                        "files": ["examples/000000018380.jpeg"]
+                    }],
                     inputs=[gr_chatinterface_ofl.textbox],
                 )
         with gr.Tab("Online") as gr_tab_ol:
@@ -473,19 +429,23 @@ with gr.Blocks() as demo:
                 lambda x: x,
                 inputs=gr_temperature_ol,
                 outputs=gr_temperature_hidden)
-            gr_clear_button_ol = gr.Button(
+            gr_clear_button_ol = gr.Button(
+                value=label_translations['gr_clear_button']
+                ['English'])
+
             def clear_history_fn():
                 return None, [], [], [], []
+
             gr_clear_button_ol.click(
-                fn=clear_history_fn,
+                fn=clear_history_fn,
                 outputs=[
-                    gr_chatinterface_ol.conversation_id,
-                    gr_chatinterface_ol.saved_conversations,
+                    gr_chatinterface_ol.conversation_id,
+                    gr_chatinterface_ol.
+                    saved_conversations,
                     gr_chatinterface_ol.chatbot,
-                    gr_chatinterface_ol.chatbot_state,
+                    gr_chatinterface_ol.chatbot_state,
                     gr_infer_history_ol
-                ]
-            )
+                ])
 
             def update_lang(lang: str):
                 return (
assets/logo.png ADDED
assets/logo.svg ADDED
infer.py CHANGED
@@ -7,7 +7,11 @@ import base64
 import requests
 
 import torch
-import decord
+try:
+    import decord
+except ImportError:
+    print("Please install decord first.")
+    pass
 import numpy as np
 from PIL import Image, ImageSequence
 from torchvision.io import read_image, encode_jpeg
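
Note: the guarded import above turns a hard failure into a printed hint when decord is absent. For context, a hedged sketch of the kind of frame sampling decord is typically used for; infer.py's actual sampling logic is not shown in this diff, and only the example video path is taken from the Space's files:

import numpy as np
import decord

vr = decord.VideoReader('examples/I7pTpMjqNRM_1080p_small.mp4')
# Pick up to 8 evenly spaced frame indices across the clip.
indices = np.linspace(0, len(vr) - 1, num=min(8, len(vr)), dtype=int)
frames = vr.get_batch(indices).asnumpy()  # (N, H, W, 3) uint8 array
print(frames.shape)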