Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -144,6 +144,7 @@ tokenizer = AutoTokenizer.from_pretrained("khang119966/Vintern-1B-v3_5-explainab
|
|
| 144 |
|
| 145 |
@spaces.GPU
|
| 146 |
def generate_video(image, prompt, max_tokens):
|
|
|
|
| 147 |
pixel_values, target_aspect_ratio = load_image(image, max_num=6).to(torch.bfloat16).cuda()
|
| 148 |
generation_config = dict(max_new_tokens= int(max_tokens), do_sample=False, num_beams = 3, repetition_penalty=2.5)
|
| 149 |
response, query = model.chat(tokenizer, pixel_values, '<image>\n'+prompt, generation_config, return_history=False, \
|
|
@@ -156,9 +157,9 @@ with gr.Blocks() as demo:
|
|
| 156 |
|
| 157 |
with gr.Row():
|
| 158 |
with gr.Column():
|
| 159 |
-
image = gr.Image(label="Upload your image"
|
| 160 |
-
prompt = gr.Textbox(label="Describe your prompt")
|
| 161 |
-
max_tokens = gr.Slider(label="Max token output (⚠️ Choose <100 for faster response)", minimum=1, maximum=512, value=
|
| 162 |
btn = gr.Button("Attention Video")
|
| 163 |
video = gr.Video(label="Attention Video")
|
| 164 |
|
|
|
|
| 144 |
|
| 145 |
@spaces.GPU
|
| 146 |
def generate_video(image, prompt, max_tokens):
|
| 147 |
+
print(image)
|
| 148 |
pixel_values, target_aspect_ratio = load_image(image, max_num=6).to(torch.bfloat16).cuda()
|
| 149 |
generation_config = dict(max_new_tokens= int(max_tokens), do_sample=False, num_beams = 3, repetition_penalty=2.5)
|
| 150 |
response, query = model.chat(tokenizer, pixel_values, '<image>\n'+prompt, generation_config, return_history=False, \
|
|
|
|
| 157 |
|
| 158 |
with gr.Row():
|
| 159 |
with gr.Column():
|
| 160 |
+
image = gr.Image(label="Upload your image")
|
| 161 |
+
prompt = gr.Textbox(label="Describe your prompt", value="List all the text." )
|
| 162 |
+
max_tokens = gr.Slider(label="Max token output (⚠️ Choose <100 for faster response)", minimum=1, maximum=512, value=50)
|
| 163 |
btn = gr.Button("Attention Video")
|
| 164 |
video = gr.Video(label="Attention Video")
|
| 165 |
|