Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -450,7 +450,8 @@ tokenizer = AutoTokenizer.from_pretrained("khang119966/Vintern-1B-v3_5-explainab
 @spaces.GPU
 def generate_video(image, prompt, max_tokens):
     print(image)
-    pixel_values, target_aspect_ratio = load_image(image, max_num=6)
+    pixel_values, target_aspect_ratio = load_image(image, max_num=6)
+    pixel_values = pixel_values.to(torch.bfloat16).cuda()
     generation_config = dict(max_new_tokens=int(max_tokens), do_sample=False, num_beams=3, repetition_penalty=2.5)
     response, query = model.chat(tokenizer, pixel_values, '<image>\n' + prompt, generation_config, return_history=False, \
                                  attention_visualize=True, last_visualize_layers=7, raw_image_path=test_image, target_aspect_ratio=target_aspect_ratio)
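The added line moves the preprocessed image tensor to the model's dtype and device before inference; load_image itself is not part of this diff. A minimal sketch of the contract it is assumed to satisfy (a single-tile stand-in; the real helper reportedly performs dynamic tiling into up to max_num 448x448 crops and returns the chosen tile grid):

    import torch
    import torchvision.transforms as T
    from PIL import Image

    IMAGENET_MEAN, IMAGENET_STD = (0.485, 0.456, 0.406), (0.229, 0.224, 0.225)

    def load_image_sketch(path, input_size=448, max_num=6):
        # Hypothetical stand-in for the Space's load_image, not its actual code:
        # returns (pixel_values, target_aspect_ratio) as the call site expects.
        img = Image.open(path).convert("RGB")
        transform = T.Compose([
            T.Resize((input_size, input_size)),
            T.ToTensor(),
            T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
        ])
        pixel_values = transform(img).unsqueeze(0)  # [1, 3, 448, 448]
        return pixel_values, (1, 1)                 # 1x1 tile grid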
@@ -458,6 +459,65 @@ def generate_video(image, prompt, max_tokens):
     generation_output = response
     raw_image_path = image
 
+    attentions_tensors = []
+    for tok_ in generation_output["attentions"]:
+        attentions_tensors.append([])
+        for lay_ in tok_:
+            attentions_tensors[-1].append(lay_.detach().cpu().type(torch.float).numpy())
+    attention_scores = attentions_tensors
+    query_ = tokenizer(query)
+    start_img_token_index = int(np.where(np.array(query_["input_ids"]) == tokenizer("<img>")["input_ids"][0])[0] + 1)
+    end_img_token_index = int(np.where(np.array(query_["input_ids"]) == tokenizer("</img>")["input_ids"][0])[0] - 256)
+    if end_img_token_index - start_img_token_index == 0:
+        end_img_token_index = int(np.where(np.array(query_["input_ids"]) == tokenizer("</img>")["input_ids"][0])[0])
+
+    # Read the original image
+    image = cv2.imread(raw_image_path)
+    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+    # Optionally downscale the frames to reduce the output size
+    scale_factor = 1.  # 1.0 keeps full size; set below 1.0 to shrink the frames
+    alpha = 0.4
+    # Collected frames for the output animation
+    visualization_frames = []
+    # Text generated so far
+    generated_text = ""
+    frame_step = 1
+    input_token = ""
+
+    params_for_text = []
+    params_for_hidden = []
+    heatmap_imgs = []
+    top_visual_tokens_focus_tables = []
+    # Iterate over the generated tokens
+    for index_focus in tqdm.tqdm(range(0, generation_output.sequences.shape[1], frame_step)):
+        predict_token_text = tokenizer.decode(generation_output.sequences[0, index_focus])
+        generated_text += predict_token_text  # append the decoded token
+        # Build an averaged heatmap from the attention layers
+        heat_maps, top_visual_tokens_focus = visualize_attention_hiddenstate(attention_scores[index_focus], head=None,
+                                                 start_img_token_index=start_img_token_index, end_img_token_index=end_img_token_index,
+                                                 target_aspect_ratio=target_aspect_ratio)
+
+        heatmap = np.array(heat_maps[0])
+        # Resize the heatmap to the original image size
+        heatmap = cv2.resize(heatmap, (image.shape[1], image.shape[0]), interpolation=cv2.INTER_CUBIC)
+        # Smooth the heatmap
+        heatmap_smooth = gaussian_filter(heatmap, sigma=1)
+        # Normalize the heatmap to 0-255
+        heatmap_norm = cv2.normalize(heatmap_smooth, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
+        heatmap_color = cv2.applyColorMap(heatmap_norm, cv2.COLORMAP_JET)
+        heatmap_color = cv2.cvtColor(heatmap_color, cv2.COLOR_BGR2RGB)
+        # Overlay the heatmap on the original image
+        overlay = cv2.addWeighted(image, 1 - alpha, heatmap_color, alpha, 0)
+
+        prev_text = generated_text[:-len(input_token) - len(predict_token_text)] + " "
+        params_for_text.append((prev_text, input_token, predict_token_text))
+
+        hidden_table = extract_next_token_table_data(model, tokenizer, generation_output, index_focus)
+        params_for_hidden.append((hidden_table, predict_token_text))
+
+        input_token = predict_token_text
+        heatmap_imgs.append(overlay)
+
     return "path_to_generated_video.mp4"
 
 with gr.Blocks() as demo:
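The added block first detaches every per-token, per-layer attention tensor to float32 numpy (bfloat16 tensors cannot be converted to numpy directly), then locates the image-token span in the query: the position after <img> starts it, and the position of </img> minus 256 ends it, which appears to assume 256 visual tokens per tile. visualize_attention_hiddenstate is defined elsewhere in app.py; a minimal sketch of what such a helper is assumed to compute for one generation step (the shapes and the 16x16 grid are assumptions for a single 448-pixel tile, not the Space's actual code):

    import numpy as np

    def image_attention_grid(step_attn, start, end, last_layers=7, grid=16):
        # Hypothetical helper. step_attn: list of per-layer arrays shaped
        # [batch, heads, q_len, k_len], as built by the attentions_tensors loop.
        per_layer = []
        for layer in step_attn[-last_layers:]:      # mirrors last_visualize_layers=7
            attn = layer[0].mean(axis=0)            # drop batch, average the heads
            per_layer.append(attn[-1, start:end])   # newest query token -> image keys
        heat = np.mean(per_layer, axis=0)           # average the selected layers
        return heat.reshape(grid, grid)             # 256 image tokens -> 16x16 map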
@@ -468,8 +528,8 @@ with gr.Blocks() as demo:
     image = gr.Image(label="Upload your image", type='filepath')
     prompt = gr.Textbox(label="Describe your prompt", value="List all the text.")
     max_tokens = gr.Slider(label="Max token output (⚠️ Choose <100 for faster response)", minimum=1, maximum=512, value=50)
-    btn = gr.Button("
-    video = gr.Video(label="
+    btn = gr.Button("Inference")
+    video = gr.Video(label="Visualization Video")
 
     btn.click(fn=generate_video, inputs=[image, prompt, max_tokens], outputs=video)
 
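Note that return "path_to_generated_video.mp4" is still a placeholder after this commit: heatmap_imgs, params_for_text, and params_for_hidden are collected but never rendered to a file. A minimal sketch of one way the RGB overlay frames could be encoded to an mp4 with OpenCV (the file name and frame rate are arbitrary choices, not the Space's code):

    import cv2

    def write_video(frames, path="attention_visualization.mp4", fps=2):
        # frames: list of RGB uint8 arrays with identical (H, W, 3) shape
        h, w = frames[0].shape[:2]
        writer = cv2.VideoWriter(path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (w, h))
        for frame in frames:
            writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))  # OpenCV expects BGR
        writer.release()
        return path

With a helper like this, the function could end with return write_video(heatmap_imgs) instead of the hard-coded path.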
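For reference, the click wiring passes the three component values to generate_video and plays whatever file path it returns in the gr.Video output. Because the image component uses type='filepath', the handler receives a local path string, so it can also be exercised outside the UI; a hedged usage sketch (the sample file name is hypothetical):

    # Launch the demo, or call the handler directly for a quick smoke test:
    # generate_video("sample.jpg", "List all the text.", 50)
    demo.launch()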