Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -450,7 +450,8 @@ tokenizer = AutoTokenizer.from_pretrained("khang119966/Vintern-1B-v3_5-explainab
 @spaces.GPU
 def generate_video(image, prompt, max_tokens):
     print(image)
-    pixel_values, target_aspect_ratio = load_image(image, max_num=6)
+    pixel_values, target_aspect_ratio = load_image(image, max_num=6)
+    pixel_values = pixel_values.to(torch.bfloat16).cuda()
     generation_config = dict(max_new_tokens=int(max_tokens), do_sample=False, num_beams=3, repetition_penalty=2.5)
     response, query = model.chat(tokenizer, pixel_values, '<image>\n' + prompt, generation_config, return_history=False, \
                                  attention_visualize=True, last_visualize_layers=7, raw_image_path=test_image, target_aspect_ratio=target_aspect_ratio)
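The added line moves the preprocessed image tensor to the model's dtype and device before inference; load_image itself is not part of this diff. A minimal sketch of the contract it is assumed to satisfy (a single-tile stand-in; the real helper reportedly performs dynamic tiling into up to max_num 448x448 crops and returns the chosen tile grid):

    import torch
    import torchvision.transforms as T
    from PIL import Image

    IMAGENET_MEAN, IMAGENET_STD = (0.485, 0.456, 0.406), (0.229, 0.224, 0.225)

    def load_image_sketch(path, input_size=448, max_num=6):
        # Hypothetical stand-in for the Space's load_image, not its actual code:
        # returns (pixel_values, target_aspect_ratio) as the call site expects.
        img = Image.open(path).convert("RGB")
        transform = T.Compose([
            T.Resize((input_size, input_size)),
            T.ToTensor(),
            T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
        ])
        pixel_values = transform(img).unsqueeze(0)  # [1, 3, 448, 448]
        return pixel_values, (1, 1)                 # 1x1 tile grid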
@@ -458,6 +459,65 @@ def generate_video(image, prompt, max_tokens):
     generation_output = response
     raw_image_path = image
 
+    attentions_tensors = []
+    for tok_ in generation_output["attentions"]:
+        attentions_tensors.append([])
+        for lay_ in tok_:
+            attentions_tensors[-1].append(lay_.detach().cpu().type(torch.float).numpy())
+    attention_scores = attentions_tensors
+    query_ = tokenizer(query)
+    start_img_token_index = int(np.where(np.array(query_["input_ids"]) == tokenizer("<img>")["input_ids"][0])[0] + 1)
+    end_img_token_index = int(np.where(np.array(query_["input_ids"]) == tokenizer("</img>")["input_ids"][0])[0] - 256)
+    if end_img_token_index - start_img_token_index == 0:
+        end_img_token_index = int(np.where(np.array(query_["input_ids"]) == tokenizer("</img>")["input_ids"][0])[0])
+
+    # Read the original image
+    image = cv2.imread(raw_image_path)
+    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+    # Optionally downscale the frames to reduce the output size
+    scale_factor = 1.  # 1.0 keeps full size; set below 1.0 to shrink the frames
+    alpha = 0.4
+    # Collected frames for the output animation
+    visualization_frames = []
+    # Text generated so far
+    generated_text = ""
+    frame_step = 1
+    input_token = ""
+
+    params_for_text = []
+    params_for_hidden = []
+    heatmap_imgs = []
+    top_visual_tokens_focus_tables = []
+    # Iterate over the generated tokens
+    for index_focus in tqdm.tqdm(range(0, generation_output.sequences.shape[1], frame_step)):
+        predict_token_text = tokenizer.decode(generation_output.sequences[0, index_focus])
+        generated_text += predict_token_text  # append the decoded token
+        # Build an averaged heatmap from the attention layers
+        heat_maps, top_visual_tokens_focus = visualize_attention_hiddenstate(attention_scores[index_focus], head=None,
+                                                 start_img_token_index=start_img_token_index, end_img_token_index=end_img_token_index,
+                                                 target_aspect_ratio=target_aspect_ratio)
+
+        heatmap = np.array(heat_maps[0])
+        # Resize the heatmap to the original image size
+        heatmap = cv2.resize(heatmap, (image.shape[1], image.shape[0]), interpolation=cv2.INTER_CUBIC)
+        # Smooth the heatmap
+        heatmap_smooth = gaussian_filter(heatmap, sigma=1)
+        # Normalize the heatmap to 0-255
+        heatmap_norm = cv2.normalize(heatmap_smooth, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
+        heatmap_color = cv2.applyColorMap(heatmap_norm, cv2.COLORMAP_JET)
+        heatmap_color = cv2.cvtColor(heatmap_color, cv2.COLOR_BGR2RGB)
+        # Overlay the heatmap on the original image
+        overlay = cv2.addWeighted(image, 1 - alpha, heatmap_color, alpha, 0)
+
+        prev_text = generated_text[:-len(input_token) - len(predict_token_text)] + " "
+        params_for_text.append((prev_text, input_token, predict_token_text))
+
+        hidden_table = extract_next_token_table_data(model, tokenizer, generation_output, index_focus)
+        params_for_hidden.append((hidden_table, predict_token_text))
+
+        input_token = predict_token_text
+        heatmap_imgs.append(overlay)
+
     return "path_to_generated_video.mp4"
 
 with gr.Blocks() as demo:
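The added block first detaches every per-token, per-layer attention tensor to float32 numpy (bfloat16 tensors cannot be converted to numpy directly), then locates the image-token span in the query: the position after <img> starts it, and the position of </img> minus 256 ends it, which appears to assume 256 visual tokens per tile. visualize_attention_hiddenstate is defined elsewhere in app.py; a minimal sketch of what such a helper is assumed to compute for one generation step (the shapes and the 16x16 grid are assumptions for a single 448-pixel tile, not the Space's actual code):

    import numpy as np

    def image_attention_grid(step_attn, start, end, last_layers=7, grid=16):
        # Hypothetical helper. step_attn: list of per-layer arrays shaped
        # [batch, heads, q_len, k_len], as built by the attentions_tensors loop.
        per_layer = []
        for layer in step_attn[-last_layers:]:      # mirrors last_visualize_layers=7
            attn = layer[0].mean(axis=0)            # drop batch, average the heads
            per_layer.append(attn[-1, start:end])   # newest query token -> image keys
        heat = np.mean(per_layer, axis=0)           # average the selected layers
        return heat.reshape(grid, grid)             # 256 image tokens -> 16x16 map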
@@ -468,8 +528,8 @@ with gr.Blocks() as demo:
     image = gr.Image(label="Upload your image", type='filepath')
     prompt = gr.Textbox(label="Describe your prompt", value="List all the text.")
     max_tokens = gr.Slider(label="Max token output (⚠️ Choose <100 for faster response)", minimum=1, maximum=512, value=50)
-    btn = gr.Button("
-    video = gr.Video(label="
+    btn = gr.Button("Inference")
+    video = gr.Video(label="Visualization Video")
 
     btn.click(fn=generate_video, inputs=[image, prompt, max_tokens], outputs=video)
 
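Note that return "path_to_generated_video.mp4" is still a placeholder after this commit: heatmap_imgs, params_for_text, and params_for_hidden are collected but never rendered to a file. A minimal sketch of one way the RGB overlay frames could be encoded to an mp4 with OpenCV (the file name and frame rate are arbitrary choices, not the Space's code):

    import cv2

    def write_video(frames, path="attention_visualization.mp4", fps=2):
        # frames: list of RGB uint8 arrays with identical (H, W, 3) shape
        h, w = frames[0].shape[:2]
        writer = cv2.VideoWriter(path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (w, h))
        for frame in frames:
            writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))  # OpenCV expects BGR
        writer.release()
        return path

With a helper like this, the function could end with return write_video(heatmap_imgs) instead of the hard-coded path.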
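For reference, the click wiring passes the three component values to generate_video and plays whatever file path it returns in the gr.Video output. Because the image component uses type='filepath', the handler receives a local path string, so it can also be exercised outside the UI; a hedged usage sketch (the sample file name is hypothetical):

    # Launch the demo, or call the handler directly for a quick smoke test:
    # generate_video("sample.jpg", "List all the text.", 50)
    demo.launch()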