[fix] fix mask np.uint8 bug
- app/src/brushedit_app.py  +17 -17
- app/src/vlm_template.py   +3 -3
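Summary: in app/src/brushedit_app.py, masks are cast to np.uint8 before PIL conversion (np.squeeze(mask).astype(np.uint8) ahead of Image.fromarray), the mask produced by the SAM/GroundingDINO call gets an explicit .astype(np.uint8), and the LLaVA-Next and Qwen2-VL loaders now pass torch_dtype=torch_dtype, device_map=device. In app/src/vlm_template.py, device is set to "cuda" and Qwen2-VL-7B-Instruct is loaded from the local models/vlms snapshot when it exists, otherwise from the Hub.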
app/src/brushedit_app.py
CHANGED
@@ -528,23 +528,23 @@ def update_vlm_model(vlm_name):
         else:
             if os.path.exists(vlm_local_path):
                 vlm_processor = LlavaNextProcessor.from_pretrained(vlm_local_path)
-                vlm_model = LlavaNextForConditionalGeneration.from_pretrained(vlm_local_path, torch_dtype=
+                vlm_model = LlavaNextForConditionalGeneration.from_pretrained(vlm_local_path, torch_dtype=torch_dtype, device_map=device)
             else:
                 if vlm_name == "llava-v1.6-mistral-7b-hf (Preload)":
                     vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
-                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=
+                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch_dtype, device_map=device)
                 elif vlm_name == "llama3-llava-next-8b-hf (Preload)":
                     vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llama3-llava-next-8b-hf")
-                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llama3-llava-next-8b-hf", torch_dtype=
+                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llama3-llava-next-8b-hf", torch_dtype=torch_dtype, device_map=device)
                 elif vlm_name == "llava-v1.6-vicuna-13b-hf (Preload)":
                     vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-vicuna-13b-hf")
-                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-vicuna-13b-hf", torch_dtype=
+                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-vicuna-13b-hf", torch_dtype=torch_dtype, device_map=device)
                 elif vlm_name == "llava-v1.6-34b-hf (Preload)":
                     vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-34b-hf")
-                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-34b-hf", torch_dtype=
+                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-34b-hf", torch_dtype=torch_dtype, device_map=device)
                 elif vlm_name == "llava-next-72b-hf (Preload)":
                     vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-next-72b-hf")
-                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-next-72b-hf", torch_dtype=
+                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-next-72b-hf", torch_dtype=torch_dtype, device_map=device)
     elif vlm_type == "qwen2-vl":
         if vlm_processor != "" and vlm_model != "":
             vlm_model.to(device)

@@ -552,17 +552,17 @@ def update_vlm_model(vlm_name):
         else:
             if os.path.exists(vlm_local_path):
                 vlm_processor = Qwen2VLProcessor.from_pretrained(vlm_local_path)
-                vlm_model = Qwen2VLForConditionalGeneration.from_pretrained(vlm_local_path, torch_dtype=
+                vlm_model = Qwen2VLForConditionalGeneration.from_pretrained(vlm_local_path, torch_dtype=torch_dtype, device_map=device)
             else:
                 if vlm_name == "qwen2-vl-2b-instruct (Preload)":
                     vlm_processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
-                    vlm_model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", torch_dtype=
+                    vlm_model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch_dtype, device_map=device)
                 elif vlm_name == "qwen2-vl-7b-instruct (Preload)":
                     vlm_processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
-                    vlm_model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", torch_dtype=
+                    vlm_model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", torch_dtype=torch_dtype, device_map=device)
                 elif vlm_name == "qwen2-vl-72b-instruct (Preload)":
                     vlm_processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-72B-Instruct")
-                    vlm_model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-72B-Instruct", torch_dtype=
+                    vlm_model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-72B-Instruct", torch_dtype=torch_dtype, device_map=device)
     elif vlm_type == "openai":
         pass
     return "success"

@@ -654,10 +654,10 @@ def process(input_image,
         original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
         original_image = np.array(original_image)
         if input_mask is not None:
-            input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
+            input_mask = resize(Image.fromarray(np.squeeze(input_mask).astype(np.uint8)), target_width=int(output_w), target_height=int(output_h))
             input_mask = np.array(input_mask)
         if original_mask is not None:
-            original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
+            original_mask = resize(Image.fromarray(np.squeeze(original_mask).astype(np.uint8)), target_width=int(output_w), target_height=int(output_h))
             original_mask = np.array(original_mask)
         gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
     else:

@@ -673,10 +673,10 @@ def process(input_image,
         original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
         original_image = np.array(original_image)
         if input_mask is not None:
-            input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
+            input_mask = resize(Image.fromarray(np.squeeze(input_mask).astype(np.uint8)), target_width=int(output_w), target_height=int(output_h))
             input_mask = np.array(input_mask)
         if original_mask is not None:
-            original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
+            original_mask = resize(Image.fromarray(np.squeeze(original_mask).astype(np.uint8)), target_width=int(output_w), target_height=int(output_h))
             original_mask = np.array(original_mask)
 
     if invert_mask_state:

@@ -722,7 +722,7 @@ def process(input_image,
                 sam_predictor,
                 sam_automask_generator,
                 groundingdino_model,
-                device)
+                device).astype(np.uint8)
     except Exception as e:
         raise gr.Error("Please select the correct VLM model and input the correct API Key first!")
 

@@ -831,9 +831,9 @@ def process_mask(input_image,
                 sam_predictor,
                 sam_automask_generator,
                 groundingdino_model,
-                device)
+                device).astype(np.uint8)
     else:
-        original_mask = input_mask
+        original_mask = input_mask.astype(np.uint8)
         category = None
 
     ## resize mask if needed
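Why the cast matters (context, not part of the commit): Pillow's Image.fromarray cannot build an image from a boolean mask, which is the typical dtype coming out of SAM-style predictors, so the resize path raised a TypeError until the mask was squeezed and cast to np.uint8. A minimal sketch of the failure and of the fix applied above, using a stand-in resize helper rather than the app's own:

import numpy as np
from PIL import Image

def resize(img: Image.Image, target_width: int, target_height: int) -> Image.Image:
    # Stand-in for the app's resize helper: plain PIL resize to the target size.
    return img.resize((target_width, target_height))

# A SAM-style mask often arrives as bool, sometimes with a trailing channel axis.
mask = np.random.rand(512, 512, 1) > 0.5            # dtype=bool, shape (H, W, 1)

# Before the fix: Image.fromarray rejects a boolean array.
try:
    Image.fromarray(np.squeeze(mask))
except TypeError as e:
    print("fails:", e)                               # "Cannot handle this data type: ..."

# After the fix: squeeze the channel axis and cast to uint8 before PIL conversion.
mask_u8 = np.squeeze(mask).astype(np.uint8)          # shape (H, W), dtype=uint8
resized = resize(Image.fromarray(mask_u8), target_width=640, target_height=480)
mask_out = np.array(resized)                         # back to a NumPy array, as in process()
print(mask_out.shape, mask_out.dtype)

The same reasoning is behind the .astype(np.uint8) appended to the mask returned by the SAM/GroundingDINO call and to input_mask in process_mask.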
app/src/vlm_template.py
CHANGED
@@ -7,7 +7,7 @@ from transformers import (
     Qwen2VLForConditionalGeneration, Qwen2VLProcessor
 )
 ## init device
-device = "
+device = "cuda"
 torch_dtype = torch.float16
 
 

@@ -103,10 +103,10 @@ vlms_list = [
         ),
         "model": Qwen2VLForConditionalGeneration.from_pretrained(
             "models/vlms/Qwen2-VL-7B-Instruct", torch_dtype=torch_dtype, device_map=device
-        ).to(
+        ).to(device) if os.path.exists("models/vlms/Qwen2-VL-7B-Instruct") else
         Qwen2VLForConditionalGeneration.from_pretrained(
             "Qwen/Qwen2-VL-7B-Instruct", torch_dtype=torch_dtype, device_map=device
-        ).to(
+        ).to(device),
     },
     {
         "type": "openai",
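Context, not part of the commit: the vlm_template.py hunk completes a local-first loading pattern, so the Qwen2-VL entry now reads as "use the snapshot under models/vlms if it exists, otherwise pull from the Hub, and move the result to the fixed cuda device". A hedged, stand-alone sketch of that pattern (paths and model IDs follow the diff; the surrounding vlms_list entry is simplified):

import os
import torch
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor

## init device
device = "cuda"
torch_dtype = torch.float16

local_path = "models/vlms/Qwen2-VL-7B-Instruct"
hub_id = "Qwen/Qwen2-VL-7B-Instruct"

# Prefer the local snapshot when present, otherwise download from the Hub;
# either way the model ends up on `device` in float16.
# Note: instantiating this loads ~7B-parameter weights, so it is heavy to run.
model = (
    Qwen2VLForConditionalGeneration.from_pretrained(
        local_path, torch_dtype=torch_dtype, device_map=device
    ).to(device)
    if os.path.exists(local_path)
    else Qwen2VLForConditionalGeneration.from_pretrained(
        hub_id, torch_dtype=torch_dtype, device_map=device
    ).to(device)
)
processor = Qwen2VLProcessor.from_pretrained(local_path if os.path.exists(local_path) else hub_id)

With device_map=device the weights are already placed on the GPU, so the trailing .to(device) is essentially a no-op kept to mirror the diff.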