Spaces:
Runtime error
Runtime error
| from typing import Tuple | |
| import gradio as gr | |
| import numpy as np | |
| import supervision as sv | |
| import torch | |
| import time | |
| from PIL import Image | |
| from torchvision.transforms import ToTensor | |
| # from transformers import SamModel, SamProcessor | |
| from efficient_sam.build_efficient_sam import build_efficient_sam_vits | |
| from efficientvit.models.efficientvit.sam import EfficientViTSamPredictor | |
| from efficientvit.sam_model_zoo import create_sam_model | |
# Markdown text passed to ``gr.Markdown`` as the first element of the demo
# page: title, links to the three papers, a link to the source repository and
# a note explaining why the SAM model is currently commented out.
MARKDOWN = """
# EfficientViT-SAM vs EfficientSAM vs SAM
Paper source:
[EfficientViT-SAM](https://arxiv.org/abs/2402.05008) and [EfficientSAM](https://arxiv.org/abs/2312.00863) and
[SAM](https://arxiv.org/abs/2304.02643)
\n
Github Source Code: [Link](https://github.com/pg56714/Segment-Anything-Arena)
\n
The SAM model takes one minute to run to completion, which slow down other models. Currently, EfficientViT-SAM and EfficientSAM are displayed first.
The source code for all three models is available, but the SAM is commented out.
"""
# Example rows for the demo. Each row lines up with ``box_inputs``:
# image URL, then x_min, y_min, x_max, y_max of the box prompt.
BOX_EXAMPLES = [
    ["https://media.roboflow.com/efficient-sam/corgi.jpg", 801, 510, 1782, 993],
]

# Drawing style: light grey for the box prompt, red for the predicted mask.
PROMPT_COLOR = sv.Color.from_hex("#D3D3D3")
MASK_COLOR = sv.Color.from_hex("#FF0000")
MASK_ANNOTATOR = sv.MaskAnnotator(color=MASK_COLOR, color_lookup=sv.ColorLookup.INDEX)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# SAM_MODEL = SamModel.from_pretrained("facebook/sam-vit-huge").to(DEVICE).eval()
# SAM_PROCESSOR = SamProcessor.from_pretrained("facebook/sam-vit-huge")

# EfficientSAM model, placed on DEVICE and switched to eval mode.
EFFICIENT_SAM_MODEL = build_efficient_sam_vits().to(DEVICE).eval()

# EfficientViT-SAM ("xl1" variant, weights read from ./weights/xl1.pt),
# placed on DEVICE in eval mode and wrapped in its predictor helper.
_efficientvit_sam_model = create_sam_model(name="xl1", weight_url="./weights/xl1.pt")
EFFICIENTVITSAM = EfficientViTSamPredictor(_efficientvit_sam_model.to(DEVICE).eval())
def annotate_image_with_box_prompt_result(
    image: np.ndarray,
    detections: sv.Detections,
    x_min: int,
    y_min: int,
    x_max: int,
    y_max: int,
) -> np.ndarray:
    """Draw the predicted mask and the box prompt on top of ``image``.

    The last (channel) axis is reversed before drawing and reversed again on
    the way out, so the returned array has the same channel order as the
    input (the drawing itself happens on the reversed copy; the original code
    names it "bgr").

    Args:
        image: Three-dimensional image array (height, width, channels).
        detections: Detections whose masks are overlaid with MASK_ANNOTATOR.
        x_min: Left edge of the box prompt.
        y_min: Top edge of the box prompt.
        x_max: Right edge of the box prompt.
        y_max: Bottom edge of the box prompt.

    Returns:
        The annotated image, as a channel-reversed view of the drawing canvas.
    """
    height, width, _channels = image.shape

    # Work on a channel-reversed copy so the caller's array is never modified.
    canvas = image[:, :, ::-1].copy()
    canvas = MASK_ANNOTATOR.annotate(scene=canvas, detections=detections)

    prompt_rect = sv.Rect(
        x=x_min,
        y=y_min,
        width=int(x_max - x_min),
        height=int(y_max - y_min),
    )
    # Line thickness is derived from the image resolution.
    line_thickness = sv.calculate_optimal_line_thickness(resolution_wh=(width, height))
    canvas = sv.draw_rectangle(
        scene=canvas,
        rect=prompt_rect,
        color=PROMPT_COLOR,
        thickness=line_thickness,
    )

    # Undo the channel reversal before handing the image back.
    return canvas[:, :, ::-1]
def efficientvit_sam_box_inference(
    image: np.ndarray, x_min: int, y_min: int, x_max: int, y_max: int
) -> np.ndarray:
    """Segment the boxed region with EfficientViT-SAM and annotate the image.

    The box is handed to the predictor as a single ``[x_min, y_min, x_max,
    y_max]`` row. The elapsed wall-clock time, including annotation, is
    printed to stdout.

    Returns:
        ``image`` with the predicted mask and the box prompt drawn on it.
    """
    t1 = time.time()

    box_prompt = np.array([[x_min, y_min, x_max, y_max]])
    EFFICIENTVITSAM.set_image(image)
    # Only the first element of the predictor output (the masks) is used.
    masks = EFFICIENTVITSAM.predict(box=box_prompt, multimask_output=False)[0]

    result = annotate_image_with_box_prompt_result(
        image=image,
        detections=sv.Detections(xyxy=sv.mask_to_xyxy(masks=masks), mask=masks),
        x_min=x_min,
        y_min=y_min,
        x_max=x_max,
        y_max=y_max,
    )

    t2 = time.time()
    print(f"timecost: {t2-t1}")
    return result
def inference_with_box(
    image: np.ndarray,
    box: np.ndarray,
    model: torch.nn.Module,
    device: torch.device,
) -> np.ndarray:
    """Run a box-prompted forward pass and return the best-scoring mask.

    Args:
        image: Image array; converted to a tensor with torchvision's
            ``ToTensor`` and given a leading batch axis.
        box: Four box coordinates, reshaped to ``[1, 1, 2, 2]`` (the caller
            passes ``[[x_min, y_min], [x_max, y_max]]``).
        model: Module called as ``model(image, points, point_labels)`` that
            returns ``(predicted_logits, predicted_iou)``.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        The boolean mask (logits thresholded at sigmoid >= 0.5) whose
        predicted IoU is highest; ties go to the first candidate. ``None`` is
        returned only if the model yields no candidate masks.
    """
    bbox = torch.reshape(torch.tensor(box), [1, 1, 2, 2])
    # One label per box corner. NOTE(review): 2 and 3 are presumably the ids
    # the model uses for top-left / bottom-right corners — confirm upstream.
    bbox_labels = torch.reshape(torch.tensor([2, 3]), [1, 1, 2])
    img_tensor = ToTensor()(image)

    # Pure inference: disable autograd so no graph is built or kept alive.
    with torch.no_grad():
        predicted_logits, predicted_iou = model(
            img_tensor[None, ...].to(device),
            bbox.to(device),
            bbox_labels.to(device),
        )

    # Keep the candidates of the first image / first prompt only.
    candidate_logits = predicted_logits.cpu()[0, 0, :, :, :]
    all_masks = torch.ge(torch.sigmoid(candidate_logits), 0.5).numpy()
    candidate_iou = predicted_iou[0, 0, ...].cpu().numpy()

    mask_count = all_masks.shape[0]
    if mask_count == 0:
        return None

    # argmax returns the first maximum, like the strict ">" scan it replaces.
    best_index = int(np.argmax(candidate_iou[:mask_count]))
    return all_masks[best_index]
def efficient_sam_box_inference(
    image: np.ndarray, x_min: int, y_min: int, x_max: int, y_max: int
) -> np.ndarray:
    """Segment the boxed region with EfficientSAM and annotate the image.

    The box is passed to ``inference_with_box`` as its two corner points,
    ``[[x_min, y_min], [x_max, y_max]]``. The elapsed wall-clock time,
    including annotation, is printed to stdout.

    Returns:
        ``image`` with the selected mask and the box prompt drawn on it.
    """
    t1 = time.time()

    corner_points = np.array([[x_min, y_min], [x_max, y_max]])
    best_mask = inference_with_box(image, corner_points, EFFICIENT_SAM_MODEL, DEVICE)
    # Add a leading axis so the single mask forms a stack of one detection.
    masks = best_mask[None, ...]

    result = annotate_image_with_box_prompt_result(
        image=image,
        detections=sv.Detections(xyxy=sv.mask_to_xyxy(masks=masks), mask=masks),
        x_min=x_min,
        y_min=y_min,
        x_max=x_max,
        y_max=y_max,
    )

    t2 = time.time()
    print(f"timecost: {t2-t1}")
    return result
| # def sam_box_inference( | |
| # image: np.ndarray, x_min: int, y_min: int, x_max: int, y_max: int | |
| # ) -> np.ndarray: | |
| # t1 = time.time() | |
| # input_boxes = [[[x_min, y_min, x_max, y_max]]] | |
| # inputs = SAM_PROCESSOR( | |
| # Image.fromarray(image), input_boxes=[input_boxes], return_tensors="pt" | |
| # ).to(DEVICE) | |
| # with torch.no_grad(): | |
| # outputs = SAM_MODEL(**inputs) | |
| # mask = SAM_PROCESSOR.image_processor.post_process_masks( | |
| # outputs.pred_masks.cpu(), | |
| # inputs["original_sizes"].cpu(), | |
| # inputs["reshaped_input_sizes"].cpu(), | |
| # )[0][0][0].numpy() | |
| # mask = mask[np.newaxis, ...] | |
| # detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask) | |
| # result = annotate_image_with_box_prompt_result( | |
| # image=image, | |
| # detections=detections, | |
| # x_max=x_max, | |
| # x_min=x_min, | |
| # y_max=y_max, | |
| # y_min=y_min, | |
| # ) | |
| # t2 = time.time() | |
| # print(f"timecost: {t2-t1}") | |
| # return result | |
def box_inference(
    image: np.ndarray, x_min: int, y_min: int, x_max: int, y_max: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Run every enabled model on the same box prompt.

    Returns:
        The annotated EfficientViT-SAM result followed by the annotated
        EfficientSAM result.
    """
    prompt_args = (image, x_min, y_min, x_max, y_max)
    efficientvit_sam_result = efficientvit_sam_box_inference(*prompt_args)
    efficient_sam_result = efficient_sam_box_inference(*prompt_args)
    # sam_result = sam_box_inference(*prompt_args)
    return efficientvit_sam_result, efficient_sam_result
| # def clear(_: np.ndarray) -> Tuple[None, None, None]: | |
| # return None, None, None | |
def clear(_: np.ndarray) -> Tuple[None, None]:
    """Return one ``None`` per output image; the argument is ignored."""
    return (None,) * 2
# Input components are created up front (outside the layout) so they can be
# collected in ``box_inputs``; they are placed on the page later via
# ``.render()``.
box_input_image = gr.Image()
x_min_number, y_min_number, x_max_number, y_max_number = (
    gr.Number(label=coordinate_name)
    for coordinate_name in ("x_min", "y_min", "x_max", "y_max")
)

# Order matches the parameters of the inference functions:
# image, x_min, y_min, x_max, y_max.
box_inputs = [
    box_input_image,
    x_min_number,
    y_min_number,
    x_max_number,
    y_max_number,
]
with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)

    # First row: the input image followed by one output panel per model.
    with gr.Row():
        box_input_image.render()
        efficientvit_sam_box_output_image = gr.Image(label="EfficientVit-SAM")
        efficient_sam_box_output_image = gr.Image(label="EfficientSAM")
        # sam_box_output_image = gr.Image(label="SAM")

    # Output panels shared by the examples widget and the reset handler.
    box_output_images = [
        efficientvit_sam_box_output_image,
        efficient_sam_box_output_image,
        # sam_box_output_image,
    ]

    # Second row: the four box coordinates and the submit button.
    with gr.Row():
        for coordinate_input in (
            x_min_number,
            y_min_number,
            x_max_number,
            y_max_number,
        ):
            coordinate_input.render()
        submit_box_inference_button = gr.Button(
            value="Submit", scale=1, variant="primary"
        )

    gr.Examples(
        # fn=box_inference,
        examples=BOX_EXAMPLES,
        inputs=box_inputs,
        outputs=box_output_images,
    )

    # Each model gets its own click handler so that each result is written to
    # its own output panel by a separate event.
    for inference_fn, output_image in (
        (efficientvit_sam_box_inference, efficientvit_sam_box_output_image),
        (efficient_sam_box_inference, efficient_sam_box_output_image),
        # (sam_box_inference, sam_box_output_image),
    ):
        submit_box_inference_button.click(
            inference_fn,
            inputs=box_inputs,
            outputs=output_image,
        )

    # Changing the input image resets every output panel.
    box_input_image.change(
        clear,
        inputs=box_input_image,
        outputs=box_output_images,
    )

demo.launch(debug=False, show_error=True)