Update code snippet (#6)
- Update code snippet (8de90a4fd688503eacbb4f2a7d4cdd4f9c06324d)
- Update README.md (27d9485a27e88c5f0435d0ed8382dcb3b911ab46)
Co-authored-by: Niels Rogge <nielsr@users.noreply.huggingface.co>
    	
README.md CHANGED

```diff
@@ -34,12 +34,11 @@ The model uses a CLIP backbone with a ViT-B/16 Transformer architecture as an im
 ```python
 import requests
 from PIL import Image
-import numpy as np
 import torch
-from transformers import AutoProcessor, Owlv2ForObjectDetection
-from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
 
-processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
+from transformers import Owlv2Processor, Owlv2ForObjectDetection
+
+processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
 model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
 
 url = "http://images.cocodataset.org/val2017/000000039769.jpg"
@@ -47,33 +46,16 @@ image = Image.open(requests.get(url, stream=True).raw)
 texts = [["a photo of a cat", "a photo of a dog"]]
 inputs = processor(text=texts, images=image, return_tensors="pt")
 
-# forward pass
 with torch.no_grad():
-    outputs = model(**inputs)
-
-# Note: boxes need to be visualized on the padded, unnormalized image
-# hence we'll set the target image sizes (height, width) based on that
-
-def get_preprocessed_image(pixel_values):
-    pixel_values = pixel_values.squeeze().numpy()
-    unnormalized_image = (pixel_values * np.array(OPENAI_CLIP_STD)[:, None, None]) + np.array(OPENAI_CLIP_MEAN)[:, None, None]
-    unnormalized_image = (unnormalized_image * 255).astype(np.uint8)
-    unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)
-    unnormalized_image = Image.fromarray(unnormalized_image)
-    return unnormalized_image
-
-unnormalized_image = get_preprocessed_image(inputs.pixel_values)
-
-target_sizes = torch.Tensor([unnormalized_image.size[::-1]])
-# Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
-results = processor.post_process_object_detection(
-    outputs=outputs, threshold=0.2, target_sizes=target_sizes
-)
+  outputs = model(**inputs)
 
+# Target image sizes (height, width) to rescale box predictions [batch_size, 2]
+target_sizes = torch.Tensor([image.size[::-1]])
+# Convert outputs (bounding boxes and class logits) to Pascal VOC Format (xmin, ymin, xmax, ymax)
+results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)
 i = 0  # Retrieve predictions for the first image for the corresponding text queries
 text = texts[i]
 boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
-
 for box, score, label in zip(boxes, scores, labels):
     box = [round(i, 2) for i in box.tolist()]
     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
```

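For reference, the snippet as it reads after this commit, assembled from the unchanged and added lines of the diff above (the `image = Image.open(...)` line is taken from the second hunk's context):

```python
import requests
from PIL import Image
import torch

from transformers import Owlv2Processor, Owlv2ForObjectDetection

processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
texts = [["a photo of a cat", "a photo of a dog"]]
inputs = processor(text=texts, images=image, return_tensors="pt")

with torch.no_grad():
  outputs = model(**inputs)

# Target image sizes (height, width) to rescale box predictions [batch_size, 2]
target_sizes = torch.Tensor([image.size[::-1]])
# Convert outputs (bounding boxes and class logits) to Pascal VOC Format (xmin, ymin, xmax, ymax)
results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)
i = 0  # Retrieve predictions for the first image for the corresponding text queries
text = texts[i]
boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
for box, score, label in zip(boxes, scores, labels):
    box = [round(i, 2) for i in box.tolist()]
    print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
```

Compared with the previous version, the manual `get_preprocessed_image` unnormalization helper is gone, `target_sizes` is computed directly from the original `image`, and the detection threshold changes from 0.2 to 0.1.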
 
		
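Not part of this commit: to view the detections rather than only print them, a minimal Pillow sketch along these lines should work, assuming the `image`, `boxes`, `scores`, `labels`, and `text` variables from the snippet above (the output filename is arbitrary):

```python
from PIL import ImageDraw

# Sketch only: overlay each predicted box and label on a copy of the original image.
# Boxes are already rescaled to the original image via target_sizes above.
annotated = image.copy()
draw = ImageDraw.Draw(annotated)
for box, score, label in zip(boxes, scores, labels):
    xmin, ymin, xmax, ymax = box.tolist()
    draw.rectangle((xmin, ymin, xmax, ymax), outline="red", width=3)
    draw.text((xmin, ymin), f"{text[label]}: {score.item():.2f}", fill="red")
annotated.save("owlv2_detections.png")  # arbitrary filename
```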