File size: 5,448 Bytes
fcaa164 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
from PIL import Image
import io
import json
def crop_image(image, x: float, y: float, width: float, height: float):
    """Crop a region given in normalized coordinates and return it as a JPEG image.

    The box is expressed as fractions of the image size and is clipped to
    the [0, 1] range on both axes. This has the effect of zooming in on the
    selected region.

    Args:
        image (PIL.Image.Image): the input image
        x (float): the horizontal coordinate of the upper-left corner of the box
        y (float): the vertical coordinate of that corner
        width (float): the box width
        height (float): the box height

    Returns:
        PIL.Image.Image: the cropped region, encoded as JPEG into an
        in-memory buffer and reopened from that buffer.

    Example:
        image = Image.open("sample_img.jpg")
        cropped_img = crop_image(image, 0.2, 0.3, 0.5, 0.4)
        display(cropped_img)
    """
    img_w, img_h = image.size

    # Compute the far edges from the *requested* origin before clamping it.
    # Clamping first would push the right/bottom edge outward whenever the
    # box starts outside the image (e.g. x = -0.01 after padding).
    x2 = min(max(0, x + width), 1)
    y2 = min(max(0, y + height), 1)
    x = min(max(0, x), 1)
    y = min(max(0, y), 1)

    cropped_img = image.crop((x * img_w, y * img_h, x2 * img_w, y2 * img_h))

    # Pillow's JPEG writer only accepts a handful of modes; anything else
    # (RGBA, LA, P, ...) raises OSError on save. Convert those to RGB, which
    # discards any alpha channel.
    if cropped_img.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        cropped_img = cropped_img.convert("RGB")

    buffer = io.BytesIO()
    cropped_img.save(buffer, format="JPEG")
    buffer.seek(0)  # rewind so Image.open reads from the start

    # Reopening from the buffer yields a JPEG-backed image object.
    return Image.open(buffer)
def zoom_in_image_by_bbox(image, box, padding=0.01):
    """Crop the image to a bounding box enlarged by `padding` on every side.

    A thin wrapper around crop_image: the box is grown by `padding` in each
    direction and the result is passed to crop_image unchanged.

    Args:
        image (PIL.Image.Image): the input image
        box (List[float]): the bounding box in the format of [x, y, w, h]
        padding (float, optional): The padding for the image crop, outside of
            the bounding box. Must be at least 0.01. Defaults to 0.01.

    Returns:
        cropped_img (PIL.Image.Image): the cropped image

    Raises:
        AssertionError: if `padding` is not at least 0.01.

    Example:
        image = Image.open("sample_img.jpg")
        annotated_img, boxes = detection(image, "bus")
        cropped_img = zoom_in_image_by_bbox(image, boxes[0], padding=0.1)
        display(cropped_img)
    """
    # Raised explicitly (rather than via `assert`) so the check still runs
    # under `python -O`; the exception type and message are unchanged.
    # `not >=` (instead of `<`) keeps NaN rejected as before.
    if not padding >= 0.01:
        raise AssertionError("The padding should be at least 0.01")

    x, y, w, h = box
    # Shift the origin up/left by the padding and widen by twice the padding
    # so the margin is applied on all four sides.
    return crop_image(image, x - padding, y - padding, w + 2 * padding, h + 2 * padding)
def parse_inch_string(inch_str: str) -> float:
    """Return the numeric value of an inch measurement such as '12.0 Inches'.

    Every occurrence of the literal ' Inches' is removed, surrounding
    whitespace is trimmed, and the remainder is handed to float().
    """
    unit_suffix = " Inches"
    number_text = "".join(inch_str.split(unit_suffix)).strip()
    return float(number_text)
def convert_pptx_bboxes_to_image_space(bbox_dict, slide_width_in, slide_height_in):
    """
    Normalize PPTX bounding boxes (given in inches) by the slide dimensions.

    bbox_dict format example:
        {
            'TitleAndAuthor': {
                'left': '12.0 Inches', 'top': '1.0 Inches',
                'width': '24.0 Inches', 'height': '2.0 Inches'
            },
            ...
        }

    Horizontal values ('left', 'width') are divided by slide_width_in and
    vertical values ('top', 'height') by slide_height_in.

    Returns a dictionary with the same keys, but values as
    [x_norm, y_norm, w_norm, h_norm].
    """
    def _normalize(box):
        # Parse all four fields first (left, top, width, height), then scale.
        left, top, width, height = (
            parse_inch_string(box[field])
            for field in ('left', 'top', 'width', 'height')
        )
        return [
            left / slide_width_in,
            top / slide_height_in,
            width / slide_width_in,
            height / slide_height_in,
        ]

    return {label: _normalize(box) for label, box in bbox_dict.items()}
def convert_pptx_bboxes_json_to_image_json(bbox_json_str, slide_width_in, slide_height_in):
    """
    Convert bounding boxes (in inches) to coordinates normalized by the slide size.

    Args:
        bbox_json_str (str | bytes | bytearray | dict): JSON text of the
            bounding box dictionary, or the already-parsed dictionary.
            Example of the structure (in JSON):
            {
                "TitleAndAuthor": {
                    "left": "12.0 Inches",
                    "top": "1.0 Inches",
                    "width": "24.0 Inches",
                    "height": "2.0 Inches"
                },
                "Abstract-Section Title": { ... },
                ...
            }
        slide_width_in (float): The total slide width in inches.
        slide_height_in (float): The total slide height in inches.

    Returns:
        dict: each key of the input mapped to [x_norm, y_norm, w_norm, h_norm].
        Despite the function's name, the result is a Python dict, not a
        serialized JSON string.
    """
    def parse_inch_string(inch_str: str) -> float:
        """Helper to parse '12.0 Inches' -> 12.0 (float)."""
        return float(inch_str.replace(" Inches", "").strip())

    # 1) Decode JSON text if that is what we were given; anything else is
    #    treated as an already-parsed mapping. isinstance() also covers str
    #    subclasses, and json.loads accepts bytes/bytearray as well as str.
    if isinstance(bbox_json_str, (str, bytes, bytearray)):
        bbox_dict = json.loads(bbox_json_str)
    else:
        bbox_dict = bbox_json_str

    # 2) Convert each bounding box to normalized coordinates [x, y, w, h]
    normalized_bboxes = {}
    for label, box in bbox_dict.items():
        left_in = parse_inch_string(box['left'])
        top_in = parse_inch_string(box['top'])
        width_in = parse_inch_string(box['width'])
        height_in = parse_inch_string(box['height'])

        normalized_bboxes[label] = [
            left_in / slide_width_in,
            top_in / slide_height_in,
            width_in / slide_width_in,
            height_in / slide_height_in,
        ]

    # 3) Return the normalized boxes as a dict
    return normalized_bboxes
|