|
|
from PIL import Image |
|
|
import io |
|
|
import json |
|
|
|
|
|
def crop_image(image, x: float, y: float, width: float, height: float):
    """Crop the image based on the normalized coordinates.
    Return the cropped image.
    This has the effect of zooming in on the image crop.

    The box is expressed as fractions of the image size. It is clipped to the
    image, scaled to pixels, cropped, and then re-encoded as JPEG in memory;
    the image decoded from that JPEG buffer is what gets returned.

    Args:
        image (PIL.Image.Image): the input image
        x (float): the horizontal coordinate of the upper-left corner of the box
        y (float): the vertical coordinate of that corner
        width (float): the box width
        height (float): the box height

    Returns:
        cropped_img (PIL.Image.Image): the cropped image

    Raises:
        ValueError: if the box has no area inside the image.

    Example:
        image = Image.open("sample_img.jpg")
        cropped_img = crop_image(image, 0.2, 0.3, 0.5, 0.4)
        display(cropped_img)
    """
    img_w, img_h = image.size

    # Compute the far edges from the *requested* origin before clamping it.
    # Otherwise a box that starts outside the image would be shifted/enlarged
    # instead of clipped.
    left = min(max(0, x), 1)
    top = min(max(0, y), 1)
    right = min(max(0, x + width), 1)
    bottom = min(max(0, y + height), 1)

    if right <= left or bottom <= top:
        # Fail early with a clear message rather than an obscure encoder error.
        raise ValueError(
            "The crop box has no area inside the image: "
            f"x={x}, y={y}, width={width}, height={height}"
        )

    cropped_img = image.crop((left * img_w, top * img_h, right * img_w, bottom * img_h))

    # The JPEG writer rejects modes such as RGBA, LA or P (typical for PNG
    # input), so convert those to RGB first. Modes it already accepts are left
    # untouched to keep the previous output for them.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if cropped_img.mode not in jpeg_modes:
        cropped_img = cropped_img.convert("RGB")

    # Round-trip through an in-memory JPEG so the caller receives a JPEG image.
    buffer = io.BytesIO()
    cropped_img.save(buffer, format="JPEG")
    buffer.seek(0)

    jpeg_image = Image.open(buffer)
    return jpeg_image
|
|
|
|
|
|
|
|
def zoom_in_image_by_bbox(image, box, padding=0.01):
    """A simple wrapper function to crop the image based on the bounding box.
    The box is grown by `padding` on every side and then handed to `crop_image`.
    The padding cannot be too small. Minimum is 0.01.

    Args:
        image (PIL.Image.Image): the input image
        box (List[float]): the bounding box in the format of [x, y, w, h], in the
            coordinates that `crop_image` expects
        padding (float, optional): The padding for the image crop, outside of the bounding box. Defaults to 0.01.

    Returns:
        cropped_img (PIL.Image.Image): the cropped image

    Raises:
        AssertionError: if `padding` is smaller than 0.01.

    Example:
        image = Image.open("sample_img.jpg")
        annotated_img, boxes = detection(image, "bus")
        cropped_img = zoom_in_image_by_bbox(image, boxes[0], padding=0.1)
        display(cropped_img)
    """
    # Raised explicitly instead of via `assert` so the check is not stripped
    # under `python -O`. AssertionError and the message are kept so existing
    # callers see the same failure.
    if not padding >= 0.01:
        raise AssertionError("The padding should be at least 0.01")

    x, y, w, h = box
    # Move the corner out by `padding` and widen both dimensions by twice that,
    # so the margin is added on every side of the box.
    return crop_image(image, x - padding, y - padding, w + 2 * padding, h + 2 * padding)
|
|
|
|
|
|
|
|
def parse_inch_string(inch_str: str) -> float:
    """
    Parse a measurement written like '12.0 Inches' and return its number (12.0).

    Every occurrence of the text " Inches" is removed, surrounding whitespace
    is trimmed, and the remainder is converted with float().
    """
    without_unit = inch_str.replace(" Inches", "")
    number_text = without_unit.strip()
    return float(number_text)
|
|
|
|
|
def convert_pptx_bboxes_to_image_space(bbox_dict, slide_width_in, slide_height_in):
    """
    Express each PPTX bounding box (given in inches) as fractions of the slide size.

    bbox_dict format example:
    {
        'TitleAndAuthor': {
            'left': '12.0 Inches', 'top': '1.0 Inches',
            'width': '24.0 Inches', 'height': '2.0 Inches'
        },
        ...
    }

    Horizontal values ('left', 'width') are divided by slide_width_in and
    vertical values ('top', 'height') by slide_height_in.

    Returns a dictionary with the same keys, but values as [x_norm, y_norm, w_norm, h_norm].
    """
    normalized = {}
    for name, inches in bbox_dict.items():
        # Parse all four fields first, then scale them.
        left, top, box_w, box_h = (
            parse_inch_string(inches[field])
            for field in ('left', 'top', 'width', 'height')
        )
        normalized[name] = [
            left / slide_width_in,
            top / slide_height_in,
            box_w / slide_width_in,
            box_h / slide_height_in,
        ]
    return normalized
|
|
|
|
|
def convert_pptx_bboxes_json_to_image_json(bbox_json_str, slide_width_in, slide_height_in):
    """
    Convert bounding boxes (in inches) to coordinates normalized by the slide size.

    Args:
        bbox_json_str (str | bytes | bytearray | dict): JSON text of the bounding
            box dictionary, or the already-decoded dictionary itself.
            Example of the structure (in JSON):
            {
                "TitleAndAuthor": {
                    "left": "12.0 Inches",
                    "top": "1.0 Inches",
                    "width": "24.0 Inches",
                    "height": "2.0 Inches"
                },
                "Abstract-Section Title": { ... },
                ...
            }
        slide_width_in (float): The total slide width in inches.
        slide_height_in (float): The total slide height in inches.

    Returns:
        dict: A dictionary (not a JSON string) where each key maps to
        [x_norm, y_norm, w_norm, h_norm]. Horizontal values are divided by
        slide_width_in and vertical values by slide_height_in.
    """

    def _inches_to_float(inch_str: str) -> float:
        """Helper to parse '12.0 Inches' -> 12.0 (float)."""
        return float(inch_str.replace(" Inches", "").strip())

    # isinstance (rather than an exact type comparison) also recognises str
    # subclasses; bytes/bytearray are decoded too since json.loads accepts
    # them. Anything else is used as the decoded mapping.
    if isinstance(bbox_json_str, (str, bytes, bytearray)):
        bbox_dict = json.loads(bbox_json_str)
    else:
        bbox_dict = bbox_json_str

    normalized_bboxes = {}
    for label, box in bbox_dict.items():
        left_in, top_in, width_in, height_in = (
            _inches_to_float(box[key])
            for key in ("left", "top", "width", "height")
        )
        normalized_bboxes[label] = [
            left_in / slide_width_in,
            top_in / slide_height_in,
            width_in / slide_width_in,
            height_in / slide_height_in,
        ]

    return normalized_bboxes
|
|
|
|
|
|