AutoPage / utils /critic_utils.py
Mqleet's picture
upd code
fcaa164
raw
history blame
5.45 kB
from PIL import Image
import io
import json
def crop_image(image, x: float, y: float, width: float, height: float):
    """Crop the image based on normalized coordinates and return the crop.

    This has the effect of zooming in on the selected region. The requested
    box is intersected with the image, so any part of the box lying outside
    the [0, 1] range is dropped rather than shifted back into the image.

    The crop is round-tripped through an in-memory JPEG, so the returned
    object is a JPEG-decoded image. Crops whose mode JPEG cannot store
    (for example RGBA or palette images) are converted to RGB first, which
    discards any alpha channel.

    Args:
        image (PIL.Image.Image): the input image
        x (float): the horizontal coordinate of the upper-left corner of the box
        y (float): the vertical coordinate of that corner
        width (float): the box width
        height (float): the box height

    Returns:
        cropped_img (PIL.Image.Image): the cropped image

    Example:
        image = Image.open("sample_img.jpg")
        cropped_img = crop_image(image, 0.2, 0.3, 0.5, 0.4)
        display(cropped_img)
    """
    # Pixel dimensions used to scale the normalized box.
    img_w, img_h = image.size

    # Clamp each edge independently. The far edges are derived from the
    # *unclamped* origin: clamping x/y first would push the right/bottom edge
    # outward whenever the box starts before 0.
    left = min(max(0, x), 1)
    top = min(max(0, y), 1)
    right = min(max(0, x + width), 1)
    bottom = min(max(0, y + height), 1)

    cropped_img = image.crop((left * img_w, top * img_h, right * img_w, bottom * img_h))

    # The JPEG encoder only accepts these modes; anything else (RGBA, LA, P,
    # I, F, ...) raises "cannot write mode ... as JPEG".
    if cropped_img.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        cropped_img = cropped_img.convert("RGB")

    buffer = io.BytesIO()
    cropped_img.save(buffer, format="JPEG")
    buffer.seek(0)  # Rewind so the decoder reads from the start

    # Re-open from the buffer so the result is a JpegImageFile.
    return Image.open(buffer)
def zoom_in_image_by_bbox(image, box, padding=0.01):
    """Crop the image to a bounding box that is enlarged by a margin.

    A thin wrapper around ``crop_image``: the box is grown by ``padding`` on
    every side and the result is handed to ``crop_image`` unchanged.

    Args:
        image (PIL.Image.Image): the input image
        box (List[float]): the bounding box in the format of [x, y, w, h]
        padding (float, optional): The margin added around the bounding box
            on each side. Must be at least 0.01. Defaults to 0.01.

    Returns:
        cropped_img (PIL.Image.Image): the cropped image

    Example:
        image = Image.open("sample_img.jpg")
        annotated_img, boxes = detection(image, "bus")
        cropped_img = zoom_in_image_by_bbox(image, boxes[0], padding=0.1)
        display(cropped_img)
    """
    assert padding >= 0.01, "The padding should be at least 0.01"
    left, top, box_w, box_h = box
    # Move the origin out by one padding and grow each extent by two, so the
    # margin is applied on both sides of the box.
    return crop_image(
        image,
        left - padding,
        top - padding,
        box_w + 2 * padding,
        box_h + 2 * padding,
    )
def parse_inch_string(inch_str: str) -> float:
    """Return the numeric value of a measurement string.

    For example, '12.0 Inches' becomes 12.0.
    """
    # Remove the unit text, then let float() parse what is left.
    numeric_text = inch_str.replace(" Inches", "")
    return float(numeric_text.strip())
def convert_pptx_bboxes_to_image_space(bbox_dict, slide_width_in, slide_height_in):
    """
    Scale PPTX bounding boxes, given in inches, by the slide dimensions.

    Each entry of bbox_dict maps a label to a dict of inch strings, e.g.:
        {
            'TitleAndAuthor': {
                'left': '12.0 Inches', 'top': '1.0 Inches',
                'width': '24.0 Inches', 'height': '2.0 Inches'
            },
            ...
        }

    Horizontal values are divided by slide_width_in and vertical values by
    slide_height_in.

    Returns a dictionary with the same keys, but values as
    [x_norm, y_norm, w_norm, h_norm].
    """
    fields = ('left', 'top', 'width', 'height')
    normalized = {}
    for name, dims in bbox_dict.items():
        # Parse all four measurements before doing any arithmetic.
        left, top, width, height = (parse_inch_string(dims[key]) for key in fields)
        normalized[name] = [
            left / slide_width_in,
            top / slide_height_in,
            width / slide_width_in,
            height / slide_height_in,
        ]
    return normalized
def convert_pptx_bboxes_json_to_image_json(bbox_json_str, slide_width_in, slide_height_in):
    """
    Convert bounding boxes (in inches) to normalized image coords.

    Args:
        bbox_json_str (str | bytes | bytearray | dict): The bounding boxes,
            either as JSON text or as an already-parsed dictionary.
            Example of the structure (in JSON):
            {
                "TitleAndAuthor": {
                    "left": "12.0 Inches",
                    "top": "1.0 Inches",
                    "width": "24.0 Inches",
                    "height": "2.0 Inches"
                },
                "Abstract-Section Title": { ... },
                ...
            }
        slide_width_in (float): The total slide width in inches.
        slide_height_in (float): The total slide height in inches.

    Returns:
        dict: A dictionary where each key maps to
            [x_norm, y_norm, w_norm, h_norm]. Despite the function name the
            result is NOT serialized; call json.dumps on it if JSON text is
            needed.
    """
    def _to_inches(inch_str: str) -> float:
        """Helper to parse '12.0 Inches' -> 12.0 (float)."""
        return float(inch_str.replace(" Inches", "").strip())

    # 1) Decode JSON text if that is what we were given. isinstance (rather
    #    than an exact type comparison) also accepts str subclasses, and
    #    json.loads handles bytes/bytearray payloads directly.
    if isinstance(bbox_json_str, (str, bytes, bytearray)):
        bbox_dict = json.loads(bbox_json_str)
    else:
        bbox_dict = bbox_json_str

    # 2) Convert each bounding box to normalized coordinates [x, y, w, h]
    normalized_bboxes = {}
    for label, box in bbox_dict.items():
        left_in = _to_inches(box['left'])
        top_in = _to_inches(box['top'])
        width_in = _to_inches(box['width'])
        height_in = _to_inches(box['height'])
        normalized_bboxes[label] = [
            left_in / slide_width_in,
            top_in / slide_height_in,
            width_in / slide_width_in,
            height_in / slide_height_in,
        ]

    # 3) Return the dictionary (not a JSON string)
    return normalized_bboxes