File size: 5,448 Bytes
fcaa164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
from PIL import Image
import io
import json

def crop_image(image, x:float, y:float, width:float, height:float):
    """Crop the image based on the normalized coordinates.
    Return the cropped image.
    This has the effect of zooming in on the image crop.

    All four box values are fractions of the image size (0 = left/top edge,
    1 = right/bottom edge). The box is clamped to the image, so a box that
    sticks out past an edge is silently trimmed to the visible part.

    The crop is round-tripped through an in-memory JPEG, so the returned
    image is a JPEG-decoded copy (lossy), not a view of the input. Images in
    a mode JPEG cannot store (e.g. RGBA, LA, P) are converted to RGB first,
    which drops any alpha channel.

    Args:
        image (PIL.Image.Image): the input image
        x (float): the horizontal coordinate of the upper-left corner of the box
        y (float): the vertical coordinate of that corner
        width (float): the box width
        height (float): the box height

    Returns:
        cropped_img (PIL.Image.Image): the cropped image, re-opened from a
        JPEG buffer

    Raises:
        ValueError: if the box has no area once clamped to the image
            (non-positive width/height, or a box entirely outside the image).

    Example:
        image = Image.open("sample_img.jpg")
        cropped_img = crop_image(image, 0.2, 0.3, 0.5, 0.4)
        display(cropped_img)
    """

    # get width and height of image in pixels
    img_w, img_h = image.size

    # clamp both corners of the box to the [0, 1] range
    x = min(max(0, x), 1)
    y = min(max(0, y), 1)
    x2 = min(max(0, x + width), 1)
    y2 = min(max(0, y + height), 1)

    # An empty box would otherwise only fail later, inside Pillow, with an
    # error that does not mention the crop coordinates.
    if x2 <= x or y2 <= y:
        raise ValueError(
            f"Crop box is empty after clamping to the image: "
            f"x=[{x}, {x2}], y=[{y}, {y2}]"
        )

    cropped_img = image.crop((x * img_w, y * img_h, x2 * img_w, y2 * img_h))

    # The JPEG writer rejects modes with alpha or a palette (RGBA, LA, P, ...);
    # convert those so PNG/GIF inputs do not raise OSError on save.
    if cropped_img.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        cropped_img = cropped_img.convert("RGB")

    buffer = io.BytesIO()
    cropped_img.save(buffer, format="JPEG")
    buffer.seek(0)  # Reset buffer position so Image.open reads from the start

    # Load as a JpegImageFile
    jpeg_image = Image.open(buffer)
    return jpeg_image


def zoom_in_image_by_bbox(image, box, padding=0.01):
    """A simple wrapper function to crop the image based on the bounding box.

    The box is enlarged by ``padding`` on every side and then handed to
    ``crop_image``, so ``box`` and ``padding`` use the same units as
    ``crop_image``'s coordinates.

    Args:
        image (PIL.Image.Image): the input image
        box (List[float]): the bounding box in the format of [x, y, w, h]
        padding (float, optional): The padding for the image crop, outside of the bounding box.
            Must be at least 0.01. Defaults to 0.01.

    Returns:
        cropped_img (PIL.Image.Image): the cropped image

    Raises:
        ValueError: if ``padding`` is smaller than 0.01.

    Example:
        image = Image.open("sample_img.jpg")
        annotated_img, boxes = detection(image, "bus")
        cropped_img = zoom_in_image_by_bbox(image, boxes[0], padding=0.1)
        display(cropped_img)
    """
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    # Written as `not >=` so that NaN is rejected as well.
    if not padding >= 0.01:
        raise ValueError("The padding should be at least 0.01")

    x, y, w, h = box
    # Shift the corner out by one padding and grow each side by two, so the
    # original box stays centred inside the enlarged one.
    return crop_image(image, x - padding, y - padding, w + 2 * padding, h + 2 * padding)


def parse_inch_string(inch_str: str) -> float:
    """
    Turn a measurement string such as '12.0 Inches' into its numeric value (12.0).

    Every occurrence of the literal ' Inches' is removed, surrounding
    whitespace is trimmed, and what remains is parsed with float().
    """
    unit_suffix = " Inches"
    numeric_text = inch_str.replace(unit_suffix, "")
    return float(numeric_text.strip())

def convert_pptx_bboxes_to_image_space(bbox_dict, slide_width_in, slide_height_in):
    """
    Express each PPTX bounding box (given in inches) as fractions of the slide size.

    Expected shape of bbox_dict:
    {
      'TitleAndAuthor': {
         'left': '12.0 Inches', 'top': '1.0 Inches',
         'width': '24.0 Inches', 'height': '2.0 Inches'
      },
      ...
    }

    Horizontal values (left, width) are divided by slide_width_in and vertical
    values (top, height) by slide_height_in.

    Returns a dictionary with the same keys, but values as [x_norm, y_norm, w_norm, h_norm].
    """
    normalized = {}
    for name, spec in bbox_dict.items():
        # Parse all four fields first (left, top, width, height), then scale.
        left, top, box_w, box_h = (
            parse_inch_string(spec[field])
            for field in ('left', 'top', 'width', 'height')
        )
        normalized[name] = [
            left / slide_width_in,
            top / slide_height_in,
            box_w / slide_width_in,
            box_h / slide_height_in,
        ]
    return normalized

def convert_pptx_bboxes_json_to_image_json(bbox_json_str, slide_width_in, slide_height_in):
    """
    Convert bounding boxes (in inches) to coordinates normalized by the slide size.

    Args:
        bbox_json_str (str | bytes | bytearray | dict): JSON text of the bounding
                             box dictionary, or the already-parsed dictionary
                             (which is used as-is).
                             Example of the structure (in JSON):
                             {
                                 "TitleAndAuthor": {
                                    "left": "12.0 Inches",
                                    "top": "1.0 Inches",
                                    "width": "24.0 Inches",
                                    "height": "2.0 Inches"
                                 },
                                 "Abstract-Section Title": { ... },
                                 ...
                             }
        slide_width_in (float): The total slide width in inches.
        slide_height_in (float): The total slide height in inches.

    Returns:
        dict: A dictionary where each key maps to [x_norm, y_norm, w_norm, h_norm].
              Despite the function name, the result is a Python dict, not a
              JSON string; pass it to json.dumps() if text is needed.
    """

    def parse_inch_string(inch_str: str) -> float:
        """Helper to parse '12.0 Inches' -> 12.0 (float)."""
        return float(inch_str.replace(" Inches", "").strip())

    # 1) Parse incoming JSON text to a Python dict; anything else is assumed
    #    to already be the parsed mapping. isinstance() also covers str
    #    subclasses, and json.loads accepts bytes/bytearray as well as str.
    if isinstance(bbox_json_str, (str, bytes, bytearray)):
        bbox_dict = json.loads(bbox_json_str)
    else:
        bbox_dict = bbox_json_str

    # 2) Convert each bounding box to normalized coordinates [x, y, w, h]:
    #    horizontal values scale by slide width, vertical ones by slide height.
    normalized_bboxes = {}
    for label, box in bbox_dict.items():
        left_in   = parse_inch_string(box['left'])
        top_in    = parse_inch_string(box['top'])
        width_in  = parse_inch_string(box['width'])
        height_in = parse_inch_string(box['height'])

        x_norm = left_in / slide_width_in
        y_norm = top_in  / slide_height_in
        w_norm = width_in  / slide_width_in
        h_norm = height_in / slide_height_in

        normalized_bboxes[label] = [x_norm, y_norm, w_norm, h_norm]

    # 3) Return the dict of normalized boxes (not serialized)
    return normalized_bboxes