Spaces:

Mqleet
/

AutoPage

Running

File size: 5,448 Bytes

fcaa164

from PIL import Image
import io
import json

def crop_image(image, x:float, y:float, width:float, height:float):
    """Crop the image to a box given in normalized coordinates.

    The box corners are clamped to the [0, 1] range, scaled by the image
    size, and the resulting region is cropped. The crop is then encoded as
    JPEG into an in-memory buffer and reopened, so the returned object is a
    JPEG-backed image (the re-encoding is lossy).

    Args:
        image (PIL.Image.Image): the input image
        x (float): normalized horizontal coordinate of the box's upper-left corner
        y (float): normalized vertical coordinate of that corner
        width (float): normalized box width
        height (float): normalized box height

    Returns:
        PIL.Image.Image: the cropped image, reopened from an in-memory JPEG

    Example:
        image = Image.open("sample_img.jpg")
        cropped_img = crop_image(image, 0.2, 0.3, 0.5, 0.4)
        display(cropped_img)
    """
    # Pixel dimensions used to scale the normalized box.
    img_w, img_h = image.size

    # Clamp both corners into [0, 1] so the box never leaves the image.
    left = min(max(0, x), 1)
    top = min(max(0, y), 1)
    right = min(max(0, x + width), 1)
    bottom = min(max(0, y + height), 1)

    cropped_img = image.crop((left * img_w, top * img_h, right * img_w, bottom * img_h))

    # The JPEG encoder rejects modes such as RGBA, LA and P (typical for
    # PNG inputs) with an OSError, so convert those to RGB first. Modes the
    # encoder already accepts are left untouched.
    if cropped_img.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        cropped_img = cropped_img.convert("RGB")

    buffer = io.BytesIO()
    cropped_img.save(buffer, format="JPEG")
    buffer.seek(0)  # Rewind so Image.open reads from the start

    # Reopen from the buffer so the result is a JPEG-backed image.
    return Image.open(buffer)


def zoom_in_image_by_bbox(image, box, padding=0.01):
    """Crop the image around a bounding box, enlarged by a margin on every side.

    The box is grown by `padding` in each direction (so its width and height
    each increase by twice the padding) and the result is handed to
    `crop_image`.

    Args:
        image (PIL.Image.Image): the input image
        box (List[float]): the bounding box in the format of [x, y, w, h]
        padding (float, optional): margin added around the bounding box.
            Must be at least 0.01. Defaults to 0.01.

    Returns:
        cropped_img (PIL.Image.Image): the cropped image

    Example:
        image = Image.open("sample_img.jpg")
        cropped_img = zoom_in_image_by_bbox(image, [0.2, 0.3, 0.5, 0.4], padding=0.1)
        display(cropped_img)
    """
    assert padding >= 0.01, "The padding should be at least 0.01"

    box_x, box_y, box_w, box_h = box

    # Shift the corner outwards and widen the box by the margin on both sides.
    padded_x = box_x - padding
    padded_y = box_y - padding
    padded_w = box_w + 2 * padding
    padded_h = box_h + 2 * padding

    return crop_image(image, padded_x, padded_y, padded_w, padded_h)


def parse_inch_string(inch_str: str) -> float:
    """Parse a measurement such as '12.0 Inches' into its numeric value.

    Every occurrence of the literal suffix ' Inches' is removed, surrounding
    whitespace is stripped, and the remainder is converted with `float`.

    Args:
        inch_str (str): the measurement text, e.g. '12.0 Inches'

    Returns:
        float: the numeric part of the measurement, e.g. 12.0
    """
    without_unit = inch_str.replace(" Inches", "")
    numeric_text = without_unit.strip()
    return float(numeric_text)

def convert_pptx_bboxes_to_image_space(bbox_dict, slide_width_in, slide_height_in):
    """
    Normalize PPTX bounding boxes (given in inches) by the slide dimensions.

    Horizontal values ('left', 'width') are divided by the slide width and
    vertical values ('top', 'height') by the slide height.

    bbox_dict format example:
    {
      'TitleAndAuthor': {
         'left': '12.0 Inches', 'top': '1.0 Inches',
         'width': '24.0 Inches', 'height': '2.0 Inches'
      },
      ...
    }

    Args:
        bbox_dict (dict): maps each label to a box with 'left', 'top',
            'width' and 'height' strings such as '12.0 Inches'.
        slide_width_in (float): the total slide width in inches.
        slide_height_in (float): the total slide height in inches.

    Returns:
        dict: the same keys, each mapped to [x_norm, y_norm, w_norm, h_norm].
    """
    fields = ('left', 'top', 'width', 'height')
    normalized = {}
    for name, raw_box in bbox_dict.items():
        # Parse all four measurements first, in a fixed order, then scale.
        left, top, width, height = (parse_inch_string(raw_box[key]) for key in fields)
        normalized[name] = [
            left / slide_width_in,
            top / slide_height_in,
            width / slide_width_in,
            height / slide_height_in,
        ]
    return normalized

def convert_pptx_bboxes_json_to_image_json(bbox_json_str, slide_width_in, slide_height_in):
    """
    Convert bounding boxes (in inches) to normalized image coordinates.

    Horizontal values ('left', 'width') are divided by the slide width and
    vertical values ('top', 'height') by the slide height.

    Args:
        bbox_json_str (str | bytes | bytearray | dict): JSON text of the
            bounding box dictionary, or an already-parsed dictionary (which
            is used as-is). Example of the structure (in JSON):
                {
                    "TitleAndAuthor": {
                        "left": "12.0 Inches",
                        "top": "1.0 Inches",
                        "width": "24.0 Inches",
                        "height": "2.0 Inches"
                    },
                    "Abstract-Section Title": { ... },
                    ...
                }
        slide_width_in (float): The total slide width in inches.
        slide_height_in (float): The total slide height in inches.

    Returns:
        dict: Each input key mapped to [x_norm, y_norm, w_norm, h_norm].
            Despite the function name, the result is a Python dict, not a
            JSON string; pass it to `json.dumps` if text is required.
    """

    def _inches_to_float(inch_str: str) -> float:
        """Helper to parse '12.0 Inches' -> 12.0 (float)."""
        return float(inch_str.replace(" Inches", "").strip())

    # 1) Decode JSON text when given text; anything else is assumed to be an
    #    already-parsed mapping. isinstance also covers str subclasses, and
    #    bytes/bytearray are accepted because json.loads handles them.
    if isinstance(bbox_json_str, (str, bytes, bytearray)):
        bbox_dict = json.loads(bbox_json_str)
    else:
        bbox_dict = bbox_json_str

    # 2) Convert each bounding box to normalized coordinates [x, y, w, h]
    normalized_bboxes = {}
    for label, box in bbox_dict.items():
        left_in = _inches_to_float(box['left'])
        top_in = _inches_to_float(box['top'])
        width_in = _inches_to_float(box['width'])
        height_in = _inches_to_float(box['height'])

        normalized_bboxes[label] = [
            left_in / slide_width_in,
            top_in / slide_height_in,
            width_in / slide_width_in,
            height_in / slide_height_in,
        ]

    # 3) Return the dict of normalized boxes (callers receive a dict)
    return normalized_bboxes