Commit ec06acb
ivelin committed
1 Parent(s): 12b6ec4

fix: image preprocessing

Signed-off-by: ivelin <ivelin.eth@gmail.com>

- .gitignore +1 -0
- Inference_Playground_Donut_UI_RefExp_Gradio.ipynb +0 -0
- app.py +48 -15
.gitignore
ADDED
@@ -0,0 +1 @@
+__pycache__/
Inference_Playground_Donut_UI_RefExp_Gradio.ipynb
ADDED
The diff for this file is too large to render. See raw diff.
app.py
CHANGED
@@ -12,34 +12,64 @@ pretrained_revision = 'main'
 # use 'main' for latest revision
 print(f"Loading model checkpoint: {pretrained_repo_name}")
 
-processor = DonutProcessor.from_pretrained(
-
+processor = DonutProcessor.from_pretrained(
+    pretrained_repo_name, revision=pretrained_revision, use_auth_token="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")
+processor.image_processor.do_align_long_axis = False
+# do not manipulate image size and position
+processor.image_processor.do_resize = False
+processor.image_processor.do_thumbnail = False
+processor.image_processor.do_pad = False
+processor.image_processor.do_rescale = False
+
+print(f'processor image size: {processor.image_processor.size}')
+
+model = VisionEncoderDecoderModel.from_pretrained(
+    pretrained_repo_name, use_auth_token="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", revision=pretrained_revision)
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
 
+
+def prepare_image_for_encoder(image=None, output_image_size=None):
+    """
+    First, resizes the input image to fill as much as possible of the output image size
+    while preserving aspect ratio. Positions the resized image at (0,0) and fills
+    the rest of the gap space in the output image with black(0).
+    Args:
+        image: PIL image
+        output_image_size: (width, height) tuple
+    """
+    assert image is not None
+    assert output_image_size is not None
+    image.thumbnail(output_image_size)
+    oimg = Image.new(mode=image.mode, size=output_image_size, color=0)
+    oimg.paste(image, box=(0, 0))
+    return oimg
+
+
 def translate_point_coords_from_out_to_in(point=None, input_image_size=None, output_image_size=None):
     """
     Convert relative prediction coordinates from resized encoder tensor image
     to original input image size.
     Args:
-        original_point: x, y coordinates of the point coordinates in [0..1] range in the original image
+        original_point: x, y coordinates of the point coordinates in [0..1] range in the original image
         input_image_size: (width, height) tuple
         output_image_size: (width, height) tuple
-    """
+    """
     assert point is not None
     assert input_image_size is not None
     assert output_image_size is not None
-
+    print(
+        f"point={point}, input_image_size={input_image_size}, output_image_size={output_image_size}")
     input_width, input_height = input_image_size
     output_width, output_height = output_image_size
-
+
     ratio = min(output_width/input_width, output_height/input_height)
-
+
     resized_height = int(input_height*ratio)
-    # print(f'>>> resized_height={resized_height}')
     resized_width = int(input_width*ratio)
-
+    print(f'>>> resized_width={resized_width}')
+    print(f'>>> resized_height={resized_height}')
 
     if resized_height == input_height and resized_width == input_width:
         return
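Note: this hunk disables every size manipulation inside DonutProcessor and letterboxes the input in the app itself: prepare_image_for_encoder shrinks the image to fit the encoder canvas while preserving aspect ratio, anchors it at (0, 0), and leaves the right/bottom margin black. Below is a minimal standalone sketch of that letterboxing step; the letterbox helper name and the 1920x1080 source and 960x1280 canvas sizes are illustrative assumptions, with the real canvas coming from processor.image_processor.size.

    from PIL import Image

    def letterbox(image, output_image_size):
        # Shrink in place, preserving aspect ratio, so the image fits the canvas.
        image.thumbnail(output_image_size)
        # Paste onto a black canvas anchored at the top-left corner; the
        # right/bottom margin stays black, mirroring prepare_image_for_encoder.
        canvas = Image.new(mode=image.mode, size=output_image_size, color=0)
        canvas.paste(image, box=(0, 0))
        return canvas

    screenshot = Image.new(mode="RGB", size=(1920, 1080))  # hypothetical input
    print(letterbox(screenshot, (960, 1280)).size)  # (960, 1280); content fills the top 960x540 band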
@@ -51,8 +81,9 @@ def translate_point_coords_from_out_to_in(point=None, input_image_size=None, out
     if resized_height < output_height:
         # adjust for padding pixels
         point['y'] *= (output_height / resized_height)
-
-
+    print(
+        f"translated point={point}, resized_image_size: {resized_width, resized_height}")
+
 
 def process_refexp(image: Image, prompt: str):
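Note: the print added in this hunk traces the padding correction, which rescales a predicted y so it is relative to the visible image content rather than to the full padded canvas. A sketch of just the y branch, under the same hypothetical sizes as above (translate_y is an illustrative helper, not part of app.py):

    def translate_y(y, input_image_size, output_image_size):
        # Same math as the y branch of translate_point_coords_from_out_to_in.
        input_width, input_height = input_image_size
        output_width, output_height = output_image_size
        ratio = min(output_width / input_width, output_height / input_height)
        resized_height = int(input_height * ratio)
        if resized_height < output_height:
            # Stretch past the black padding rows at the bottom of the canvas.
            y *= output_height / resized_height
        return y

    # ratio = 0.5 and resized height = 540, so y = 0.2 on the 960x1280 canvas
    # maps to 0.2 * 1280 / 540 ~= 0.474 of the visible content.
    print(translate_y(0.2, (1920, 1080), (960, 1280)))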
@@ -125,9 +156,11 @@ def process_refexp(image: Image, prompt: str):
     print(f"processed prompt: {prompt}")
 
     # convert coordinates from tensor image size to input image size
-    out_size = (
-
-
+    out_size = (
+        processor.image_processor.size['width'], processor.image_processor.size['height'])
+    translate_point_coords_from_out_to_in(
+        point=center_point, input_image_size=image.size, output_image_size=out_size)
+
     x = math.floor(width*center_point["x"])
     y = math.floor(height*center_point["y"])
 
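Note: because processor-side resizing is now disabled, image.size at this point is still the original input size, so the decoder's relative point is first stretched past the padding and then scaled to pixels. A short sketch of that final step; the sizes and the center_point value are hypothetical stand-ins for the decoder output:

    import math

    center_point = {"x": 0.5, "y": 0.2}  # hypothetical prediction on the 960x1280 canvas
    width, height = 1920, 1080           # original input image size

    center_point["y"] *= 1280 / 540      # padding correction from the previous hunk

    x = math.floor(width * center_point["x"])   # 960
    y = math.floor(height * center_point["y"])  # ~512
    print(x, y)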
@@ -183,4 +216,4 @@ demo = gr.Interface(fn=process_refexp,
     cache_examples=False
     )
 
-demo.launch()
+demo.launch(share=True)
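Note: the last hunk only adds share=True, which on a local run asks Gradio to open a temporary public share link in addition to the local server URL. A minimal sketch with a stand-in function (echo is hypothetical, replacing process_refexp to keep the example self-contained):

    import gradio as gr

    def echo(text):
        # Stand-in for process_refexp; simply echoes its input.
        return text

    demo = gr.Interface(fn=echo, inputs="text", outputs="text")
    demo.launch(share=True)  # prints a local URL plus a temporary public one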
|