Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -34,7 +34,14 @@ from torchvision import transforms
 from models.controlnet import ControlNetModel
 from models.unet_2d_condition import UNet2DConditionModel
 
+# VLM_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
 
+# vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+#     VLM_NAME,
+#     torch_dtype="auto",
+#     device_map="auto"  # immediately dispatches layers onto available GPUs
+# )
+# vlm_processor = AutoProcessor.from_pretrained(VLM_NAME)
 
 def _generate_vlm_prompt(
     vlm_model: Qwen2_5_VLForConditionalGeneration,
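The hunk above only comments out the global Qwen2.5-VL loader; `_generate_vlm_prompt` itself stays in place. For orientation, here is a minimal, self-contained sketch of how such a Qwen2.5-VL captioner is typically driven through the public transformers / `qwen_vl_utils` API. The model name matches the commented constant; the image path, prompt text, and decoding choices are illustrative and not taken from this Space.

    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info

    VLM_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
    vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        VLM_NAME, torch_dtype="auto", device_map="auto"
    )
    vlm_processor = AutoProcessor.from_pretrained(VLM_NAME)

    # Illustrative request: caption a local image so the caption can seed the SR prompt.
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": "file:///path/to/low_res_input.png"},  # placeholder path
            {"type": "text", "text": "Describe the main content of this image in one sentence."},
        ],
    }]
    text = vlm_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = vlm_processor(
        text=[text], images=image_inputs, videos=video_inputs,
        padding=True, return_tensors="pt",
    ).to(vlm_model.device)

    generated = vlm_model.generate(**inputs, max_new_tokens=64)
    trimmed = generated[:, inputs.input_ids.shape[1]:]  # drop the echoed prompt tokens
    caption = vlm_processor.batch_decode(trimmed, skip_special_tokens=True)[0]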
@@ -107,25 +114,25 @@ snapshot_download(
 
 
 snapshot_download(
-    repo_id="stabilityai/
-    local_dir="preset/models/
+    repo_id="stabilityai/stable-diffusion-2-1-base",
+    local_dir="preset/models/stable-diffusion-2-1-base"
 )
 
-
 snapshot_download(
     repo_id="xinyu1205/recognize_anything_model",
     local_dir="preset/models/"
 )
 
+
 # Load scheduler, tokenizer and models.
-pretrained_model_path = 'preset/models/
+pretrained_model_path = 'preset/models/stable-diffusion-2-1-base'
 seesr_model_path = 'preset/models/seesr'
 
 scheduler = DDIMScheduler.from_pretrained(pretrained_model_path, subfolder="scheduler")
 text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder")
 tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
 vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae")
-
+feature_extractor = CLIPImageProcessor.from_pretrained(f"{pretrained_model_path}/feature_extractor")
 unet = UNet2DConditionModel.from_pretrained(seesr_model_path, subfolder="unet")
 controlnet = ControlNetModel.from_pretrained(seesr_model_path, subfolder="controlnet")
 
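A quick reference for the download-then-load pattern this hunk settles on: `snapshot_download` mirrors the repo into the given `local_dir` (and returns that path), after which the individual pipeline components are read from subfolders. The sketch below reuses only the repo id and directory added above; the `subfolder=` form in the last line is the standard equivalent of the f-string path used for the feature extractor.

    from huggingface_hub import snapshot_download
    from transformers import CLIPImageProcessor

    local_dir = snapshot_download(
        repo_id="stabilityai/stable-diffusion-2-1-base",
        local_dir="preset/models/stable-diffusion-2-1-base",
    )
    # Same component as CLIPImageProcessor.from_pretrained(f"{local_dir}/feature_extractor"):
    feature_extractor = CLIPImageProcessor.from_pretrained(local_dir, subfolder="feature_extractor")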
@@ -185,9 +192,9 @@ def magnify(
     user_prompt = "",
     positive_prompt = "clean, high-resolution, 8k, best quality, masterpiece",
     negative_prompt = "dotted, noise, blur, lowres, oversmooth, longbody, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
-    num_inference_steps = 
+    num_inference_steps = 50,
     scale_factor = 4,
-    cfg_scale = 
+    cfg_scale = 7.5,
     seed = 123,
     latent_tiled_size = 320,
     latent_tiled_overlap = 4,
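For context on the new defaults: `cfg_scale` is the classifier-free guidance weight used when combining the unconditional and text-conditional noise predictions, so a value of 1.0 (the sd-turbo setting referenced by the slider label later in the file) effectively turns guidance off, while 7.5 with 50 DDIM steps is a conventional setting for Stable Diffusion 2.1. A minimal sketch of that combination, with illustrative names:

    import torch

    def apply_cfg(noise_pred_uncond: torch.Tensor,
                  noise_pred_text: torch.Tensor,
                  cfg_scale: float) -> torch.Tensor:
        # Classifier-free guidance: push the prediction away from the unconditional
        # branch toward the text-conditioned one. With cfg_scale == 1.0 this reduces
        # to the conditional prediction alone, i.e. guidance is disabled.
        return noise_pred_uncond + cfg_scale * (noise_pred_text - noise_pred_uncond)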
@@ -288,15 +295,15 @@ with gr.Blocks(css=css, theme=theme) as demo:
     input_image = gr.Image(type="pil", height=512)
     run_button = gr.Button("π Magnify 4x", variant="primary")
     duration_time = gr.Text(label="duration time", value=60, visible=False)
-    with gr.Accordion("Options"):
+    with gr.Accordion("Options", visible=False):
         user_prompt = gr.Textbox(label="User Prompt", value="")
         positive_prompt = gr.Textbox(label="Positive Prompt", value="clean, high-resolution, 8k, best quality, masterpiece")
         negative_prompt = gr.Textbox(
             label="Negative Prompt",
             value="dotted, noise, blur, lowres, oversmooth, longbody, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality"
         )
-        cfg_scale = gr.Slider(label="Classifier Free Guidance Scale (Set to 1.0 in sd-turbo)", minimum=1, maximum=10, value=
-        num_inference_steps = gr.Slider(label="Inference Steps", minimum=2, maximum=100, value=
+        cfg_scale = gr.Slider(label="Classifier Free Guidance Scale (Set to 1.0 in sd-turbo)", minimum=1, maximum=10, value=7.5, step=0)
+        num_inference_steps = gr.Slider(label="Inference Steps", minimum=2, maximum=100, value=50, step=1)
         seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, value=231)
         sample_times = gr.Slider(label="Sample Times", minimum=1, maximum=10, step=1, value=1)
         latent_tiled_size = gr.Slider(label="Diffusion Tile Size", minimum=128, maximum=480, value=320, step=1)
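Note that `visible=False` on the Accordion hides the whole options panel but keeps its child components alive, so the defaults set here (50 steps, cfg 7.5, seed 231, ...) are still what reaches the callback when those components are wired as inputs. A reduced, hypothetical demo of that behaviour, not the Space itself:

    import gradio as gr

    def report(image, steps, cfg):
        # The hidden sliders still deliver their (default) values to the callback.
        return f"would run {steps} steps at cfg {cfg}"

    with gr.Blocks() as demo:
        image = gr.Image(type="pil")
        with gr.Accordion("Options", visible=False):
            steps = gr.Slider(label="Inference Steps", minimum=2, maximum=100, value=50, step=1)
            cfg = gr.Slider(label="CFG Scale", minimum=1, maximum=10, value=7.5)
        button = gr.Button("Run")
        out = gr.Textbox()
        button.click(fn=report, inputs=[image, steps, cfg], outputs=out)

    demo.launch()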
@@ -331,7 +338,7 @@ with gr.Blocks(css=css, theme=theme) as demo:
     inputs = [
         input_image,
     ]
-    run_button.click(fn=magnify, inputs=
+    run_button.click(fn=magnify, inputs=input_image, outputs=[result_gallery])
     input_image.upload(fn=preprocess_image,inputs=input_image, outputs=input_image)
 
 demo.launch(share=True)
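One behavioural consequence of the new wiring worth flagging: passing a single component (rather than a list) as `inputs` means Gradio calls `magnify` with only the image, so every other parameter falls back to the signature defaults set earlier in this diff, and the `inputs = [...]` list built just above is not the one actually used. A stripped-down, hypothetical illustration:

    import gradio as gr

    def magnify_stub(image, num_inference_steps=50, cfg_scale=7.5):
        # Stand-in for magnify(): only `image` is supplied by the event; the rest default.
        return [image]

    with gr.Blocks() as demo:
        input_image = gr.Image(type="pil")
        run_button = gr.Button("Magnify 4x")
        result_gallery = gr.Gallery()
        run_button.click(fn=magnify_stub, inputs=input_image, outputs=[result_gallery])

    demo.launch()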