Spaces:
Runtime error
Runtime error
| import spaces | |
| import gradio as gr | |
| import os | |
| import sys | |
| from typing import List | |
| # sys.path.append(os.getcwd()) | |
| import numpy as np | |
| from PIL import Image | |
| import torch | |
| from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor | |
| from qwen_vl_utils import process_vision_info | |
| from gradio_imageslider import ImageSlider | |
| print(f'torch version:{torch.__version__}') | |
| import torch.utils.checkpoint | |
| from pytorch_lightning import seed_everything | |
| from diffusers import AutoencoderKL, DDIMScheduler | |
| from diffusers.utils import check_min_version | |
| from diffusers.utils.import_utils import is_xformers_available | |
| from transformers import CLIPTextModel, CLIPTokenizer, CLIPImageProcessor | |
| from huggingface_hub import hf_hub_download, snapshot_download | |
| from pipelines.pipeline_seesr import StableDiffusionControlNetPipeline | |
| from utils.wavelet_color_fix import wavelet_color_fix, adain_color_fix | |
| from ram.models.ram_lora import ram | |
| from ram import inference_ram as inference | |
| from torchvision import transforms | |
| from models.controlnet import ControlNetModel | |
| from models.unet_2d_condition import UNet2DConditionModel | |
| # VLM_NAME = "Qwen/Qwen2.5-VL-3B-Instruct" | |
| # vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained( | |
| # VLM_NAME, | |
| # torch_dtype="auto", | |
| # device_map="auto" # immediately dispatches layers onto available GPUs | |
| # ) | |
| # vlm_processor = AutoProcessor.from_pretrained(VLM_NAME) | |
def _generate_vlm_prompt(
    vlm_model: Qwen2_5_VLForConditionalGeneration,
    vlm_processor: AutoProcessor,
    process_vision_info,
    pil_image: Image.Image,
    device: str = "cuda"
) -> str:
    """Ask the vision-language model for a detailed description of one image.

    Args:
        vlm_model: Model whose ``generate`` method produces the answer tokens.
        vlm_processor: Processor used to render the chat template, pack the
            model inputs and decode the generated tokens.
        process_vision_info: Callable that receives the chat messages and
            returns ``(image_inputs, video_inputs)`` for the processor.
        pil_image: The image to describe.
        device: Device the packed inputs are moved to before generation.

    Returns:
        The decoded description (at most 128 newly generated tokens) with
        leading and trailing whitespace stripped.
    """
    # The two sentences used to be concatenated without a separating space
    # ("...this image.describe each...") and started with a stray "The".
    message_text = (
        "Give a detailed description of this image. "
        "Describe each element with fine details."
    )
    messages = [
        {"role": "system", "content": message_text},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
            ],
        },
    ]

    # Render the conversation to text, leaving the assistant turn open so the
    # model continues with its answer.
    text = vlm_processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = vlm_processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(device)

    generated = vlm_model.generate(**inputs, max_new_tokens=128)

    # Drop the prompt tokens that are echoed at the start of every output
    # sequence so that only the newly generated answer is decoded.
    trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated)
    ]
    out_text = vlm_processor.batch_decode(
        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return out_text.strip()
# Converts the input image into a tensor; no other step is applied.
tensor_transforms = transforms.Compose([transforms.ToTensor()])

# Preprocessing applied to the tensor before it is handed to the tag model:
# resize to 384x384, then normalize each channel with the mean/std below.
ram_transforms = transforms.Compose(
    [
        transforms.Resize(size=(384, 384)),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)
# Fetch every model repository the app needs into the local "preset/models"
# tree. The repositories are downloaded in the order listed.
for _repo_id, _local_dir in (
    ("alexnasa/SEESR", "preset/models"),
    ("stabilityai/stable-diffusion-2-1-base", "preset/models/stable-diffusion-2-1-base"),
    ("xinyu1205/recognize_anything_model", "preset/models/"),
):
    snapshot_download(repo_id=_repo_id, local_dir=_local_dir)
# Locations of the downloaded checkpoints: the base Stable Diffusion weights
# and the SeeSR-specific UNet / ControlNet weights.
pretrained_model_path = 'preset/models/stable-diffusion-2-1-base'
seesr_model_path = 'preset/models/seesr'

# Components read from the base Stable Diffusion checkpoint.
scheduler = DDIMScheduler.from_pretrained(
    pretrained_model_path, subfolder="scheduler"
)
text_encoder = CLIPTextModel.from_pretrained(
    pretrained_model_path, subfolder="text_encoder"
)
tokenizer = CLIPTokenizer.from_pretrained(
    pretrained_model_path, subfolder="tokenizer"
)
vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae")
feature_extractor = CLIPImageProcessor.from_pretrained(
    f"{pretrained_model_path}/feature_extractor"
)

# Components read from the SeeSR checkpoint.
unet = UNet2DConditionModel.from_pretrained(seesr_model_path, subfolder="unet")
controlnet = ControlNetModel.from_pretrained(
    seesr_model_path, subfolder="controlnet"
)

# Inference only: switch off gradients for all four networks.
for _frozen in (vae, text_encoder, unet, controlnet):
    _frozen.requires_grad_(False)

# xformers memory-efficient attention is not enabled here.

# Assemble the pipeline. Note that it is built with feature_extractor=None
# and without a safety checker; the feature_extractor loaded above is not
# passed in.
validation_pipeline = StableDiffusionControlNetPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    feature_extractor=None,
    unet=unet,
    controlnet=controlnet,
    scheduler=scheduler,
    safety_checker=None,
    requires_safety_checker=False,
)
validation_pipeline._init_tiled_vae(
    encoder_tile_size=1024, decoder_tile_size=224
)

weight_dtype = torch.float16
device = "cuda"

# Move the four networks to the GPU and cast them to half precision.
for _module in (text_encoder, vae, unet, controlnet):
    _module.to(device, dtype=weight_dtype)

# Tag model used to derive the text prompt and the image embeddings that
# condition the pipeline; also run in eval mode, on the GPU, in half precision.
tag_model = ram(
    pretrained='preset/models/ram_swin_large_14m.pth',
    pretrained_condition='preset/models/DAPE.pth',
    image_size=384,
    vit='swin_l',
)
tag_model.eval()
tag_model.to(device, dtype=weight_dtype)
def preprocess_image(input_image: Image.Image) -> Image.Image:
    """Return a copy of ``input_image`` limited to a 512x512 bounding box.

    The caller's image is left untouched: the work is done on a copy, which
    is resized in place by ``Image.thumbnail`` using bilinear resampling.
    """
    max_side = 512
    shrunk = input_image.copy()
    shrunk.thumbnail((max_side, max_side), Image.Resampling.BILINEAR)
    return shrunk
def preprocess_n_magnify(input_image: Image.Image):
    """Preprocess an image and magnify it in a single call.

    Returns a pair: the first image returned by ``magnify`` and a
    ``(before, after)`` tuple made of that same image and the magnified
    result. The UI feeds these to the preview image and the comparison
    slider respectively.
    """
    small = preprocess_image(input_image)
    before, after = magnify(small)
    return before, (before, after)
# NOTE(review): `spaces` is imported by this file but was never used. On
# ZeroGPU hardware, functions that touch CUDA have to be wrapped with
# `spaces.GPU`; per the Hugging Face documentation the decorator has no effect
# elsewhere. This assumes the Space targets ZeroGPU -- confirm.
@spaces.GPU
def magnify(
    input_image: Image.Image,
    user_prompt = "",
    positive_prompt = "clean, high-resolution, 8k, best quality, masterpiece",
    negative_prompt = "dotted, noise, blur, lowres, oversmooth, longbody, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
    num_inference_steps = 50,
    scale_factor = 4,
    cfg_scale = 7.5,
    seed = 123,
    latent_tiled_size = 320,
    latent_tiled_overlap = 4,
    sample_times = 1,
    progress=gr.Progress(track_tqdm=True),
) -> tuple[Image.Image, np.ndarray]:
    """Super-resolve ``input_image`` with the SeeSR pipeline.

    Args:
        input_image: Low-resolution image to magnify.
        user_prompt: Optional text placed in front of the generated prompt.
        positive_prompt: Text appended after the tag model's output.
        negative_prompt: Negative prompt forwarded to the pipeline.
        num_inference_steps: Number of sampling steps.
        scale_factor: Factor applied to both image dimensions.
        cfg_scale: Guidance scale forwarded to the pipeline.
        seed: Seed for the global RNGs and for the pipeline's generator.
        latent_tiled_size: Tile size forwarded to the pipeline.
        latent_tiled_overlap: Tile overlap forwarded to the pipeline.
        sample_times: Number of samples to draw; only the first is returned.
        progress: Gradio progress tracker (mirrors tqdm progress bars).

    Returns:
        A ``(conditioning_image, result)`` pair: the upscaled input actually
        fed to the pipeline (a PIL image) and the first generated sample as a
        NumPy array. If the pipeline raises, the sample is a black 512x512
        image instead.

    Raises:
        gr.Error: If ``input_image`` is ``None``.
    """
    if input_image is None:
        # Surface a readable message instead of a TypeError raised from
        # inside the tensor transforms.
        raise gr.Error("Please upload an image before magnifying.")

    process_size = 512
    resize_preproc = transforms.Compose([
        transforms.Resize(process_size, interpolation=transforms.InterpolationMode.BILINEAR),
    ])

    seed_everything(seed)
    # The generator was previously created but never seeded, so the `seed`
    # argument did not reach it.
    generator = torch.Generator(device=device)
    generator.manual_seed(int(seed))

    # Tag the image and compute its embeddings. No gradients are needed, so
    # avoid building an autograd graph for the tag model.
    with torch.no_grad():
        lq = tensor_transforms(input_image).unsqueeze(0).to(device).half()
        lq = ram_transforms(lq)
        res = inference(lq, tag_model)
        ram_encoder_hidden_states = tag_model.generate_image_embeds(lq)

    # Prompt layout: "[user_prompt, ]<first inference result>, <positive_prompt>,"
    validation_prompt = f"{res[0]}, {positive_prompt},"
    if user_prompt != '':
        validation_prompt = f"{user_prompt}, {validation_prompt}"

    ori_width, ori_height = input_image.size
    rscale = scale_factor
    # Integer target size, reused for the initial upscale and for the final
    # resize. The final resize used to pass un-truncated products, which
    # fails for a non-integer scale factor.
    target_size = (int(ori_width * rscale), int(ori_height * rscale))

    input_image = input_image.resize(target_size)
    if min(input_image.size) < process_size:
        # Make the shorter side at least `process_size` pixels.
        input_image = resize_preproc(input_image)
    # Round both dimensions down to a multiple of 8.
    input_image = input_image.resize((input_image.size[0] // 8 * 8, input_image.size[1] // 8 * 8))
    width, height = input_image.size

    images = []
    for _ in range(sample_times):
        try:
            with torch.autocast("cuda"):
                image = validation_pipeline(
                    validation_prompt, input_image, negative_prompt=negative_prompt,
                    num_inference_steps=num_inference_steps, generator=generator,
                    height=height, width=width,
                    guidance_scale=cfg_scale, conditioning_scale=1,
                    start_point='lr', start_steps=999, ram_encoder_hidden_states=ram_encoder_hidden_states,
                    latent_tiled_size=latent_tiled_size, latent_tiled_overlap=latent_tiled_overlap,
                ).images[0]
            # Color correction against the conditioning image is always on.
            image = wavelet_color_fix(image, input_image)
            # Undo the size adjustments above so the output is exactly
            # original size x scale.
            image = image.resize(target_size)
        except Exception as e:
            # Best effort: report the failure and fall back to a black image
            # so the UI still receives something to display.
            print(e)
            image = Image.new(mode="RGB", size=(512, 512))
        images.append(np.array(image))
    return input_image, images[0]
# Centers the main column and caps its width.
css = """
#col-container {
    margin: 0 auto;
    max-width: 1024px;
}
"""

theme = gr.themes.Ocean()

# NOTE(review): the nesting of the layout below was reconstructed from a copy
# of this file that had lost its indentation -- confirm it against the
# intended layout.
with gr.Blocks(css=css, theme=theme) as demo:
    with gr.Column(elem_id="col-container"):
        # Title line with a link to the upstream repository.
        with gr.Row():
            gr.HTML(
                """
                <div style="text-align: center;">
                <p style="font-size:16px; display: inline; margin: 0;">
                <strong>SeeSR</strong> – Towards Semantics-Aware Real-World Image Super-Resolution
                </p>
                <a href="https://github.com/cswry/SeeSR" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
                <img src="https://img.shields.io/badge/GitHub-Repo-blue" alt="GitHub Repo">
                </a>
                </div>
                """
            )
        with gr.Row():
            input_image = gr.Image(type="pil", height=256)
            run_button = gr.Button("🔎 Magnify 4x", variant="primary")
            # preprocess_image fits the upload inside 512x512; `height` only
            # sets the display height of the component.
            preprocessed_image = gr.Image(label="preprocessed image (max 512x512)", type="pil", interactive=False, height=256)

        # Hidden option widgets. They are not wired to any event below, so
        # magnify always runs with its default arguments.
        with gr.Accordion("Options", visible=False):
            user_prompt = gr.Textbox(label="User Prompt", value="")
            positive_prompt = gr.Textbox(label="Positive Prompt", value="clean, high-resolution, 8k, best quality, masterpiece")
            negative_prompt = gr.Textbox(
                label="Negative Prompt",
                value="dotted, noise, blur, lowres, oversmooth, longbody, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality"
            )
            # step was 0, which is not a usable slider increment.
            cfg_scale = gr.Slider(label="Classifier Free Guidance Scale (Set to 1.0 in sd-turbo)", minimum=1, maximum=10, value=7.5, step=0.1)
            num_inference_steps = gr.Slider(label="Inference Steps", minimum=2, maximum=100, value=50, step=1)
            seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, value=231)
            sample_times = gr.Slider(label="Sample Times", minimum=1, maximum=10, step=1, value=1)
            latent_tiled_size = gr.Slider(label="Diffusion Tile Size", minimum=128, maximum=480, value=320, step=1)
            latent_tiled_overlap = gr.Slider(label="Diffusion Tile Overlap", minimum=4, maximum=16, value=4, step=1)
            scale_factor = gr.Number(label="SR Scale", value=4)

        with gr.Column():
            # Before/after comparison slider.
            result_gallery = ImageSlider(
                interactive=False,
                label="Magnified",
                position=0.5
            )

        # Clicking an example runs preprocessing and magnification in one go;
        # results are cached.
        examples = gr.Examples(
            examples=[
                [
                    "preset/datasets/test_datasets/179.png",
                ],
                [
                    "preset/datasets/test_datasets/cinema.png",
                ],
                [
                    "preset/datasets/test_datasets/cartoon.png",
                ],
            ],
            inputs=[
                input_image,
            ],
            outputs=[preprocessed_image, result_gallery],
            fn=preprocess_n_magnify,
            cache_examples=True,
        )

    # Magnify works on the already preprocessed image; its (before, after)
    # return value is passed as a single value to the comparison slider.
    run_button.click(fn=magnify, inputs=preprocessed_image, outputs=[result_gallery])
    # Refresh the preview whenever a new image is uploaded.
    input_image.upload(fn=preprocess_image, inputs=input_image, outputs=[preprocessed_image])

demo.launch(share=True)