Spaces:

Ryukijano
/

Image-processor

Runtime error

App Files Files Community

Image-processor / app.py

Ryukijano

Update app.py

c0583a3 verified about 1 year ago

raw

history blame

2.97 kB

	# app.py for Hugging Face Space: image captioning (ViT-GPT2), DETR segmentation, and Stable Diffusion image modification
	import os

	import gradio as gr
	import spaces # Import the spaces module to use GPU-specific decorators (kept ahead of the torch-based imports)
	import torch
	from diffusers import StableDiffusionImg2ImgPipeline, StableDiffusionPipeline
	from PIL import Image
	from transformers import AutoFeatureExtractor, AutoTokenizer, VisionEncoderDecoderModel, pipeline

	# Hugging Face access token, read from the HF_TOKEN environment variable.
	# The value is None when the variable is not set.
	hf_token = os.environ.get("HF_TOKEN")

	# Set up the image-captioning model. Despite the variable name, the checkpoint
	# loaded here is "nlpconnect/vit-gpt2-image-captioning", a vision
	# encoder-decoder model, not a Llama model.
	llama_vision_model_id = "nlpconnect/vit-gpt2-image-captioning"

	# Pre-processor that turns input images into pixel tensors for the model.
	feature_extractor = AutoFeatureExtractor.from_pretrained(
	    llama_vision_model_id,
	    token=hf_token,
	)

	# Weights are loaded in bfloat16 and placed on devices via device_map="auto".
	vision_model = VisionEncoderDecoderModel.from_pretrained(
	    llama_vision_model_id,
	    token=hf_token,
	    device_map="auto",
	    torch_dtype=torch.bfloat16,
	)

	# Set up the segmentation pipeline. The panoptic DETR checkpoint is used
	# because the plain "facebook/detr-resnet-50" checkpoint is trained for object
	# detection and does not provide trained segmentation (mask) head weights.
	segment_model_id = "facebook/detr-resnet-50-panoptic"
	segment_pipe = pipeline(
	    "image-segmentation",
	    model=segment_model_id,
	    device=0,  # Run on the first CUDA device
	    token=hf_token,
	)

	# Set up Stable Diffusion v1.5 as an image-to-image pipeline so that an input
	# picture passed as `image=` actually conditions the generated output; the
	# text-to-image StableDiffusionPipeline has no `image` input.
	# NOTE(review): the checkpoint id points at the "stable-diffusion-v1-5"
	# organisation because the original "runwayml/stable-diffusion-v1-5" repo
	# appears to be no longer hosted on the Hub - confirm before deploying.
	stable_diffusion_model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
	diffusion_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
	    stable_diffusion_model_id,
	    torch_dtype=torch.float16,
	    token=hf_token,
	)
	diffusion_pipe = diffusion_pipe.to("cuda")  # Half-precision weights live on the GPU

	# Tokenizer used to decode caption token ids. It is loaded on first use by
	# _get_caption_tokenizer() and reused on later calls.
	_caption_tokenizer = None


	def _get_caption_tokenizer():
	    """Return the tokenizer of the captioning checkpoint, loading it once."""
	    global _caption_tokenizer
	    if _caption_tokenizer is None:
	        _caption_tokenizer = AutoTokenizer.from_pretrained(
	            llama_vision_model_id, token=hf_token
	        )
	    return _caption_tokenizer


	# Use the GPU decorator for the function that needs GPU access
	@spaces.GPU(duration=120)  # Allocates GPU for a maximum of 120 seconds
	def process_image(image):
	    """Caption an image, segment it, and generate a modified version.

	    Args:
	        image: The input picture (the Gradio input is configured with
	            ``type="pil"``), or ``None`` when no picture is present.

	    Returns:
	        The first image produced by ``diffusion_pipe`` for the prompt
	        ``"Modify the <caption>"``, or ``None`` when ``image`` is ``None``.
	    """
	    # With live=True the callback can fire after the input is cleared.
	    if image is None:
	        return None

	    # Step 1: caption the image. The pixel tensor is moved to the model's
	    # device AND cast to its dtype; the model is loaded in bfloat16 while the
	    # feature extractor returns float32 tensors.
	    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
	    pixel_values = pixel_values.to(device=vision_model.device, dtype=vision_model.dtype)
	    output_ids = vision_model.generate(pixel_values, max_length=50)
	    # The model config carries no tokenizer, so a real tokenizer decodes the ids.
	    caption = _get_caption_tokenizer().decode(output_ids[0], skip_special_tokens=True)
	    caption = caption.strip()

	    # Step 2: segment the image. The image is passed positionally because the
	    # pipeline does not accept an `image=` keyword.
	    # TODO: the segments are not yet used to restrict the diffusion step.
	    segments = segment_pipe(image)

	    # Step 3: run the diffusion pipeline with a prompt built from the caption.
	    output_image = diffusion_pipe(prompt=f"Modify the {caption}", image=image).images[0]

	    return output_image

	# Build the Gradio UI: a single PIL image goes in, a single image comes out.
	interface = gr.Interface(
	    process_image,
	    gr.Image(type="pil"),
	    "image",
	    title="Image Processor: Vision, Segmentation, and Modification",
	    description="Upload an image to generate a caption, segment important parts, and modify the image using Stable Diffusion.",
	    allow_flagging="never",  # Flagging is turned off
	    live=True,  # Live mode is enabled
	)

	# Start serving the app
	interface.launch()