Spaces:
Runtime error
Runtime error
# app.py for Hugging Face Space: connecting an image-captioning vision model (ViT-GPT2), DETR segmentation, and a Stable Diffusion model
import os

# Third-party imports. `spaces` stays ahead of the torch-based libraries,
# matching the original import order.
import gradio as gr
import spaces  # Import the spaces module to use GPU-specific decorators
import torch
from diffusers import StableDiffusionImg2ImgPipeline, StableDiffusionPipeline
from PIL import Image
from transformers import (
    AutoFeatureExtractor,
    AutoTokenizer,
    VisionEncoderDecoderModel,
    pipeline,
)
# Hugging Face access token, read from the HF_TOKEN environment variable
# (set as a repository secret on the Space). None when the variable is unset.
hf_token = os.getenv("HF_TOKEN")

# Captioning model used for initial image understanding.
# NOTE: despite the variable name, this checkpoint is a ViT encoder + GPT-2
# decoder captioner, not Meta Llama 3.2 Vision. The name is kept unchanged so
# existing references to it keep working.
llama_vision_model_id = "nlpconnect/vit-gpt2-image-captioning"
vision_model = VisionEncoderDecoderModel.from_pretrained(
    llama_vision_model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=hf_token,  # 'token' is used instead of 'use_auth_token'
)
feature_extractor = AutoFeatureExtractor.from_pretrained(
    llama_vision_model_id, token=hf_token
)

# Segmentation model. The plain "facebook/detr-resnet-50" checkpoint is an
# object-detection model without trained mask-head weights; the "-panoptic"
# variant is the one published for the image-segmentation task.
segment_model_id = "facebook/detr-resnet-50-panoptic"
segment_pipe = pipeline(
    "image-segmentation",
    model=segment_model_id,
    device=0,  # Force usage of GPU
    token=hf_token,
)

# Stable Diffusion v1.5 (text-to-image pipeline, half precision).
# The original "runwayml/stable-diffusion-v1-5" repository is no longer hosted
# on the Hub; this is the maintained mirror of the same weights.
stable_diffusion_model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
diffusion_pipe = StableDiffusionPipeline.from_pretrained(
    stable_diffusion_model_id,
    torch_dtype=torch.float16,
    token=hf_token,
)
diffusion_pipe = diffusion_pipe.to("cuda")  # Force usage of GPU
# Tokenizer used to turn generated caption token ids back into text.
# VisionEncoderDecoderModel's config carries no tokenizer object, so it has to
# be loaded explicitly from the same checkpoint as the captioning model.
_caption_tokenizer = AutoTokenizer.from_pretrained(
    llama_vision_model_id, token=hf_token
)

# Image-to-image pipeline built from the components of the already-loaded
# pipeline, so no weights are loaded a second time. The text-to-image
# StableDiffusionPipeline does not accept an input `image`.
_img2img_pipe = StableDiffusionImg2ImgPipeline(**diffusion_pipe.components)


# Use the GPU decorator for the function that needs GPU access
# Allocates GPU for a maximum of 120 seconds
@spaces.GPU(duration=120)
def process_image(image):
    """Caption, segment, and re-render an uploaded image.

    Args:
        image: The uploaded picture as a PIL image, or None when the input
            component is empty (e.g. the user cleared it while the interface
            is in live mode).

    Returns:
        The first image produced by the image-to-image diffusion pipeline,
        prompted with the generated caption, or None when no image was given.
    """
    # Nothing to process: return an empty output instead of crashing.
    if image is None:
        return None

    # Normalise to 3-channel RGB so RGBA / greyscale / palette uploads are
    # handled uniformly by all three models.
    rgb_image = image.convert("RGB")

    # Step 1: Use the vision model for initial image understanding (captioning).
    pixel_values = feature_extractor(
        images=rgb_image, return_tensors="pt"
    ).pixel_values
    # The extractor emits float32; match the model's device *and* dtype so the
    # forward pass does not fail on a dtype mismatch.
    pixel_values = pixel_values.to(
        device=vision_model.device, dtype=vision_model.dtype
    )
    output_ids = vision_model.generate(pixel_values, max_length=50)
    caption = _caption_tokenizer.decode(
        output_ids[0], skip_special_tokens=True
    ).strip()

    # Step 2: Segment important parts of the image using DETR.
    # The image is passed positionally: the pipeline does not recognise an
    # `image=` keyword.
    # NOTE(review): the segmentation result is computed but not yet consumed
    # by the diffusion step below.
    segments = segment_pipe(rgb_image)

    # Step 3: Modify the image using the diffusion model, guided by the caption.
    result = _img2img_pipe(prompt=f"Modify the {caption}", image=rgb_image)
    return result.images[0]
# Gradio UI: a single PIL image input wired to process_image, with a single
# image output.
interface = gr.Interface(
    # Wiring
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs="image",
    # Presentation
    title="Image Processor: Vision, Segmentation, and Modification",
    description="Upload an image to generate a caption, segment important parts, and modify the image using Stable Diffusion.",
    # Behaviour
    live=True,  # re-run on input changes instead of waiting for a submit click
    allow_flagging="never",  # no flagging, to keep interactions light
)

# Start serving the app.
interface.launch()