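"""Gradio app that pairs BLIP image captioning with ResNet-18 image
classification behind a tabbed interface."""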
import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
import requests
from torchvision import transforms
# --- Part 1: Setup for Image Captioning ---
# Load the captioning model and processor
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
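# Note: the first run downloads the BLIP weights (roughly 1 GB) from the
# Hugging Face Hub; subsequent runs load them from the local cache.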
def caption_image(input_image: Image.Image) -> str:
    """Generates a caption for a given PIL Image."""
    inputs = caption_processor(images=input_image, return_tensors="pt")
    outputs = caption_model.generate(**inputs)
    caption = caption_processor.decode(outputs[0], skip_special_tokens=True)
    return caption
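# generate() defaults to greedy decoding. A minimal variant that decodes with
# beam search, which often yields more descriptive captions; the num_beams and
# max_new_tokens values below are illustrative assumptions, not tuned:
def caption_image_beam(input_image: Image.Image, num_beams: int = 4,
                       max_new_tokens: int = 40) -> str:
    """Like caption_image, but decodes with beam search."""
    inputs = caption_processor(images=input_image, return_tensors="pt")
    outputs = caption_model.generate(**inputs, num_beams=num_beams,
                                     max_new_tokens=max_new_tokens)
    return caption_processor.decode(outputs[0], skip_special_tokens=True)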
# --- Part 2: Setup for Image Classification ---
# Load the classification model
classify_model = torch.hub.load('pytorch/vision:v0.6.0', 'resnet18', pretrained=True).eval()
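# Note: recent torchvision releases deprecate `pretrained=` in favor of
# `weights=`. If the hub call above starts warning or failing, this is the
# modern equivalent (assuming torchvision >= 0.13):
# classify_model = torch.hub.load(
#     'pytorch/vision:v0.13.0', 'resnet18',
#     weights='ResNet18_Weights.DEFAULT').eval()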
# Download the 1,000 ImageNet class labels (plain text, one label per line)
response = requests.get("https://git.io/JJkYN")
labels = response.text.split("\n")
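# Defensive check (an addition, not part of the original recipe): fail fast
# if the short link ever stops redirecting to the labels file.
response.raise_for_status()
assert len(labels) >= 1000, f"expected 1000 ImageNet labels, got {len(labels)}"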
def classify_image(input_image: Image.Image) -> dict:
    """Classifies an image and returns a dictionary of label -> confidence."""
    # Standard ImageNet preprocessing for torchvision ResNet models
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    # Normalize expects 3 channels, so force RGB (uploads can be RGBA or grayscale)
    processed_image = preprocess(input_image.convert("RGB")).unsqueeze(0)
    with torch.no_grad():
        prediction = torch.nn.functional.softmax(classify_model(processed_image)[0], dim=0)
    confidences = {labels[i]: float(prediction[i]) for i in range(1000)}
    return confidences
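# Optional local smoke test for both functions, wrapped in a helper so the
# Space still starts normally on import; "example.jpg" is a placeholder path.
def _smoke_test(path: str = "example.jpg") -> None:
    img = Image.open(path)
    print("Caption:", caption_image(img))
    top3 = sorted(classify_image(img).items(), key=lambda kv: kv[1],
                  reverse=True)[:3]
    print("Top-3 classes:", top3)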
# --- Part 3: Create the Interfaces and Combine Them ---
# Define the first interface (Captioning)
iface_caption = gr.Interface(
    fn=caption_image,
    inputs=gr.Image(type="pil", label="Upload Image for Captioning"),
    outputs=gr.Textbox(label="Generated Caption"),
    title="Image Captioning with BLIP",
    description="Upload an image and the AI will generate a descriptive caption."
)
# Define the second interface (Classification)
iface_classify = gr.Interface(
    fn=classify_image,
    inputs=gr.Image(type="pil", label="Upload Image for Classification"),
    outputs=gr.Label(num_top_classes=3, label="Top 3 Predictions"),
    title="Image Classification with ResNet",
    description="Upload an image and the AI will predict its class."
)
# Combine the interfaces in a tabbed layout
demo = gr.TabbedInterface(
    [iface_caption, iface_classify],
    ["Image Captioning", "Image Classification"]
)
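# Optional: on older Gradio versions, enabling the request queue helps when
# several users hit the Space at once (both models run on CPU here). Newer
# releases queue by default, so this is left as a commented suggestion:
# demo = demo.queue()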
# --- Part 4: Launch the App ---
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0")