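"""Gradio demo for microsoft/Phi-4-multimodal-instruct.

Upload an image or an audio file, ask a question about it, and get a text
response from the model. Intended to run on a Hugging Face Space (ZeroGPU).
"""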
import gradio as gr
from PIL import Image
import torch
import soundfile as sf
from transformers import AutoModelForCausalLM, AutoProcessor
import spaces

# Define model path
model_path = "microsoft/Phi-4-multimodal-instruct"

# Load model and processor
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="eager",  # eager attention, so flash-attn is not required
)

# Define prompt structure
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'
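# Prompts are assembled as, for example:
#   <|user|><|image_1|>What is shown in this image?<|end|><|assistant|>
# where <|image_1|> / <|audio_1|> are placeholders the processor binds to the
# uploaded media.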
# Define inference function (decorated so ZeroGPU allocates a GPU per call)
@spaces.GPU
def process_input(input_type, file, question):
    if not file or not question:
        return "Please upload a file and provide a question."

    # Prepare the prompt
    if input_type == "Image":
        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
        # Open image from uploaded file
        image = Image.open(file)
        inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
    elif input_type == "Audio":
        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
        # Read audio from uploaded file
        audio, samplerate = sf.read(file)
        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)
    else:
        return "Invalid input type selected."
    # Generate response
    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            num_logits_to_keep=0,
        )
    # Keep only the newly generated tokens (drop the prompt) before decoding
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response
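# Minimal sketch of calling the function directly, assuming a hypothetical local
# file "example.jpg" (outside a ZeroGPU Space, @spaces.GPU is a no-op):
#   process_input("Image", "example.jpg", "What is shown in this image?")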
# Gradio interface
with gr.Blocks(
    title="Phi-4 Multimodal Demo",
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
        radius_size="lg",
    ),
) as demo:
    gr.Markdown(
        """
        # Phi-4 Multimodal Demo
        Upload an **image** or **audio** file, ask a question, and get a response from the model!
        Built with the `microsoft/Phi-4-multimodal-instruct` model by Microsoft.
        Credits: Grok from xAI helped me build this demo.
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            input_type = gr.Radio(
                choices=["Image", "Audio"],
                label="Select Input Type",
                value="Image",
            )
            file_input = gr.File(
                label="Upload Your File",
                file_types=["image", "audio"],
            )
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="e.g., 'What is shown in this image?' or 'Transcribe this audio.'",
                lines=2,
            )
            submit_btn = gr.Button("Submit", variant="primary")
        with gr.Column(scale=2):
            output_text = gr.Textbox(
                label="Model Response",
                placeholder="Response will appear here...",
                lines=10,
                interactive=False,
            )
    # Example section
    with gr.Accordion("Examples", open=False):
        gr.Markdown("Try these examples:")
        gr.Examples(
            examples=[
                ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "What is shown in this image?"],
                ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Transcribe the audio to text."],
            ],
            inputs=[input_type, file_input, question_input],
            outputs=output_text,
            fn=process_input,
            cache_examples=False,
        )
    # Connect the submit button
    submit_btn.click(
        fn=process_input,
        inputs=[input_type, file_input, question_input],
        outputs=output_text,
    )

# Launch the demo
demo.launch()
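# To try this outside Spaces (a sketch; the model's remote code may need extra
# dependencies beyond these):
#   pip install gradio spaces torch transformers soundfile pillow
#   python app.py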
