nguyenbh committed · Commit 17df1f5 · Parent(s): bd4b39b

Update examples

README.md CHANGED
````diff
@@ -330,59 +330,94 @@ After obtaining the Phi-4-Mini-MM-Instruct model checkpoints, users can use this
 import requests
 import torch
 import os
+import io
 from PIL import Image
-import soundfile
-from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+import soundfile as sf
+from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
 
-
+# Define model path
+model_path = "microsoft/Phi-4-multimodal-instruct"
 
+# Load model and processor
+processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(
-    
+    model_path, 
     device_map="cuda", 
     torch_dtype="auto", 
     trust_remote_code=True, 
-    
+    attn_implementation='flash_attention_2',
 ).cuda()
 
-
+# Load generation config
+generation_config = GenerationConfig.from_pretrained(model_path)
 
+# Define prompt structure
 user_prompt = '<|user|>'
 assistant_prompt = '<|assistant|>'
 prompt_suffix = '<|end|>'
 
+# Part 1: Image Processing
+print("\n--- IMAGE PROCESSING ---")
+image_url = 'https://www.ilankelman.org/stopsigns/australia.jpg'
 prompt = f'{user_prompt}<|image_1|>What is shown in this image?{prompt_suffix}{assistant_prompt}'
-url = 'https://www.ilankelman.org/stopsigns/australia.jpg'
 print(f'>>> Prompt\n{prompt}')
-
+
+# Download and open image
+image = Image.open(requests.get(image_url, stream=True).raw)
 inputs = processor(text=prompt, images=image, return_tensors='pt').to('cuda:0')
+
+# Generate response
 generate_ids = model.generate(
     **inputs,
     max_new_tokens=1000,
     generation_config=generation_config,
 )
-generate_ids = generate_ids[:, inputs['input_ids'].shape[1]
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
 response = processor.batch_decode(
     generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
 )[0]
 print(f'>>> Response\n{response}')
 
-
+# Part 2: Audio Processing
+print("\n--- AUDIO PROCESSING ---")
+audio_url = "https://voiceage.com/wbsamples/in_mono/Trailer.wav"
 speech_prompt = "Transcribe the audio to text, and then translate the audio to French. Use <sep> as a separator between the original transcript and the translation."
 prompt = f'{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}'
-
 print(f'>>> Prompt\n{prompt}')
-
-
-
-
-    
-    
-)
-
-
-    
-)
-
+
+# Download audio file
+audio_response = requests.get(audio_url)
+if audio_response.status_code == 200:
+    # First save audio to a temporary file
+    temp_audio_path = "temp_audio.wav"
+    with open(temp_audio_path, "wb") as f:
+        f.write(audio_response.content)
+
+    # Read using soundfile
+    audio, samplerate = sf.read(temp_audio_path)
+
+    # Process with the model
+    inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to('cuda:0')
+
+    generate_ids = model.generate(
+        **inputs,
+        max_new_tokens=1000,
+        generation_config=generation_config,
+    )
+    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+    response = processor.batch_decode(
+        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0]
+    print(f'>>> Response\n{response}')
+
+    # Clean up
+    try:
+        os.remove(temp_audio_path)
+        print(f"Temporary file {temp_audio_path} removed successfully")
+    except Exception as e:
+        print(f"Error removing temporary file: {e}")
+else:
+    print(f"Failed to download audio file: {audio_response.status_code}")
 ```
 
 ## Responsible AI Considerations
````
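Note on the new `attn_implementation='flash_attention_2'` argument: it requires the optional flash-attn package and a GPU that supports it. As a hedged sketch (not part of this commit), Transformers' `from_pretrained` also accepts `'eager'` and `'sdpa'` for this argument, so loading on hardware without FlashAttention-2 might look like:

```python
from transformers import AutoModelForCausalLM

model_path = "microsoft/Phi-4-multimodal-instruct"

# Sketch under the assumption that flash-attn is unavailable:
# 'eager' is the standard no-extra-dependency attention implementation
# accepted by transformers, and 'sdpa' is another common fallback.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
    attn_implementation='eager',
).cuda()
```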
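The diff also adds `import io`, but the committed audio path still writes `temp_audio.wav` to disk and removes it afterwards. Since `soundfile` can read from file-like objects, a minimal sketch (same `audio_url` and `sf` alias as in the commit) that decodes the download in memory:

```python
import io

import requests
import soundfile as sf

audio_url = "https://voiceage.com/wbsamples/in_mono/Trailer.wav"

# Download the clip and decode it straight from memory; no temp file to clean up.
audio_response = requests.get(audio_url)
audio_response.raise_for_status()
audio, samplerate = sf.read(io.BytesIO(audio_response.content))
```

The resulting `(audio, samplerate)` tuple can then be passed to the processor exactly as in the committed example.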
