nguyenbh committed · Commit 17df1f5 · Parent(s): bd4b39b

Update examples

README.md CHANGED
````diff
@@ -330,59 +330,94 @@ After obtaining the Phi-4-Mini-MM-Instruct model checkpoints, users can use this
 import requests
 import torch
 import os
+import io
 from PIL import Image
-import soundfile
-from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+import soundfile as sf
+from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
 
-
+# Define model path
+model_path = "microsoft/Phi-4-multimodal-instruct"
 
+# Load model and processor
+processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(
-    
+    model_path, 
     device_map="cuda", 
     torch_dtype="auto", 
     trust_remote_code=True, 
-    
+    attn_implementation='flash_attention_2',
 ).cuda()
 
-
+# Load generation config
+generation_config = GenerationConfig.from_pretrained(model_path)
 
+# Define prompt structure
 user_prompt = '<|user|>'
 assistant_prompt = '<|assistant|>'
 prompt_suffix = '<|end|>'
 
+# Part 1: Image Processing
+print("\n--- IMAGE PROCESSING ---")
+image_url = 'https://www.ilankelman.org/stopsigns/australia.jpg'
 prompt = f'{user_prompt}<|image_1|>What is shown in this image?{prompt_suffix}{assistant_prompt}'
-url = 'https://www.ilankelman.org/stopsigns/australia.jpg'
 print(f'>>> Prompt\n{prompt}')
-
+
+# Download and open image
+image = Image.open(requests.get(image_url, stream=True).raw)
 inputs = processor(text=prompt, images=image, return_tensors='pt').to('cuda:0')
+
+# Generate response
 generate_ids = model.generate(
     **inputs,
     max_new_tokens=1000,
     generation_config=generation_config,
 )
-generate_ids = generate_ids[:, inputs['input_ids'].shape[1]
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
 response = processor.batch_decode(
     generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
 )[0]
 print(f'>>> Response\n{response}')
 
-
+# Part 2: Audio Processing
+print("\n--- AUDIO PROCESSING ---")
+audio_url = "https://voiceage.com/wbsamples/in_mono/Trailer.wav"
 speech_prompt = "Transcribe the audio to text, and then translate the audio to French. Use <sep> as a separator between the original transcript and the translation."
 prompt = f'{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}'
-
 print(f'>>> Prompt\n{prompt}')
-
-
-
-
-    
-    
-)
-
-
-    
-)
-
+
+# Download audio file
+audio_response = requests.get(audio_url)
+if audio_response.status_code == 200:
+    # First save audio to a temporary file
+    temp_audio_path = "temp_audio.wav"
+    with open(temp_audio_path, "wb") as f:
+        f.write(audio_response.content)
+
+    # Read using soundfile
+    audio, samplerate = sf.read(temp_audio_path)
+
+    # Process with the model
+    inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to('cuda:0')
+
+    generate_ids = model.generate(
+        **inputs,
+        max_new_tokens=1000,
+        generation_config=generation_config,
+    )
+    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+    response = processor.batch_decode(
+        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0]
+    print(f'>>> Response\n{response}')
+
+    # Clean up
+    try:
+        os.remove(temp_audio_path)
+        print(f"Temporary file {temp_audio_path} removed successfully")
+    except Exception as e:
+        print(f"Error removing temporary file: {e}")
+else:
+    print(f"Failed to download audio file: {audio_response.status_code}")
 ```
 
 ## Responsible AI Considerations
````
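Note on the new `attn_implementation='flash_attention_2'` argument: it requires the optional flash-attn package and a GPU that supports it. As a hedged sketch (not part of this commit), Transformers' `from_pretrained` also accepts `'eager'` and `'sdpa'` for this argument, so loading on hardware without FlashAttention-2 might look like:

```python
from transformers import AutoModelForCausalLM

model_path = "microsoft/Phi-4-multimodal-instruct"

# Sketch under the assumption that flash-attn is unavailable:
# 'eager' is the standard no-extra-dependency attention implementation
# accepted by transformers, and 'sdpa' is another common fallback.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
    attn_implementation='eager',
).cuda()
```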
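The diff also adds `import io`, but the committed audio path still writes `temp_audio.wav` to disk and removes it afterwards. Since `soundfile` can read from file-like objects, a minimal sketch (same `audio_url` and `sf` alias as in the commit) that decodes the download in memory:

```python
import io

import requests
import soundfile as sf

audio_url = "https://voiceage.com/wbsamples/in_mono/Trailer.wav"

# Download the clip and decode it straight from memory; no temp file to clean up.
audio_response = requests.get(audio_url)
audio_response.raise_for_status()
audio, samplerate = sf.read(io.BytesIO(audio_response.content))
```

The resulting `(audio, samplerate)` tuple can then be passed to the processor exactly as in the committed example.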
