added technical report
Browse files- .gitattributes +1 -0
- README.md +31 -42
- phi_4_mm.tech_report.02252025.pdf +3 -0
    	
        .gitattributes
    CHANGED
    
    | @@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text | |
| 34 | 
             
            *.zst filter=lfs diff=lfs merge=lfs -text
         | 
| 35 | 
             
            *tfevents* filter=lfs diff=lfs merge=lfs -text
         | 
| 36 | 
             
            tokenizer.json filter=lfs diff=lfs merge=lfs -text
         | 
|  | 
|  | |
| 34 | 
             
            *.zst filter=lfs diff=lfs merge=lfs -text
         | 
| 35 | 
             
            *tfevents* filter=lfs diff=lfs merge=lfs -text
         | 
| 36 | 
             
            tokenizer.json filter=lfs diff=lfs merge=lfs -text
         | 
| 37 | 
            +
            *.pdf filter=lfs diff=lfs merge=lfs -text
         | 
    	
        README.md
    CHANGED
    
    | @@ -42,13 +42,13 @@ Polish, Portuguese, Russian, Spanish, Swedish, Thai, Turkish, Ukrainian | |
| 42 | 
             
            - Vision: English
         | 
| 43 | 
             
            - Audio: English, Chinese, German, French, Italian, Japanese, Spanish, Portuguese
         | 
| 44 |  | 
| 45 | 
            -
            💡 [Phi-4-multimodal Portal]() <br>
         | 
| 46 | 
            -
            📰 [Phi-4-multimodal Microsoft Blog]() <br>
         | 
| 47 | 
            -
            📖 [Phi-4-multimodal Technical Report]() <br>
         | 
| 48 | 
            -
            👩‍🍳 [Phi-4-multimodal Cookbook]() <br>
         | 
| 49 | 
             
            🖥️ [Try It](https://aka.ms/try-phi4mm) <br>
         | 
| 50 |  | 
| 51 | 
            -
            **Phi-4**: [[multimodal-instruct](https://huggingface.co/microsoft/Phi- | 
| 52 |  | 
| 53 | 
             
            ## Intended Uses
         | 
| 54 |  | 
| @@ -218,10 +218,14 @@ torch==2.6.0 | |
| 218 | 
             
            transformers==4.48.2
         | 
| 219 | 
             
            accelerate==1.3.0
         | 
| 220 | 
             
            soundfile==0.13.1
         | 
| 221 | 
            -
            pillow== | 
|  | |
|  | |
|  | |
|  | |
| 222 | 
             
            ```
         | 
| 223 |  | 
| 224 | 
            -
            Phi-4-multimodal-instruct is also available in [Azure AI Studio]()
         | 
| 225 |  | 
| 226 | 
             
            ### Tokenizer
         | 
| 227 |  | 
| @@ -324,7 +328,7 @@ If it is a square image, the resolution would be around (8*448 by 8*448). For mu | |
| 324 |  | 
| 325 | 
             
            ### Loading the model locally
         | 
| 326 |  | 
| 327 | 
            -
            After obtaining the Phi-4- | 
| 328 |  | 
| 329 | 
             
            ```python
         | 
| 330 | 
             
            import requests
         | 
| @@ -334,6 +338,8 @@ import io | |
| 334 | 
             
            from PIL import Image
         | 
| 335 | 
             
            import soundfile as sf
         | 
| 336 | 
             
            from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
         | 
|  | |
|  | |
| 337 |  | 
| 338 | 
             
            # Define model path
         | 
| 339 | 
             
            model_path = "microsoft/Phi-4-multimodal-instruct"
         | 
| @@ -380,44 +386,27 @@ print(f'>>> Response\n{response}') | |
| 380 |  | 
| 381 | 
             
            # Part 2: Audio Processing
         | 
| 382 | 
             
            print("\n--- AUDIO PROCESSING ---")
         | 
| 383 | 
            -
            audio_url = "https:// | 
| 384 | 
             
            speech_prompt = "Transcribe the audio to text, and then translate the audio to French. Use <sep> as a separator between the original transcript and the translation."
         | 
| 385 | 
             
            prompt = f'{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}'
         | 
| 386 | 
             
            print(f'>>> Prompt\n{prompt}')
         | 
| 387 |  | 
| 388 | 
            -
            #  | 
| 389 | 
            -
             | 
| 390 | 
            -
             | 
| 391 | 
            -
             | 
| 392 | 
            -
             | 
| 393 | 
            -
             | 
| 394 | 
            -
             | 
| 395 | 
            -
                
         | 
| 396 | 
            -
                 | 
| 397 | 
            -
                 | 
| 398 | 
            -
             | 
| 399 | 
            -
             | 
| 400 | 
            -
             | 
| 401 | 
            -
                
         | 
| 402 | 
            -
             | 
| 403 | 
            -
             | 
| 404 | 
            -
                    max_new_tokens=1000,
         | 
| 405 | 
            -
                    generation_config=generation_config,
         | 
| 406 | 
            -
                )
         | 
| 407 | 
            -
                generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
         | 
| 408 | 
            -
                response = processor.batch_decode(
         | 
| 409 | 
            -
                    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
         | 
| 410 | 
            -
                )[0]
         | 
| 411 | 
            -
                print(f'>>> Response\n{response}')
         | 
| 412 | 
            -
                
         | 
| 413 | 
            -
                # Clean up
         | 
| 414 | 
            -
                try:
         | 
| 415 | 
            -
                    os.remove(temp_audio_path)
         | 
| 416 | 
            -
                    print(f"Temporary file {temp_audio_path} removed successfully")
         | 
| 417 | 
            -
                except Exception as e:
         | 
| 418 | 
            -
                    print(f"Error removing temporary file: {e}")
         | 
| 419 | 
            -
            else:
         | 
| 420 | 
            -
                print(f"Failed to download audio file: {audio_response.status_code}")
         | 
| 421 | 
             
            ```
         | 
| 422 |  | 
| 423 | 
             
            ## Responsible AI Considerations
         | 
|  | |
| 42 | 
             
            - Vision: English
         | 
| 43 | 
             
            - Audio: English, Chinese, German, French, Italian, Japanese, Spanish, Portuguese
         | 
| 44 |  | 
| 45 | 
            +
            💡 [Phi-4-multimodal Portal](https://aka.ms/phi-4-multimodal/azure) <br>
         | 
| 46 | 
            +
            📰 [Phi-4-multimodal Microsoft Blog](https://aka.ms/phi4techblog-feb2025) <br>
         | 
| 47 | 
            +
            📖 [Phi-4-multimodal Technical Report](https://aka.ms/phi-4-multimodal/techreport) <br>
         | 
| 48 | 
            +
            👩‍🍳 [Phi-4-multimodal Cookbook](https://github.com/microsoft/PhiCookBook) <br>
         | 
| 49 | 
             
            🖥️ [Try It](https://aka.ms/try-phi4mm) <br>
         | 
| 50 |  | 
| 51 | 
            +
            **Phi-4**: [[multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) | [onnx](https://huggingface.co/microsoft/Phi-4-multimodal-instruct)]; [[mini-instruct]](https://huggingface.co/microsoft/Phi-4-mini-instruct);
         | 
| 52 |  | 
| 53 | 
             
            ## Intended Uses
         | 
| 54 |  | 
|  | |
| 218 | 
             
            transformers==4.48.2
         | 
| 219 | 
             
            accelerate==1.3.0
         | 
| 220 | 
             
            soundfile==0.13.1
         | 
| 221 | 
            +
            pillow==11.1.0
         | 
| 222 | 
            +
            scipy==1.15.2
         | 
| 223 | 
            +
            torchvision==0.21.0
         | 
| 224 | 
            +
            backoff==2.2.1
         | 
| 225 | 
            +
            peft==0.13.2
         | 
| 226 | 
             
            ```
         | 
| 227 |  | 
| 228 | 
            +
            Phi-4-multimodal-instruct is also available in [Azure AI Studio](https://aka.ms/phi-4-multimodal/azure)
         | 
| 229 |  | 
| 230 | 
             
            ### Tokenizer
         | 
| 231 |  | 
|  | |
| 328 |  | 
| 329 | 
             
            ### Loading the model locally
         | 
| 330 |  | 
| 331 | 
            +
            After obtaining the Phi-4-multimodal-instruct model checkpoints, users can use this sample code for inference.
         | 
| 332 |  | 
| 333 | 
             
            ```python
         | 
| 334 | 
             
            import requests
         | 
|  | |
| 338 | 
             
            from PIL import Image
         | 
| 339 | 
             
            import soundfile as sf
         | 
| 340 | 
             
            from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
         | 
| 341 | 
            +
            from urllib.request import urlopen
         | 
| 342 | 
            +
             | 
| 343 |  | 
| 344 | 
             
            # Define model path
         | 
| 345 | 
             
            model_path = "microsoft/Phi-4-multimodal-instruct"
         | 
|  | |
| 386 |  | 
| 387 | 
             
            # Part 2: Audio Processing
         | 
| 388 | 
             
            print("\n--- AUDIO PROCESSING ---")
         | 
| 389 | 
            +
            audio_url = "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac"
         | 
| 390 | 
             
            speech_prompt = "Transcribe the audio to text, and then translate the audio to French. Use <sep> as a separator between the original transcript and the translation."
         | 
| 391 | 
             
            prompt = f'{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}'
         | 
| 392 | 
             
            print(f'>>> Prompt\n{prompt}')
         | 
| 393 |  | 
| 394 | 
            +
            # Download and open audio file
         | 
| 395 | 
            +
            audio, samplerate = sf.read(io.BytesIO(urlopen(audio_url).read()))
         | 
| 396 | 
            +
             | 
| 397 | 
            +
            # Process with the model
         | 
| 398 | 
            +
            inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to('cuda:0')
         | 
| 399 | 
            +
             | 
| 400 | 
            +
            generate_ids = model.generate(
         | 
| 401 | 
            +
                **inputs,
         | 
| 402 | 
            +
                max_new_tokens=1000,
         | 
| 403 | 
            +
                generation_config=generation_config,
         | 
| 404 | 
            +
            )
         | 
| 405 | 
            +
            generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
         | 
| 406 | 
            +
            response = processor.batch_decode(
         | 
| 407 | 
            +
                generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
         | 
| 408 | 
            +
            )[0]
         | 
| 409 | 
            +
            print(f'>>> Response\n{response}')
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 410 | 
             
            ```
         | 
| 411 |  | 
| 412 | 
             
            ## Responsible AI Considerations
         | 
    	
        phi_4_mm.tech_report.02252025.pdf
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:a5469d9123cbee2b41729db3217cacfeaa96eaf543868caa2eeec7cf2d24547d
         | 
| 3 | 
            +
            size 5295165
         | 
