Spaces:
Build error
Build error
| import gradio as gr | |
| import time | |
| import whisper | |
| import torch | |
| from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler | |
# Loaded Whisper models keyed by model name, so the weights are read once per
# process instead of once per request.
_WHISPER_MODELS = {}


def _get_whisper_model(name="base"):
    """Return the Whisper model called *name*, loading it on first use."""
    if name not in _WHISPER_MODELS:
        _WHISPER_MODELS[name] = whisper.load_model(name)
    return _WHISPER_MODELS[name]


def SpeechToText(audio):
    """Transcribe a recorded audio clip to text with OpenAI Whisper.

    Args:
        audio: Path of the audio file to transcribe, or ``None`` when nothing
            was recorded.

    Returns:
        The decoded text, or ``""`` when *audio* is ``None``.
    """
    if audio is None:
        return ""

    model = _get_whisper_model("base")

    # Load the clip and pad/trim it to the fixed-length window Whisper decodes.
    waveform = whisper.pad_or_trim(whisper.load_audio(audio))

    # Make the log-Mel spectrogram and move it to the same device as the model.
    mel = whisper.log_mel_spectrogram(waveform).to(model.device)

    # No language is set in the options, so whisper.decode detects the spoken
    # language itself; a separate detect_language() pass is not needed.
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    return result.text
# Stable Diffusion pipelines keyed by (model id, device). Building a pipeline
# loads the model weights, so it is done once and reused across requests.
_SD_PIPELINES = {}


def _get_sd_pipeline(model_id):
    """Return a ready-to-use Stable Diffusion pipeline for *model_id*, built once."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    cache_key = (model_id, device)
    if cache_key not in _SD_PIPELINES:
        # Use the Euler scheduler instead of the pipeline's default one.
        scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
        if device == "cuda":
            # Half-precision weights are only requested when a GPU is present.
            pipe = StableDiffusionPipeline.from_pretrained(
                model_id,
                scheduler=scheduler,
                revision="fp16",
                torch_dtype=torch.float16,
            )
        else:
            # CPU-only host: load the default-precision weights instead of
            # failing on the unconditional move to "cuda".
            pipe = StableDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler)
        _SD_PIPELINES[cache_key] = pipe.to(device)
    return _SD_PIPELINES[cache_key]


def img_Generation(text):
    """Generate one image from a text prompt with Stable Diffusion 2.

    Args:
        text: The prompt to render.

    Returns:
        The first image produced by the pipeline for *text*.
    """
    print(text)
    # "stabilityai/stable-diffusion-2-1" was noted in the original as an
    # alternative checkpoint.
    model_id = "stabilityai/stable-diffusion-2"
    pipe = _get_sd_pipeline(model_id)
    image = pipe(text, num_inference_steps=80).images[0]
    return image
def transcribe(audio):
    """Turn a spoken prompt into an image.

    The audio is transcribed with ``SpeechToText`` and the resulting text is
    passed to ``img_Generation``.

    Args:
        audio: Path of the recorded audio file, or ``None`` when nothing was
            recorded.

    Returns:
        The generated image, or ``None`` when there is no text to render.
    """
    text = SpeechToText(audio)
    if not text or not text.strip():
        # Nothing was recorded or recognised: skip the expensive diffusion run
        # on an empty prompt and leave the image output empty.
        return None
    return img_Generation(text)
# Gradio UI: record speech from the microphone, show the generated image.
# NOTE(review): newer Gradio releases may expect `sources=[...]` instead of
# `source=` on gr.Audio — confirm against the installed version.
microphone_input = gr.Audio(source="microphone", type="filepath")

app = gr.Interface(
    fn=transcribe,
    inputs=microphone_input,
    outputs="image",
    description="A Speech to Image Generation App Using OpenAI's Whisper and Stable Diffusion V.2",
    title="Whisper2IMG",
)
app.launch()