Llama-3.2s-1B-Instruct-v0.1

Sleeping

App Files Files Community

QuietImpostor committed on Sep 27, 2024

Commit

cd51b0f

verified ·

1 Parent(s): cc0fe39

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -40

app.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import gradio as gr
 import torch
-import spaces
 import torchaudio
 from whisperspeech.vq_stoks import RQBottleneckTransformer
 from encodec.utils import convert_audio
@@ -8,13 +10,10 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
 from threading import Thread
 import logging
-import os
-from generate_audio import (
-    TTSProcessor,
-)
 import uuid
-device = "cpu"  # Change this to always use CPU
 vq_model = RQBottleneckTransformer.load_model(
         "whisper-vq-stoks-medium-en+pl-fixed.model"
     ).to(device)
@@ -30,12 +29,11 @@ if use_8bit:
         llm_int8_has_fp16_weight=False,
     )
 else:
-    model_kwargs["torch_dtype"] = torch.float32  # Change this to use float32 on CPU
 model = AutoModelForCausalLM.from_pretrained(llm_path, **model_kwargs).to(device)
-@spaces.CPU  # Change this to use CPU
 def audio_to_sound_tokens_whisperspeech(audio_path):
-    vq_model.ensure_whisper(device)  # Change this to use the defined device
     wav, sr = torchaudio.load(audio_path)
     if sr != 16000:
         wav = torchaudio.functional.resample(wav, sr, 16000)
@@ -46,9 +44,8 @@ def audio_to_sound_tokens_whisperspeech(audio_path):
     result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
     return f'<|sound_start|>{result}<|sound_end|>'
-@spaces.CPU  # Change this to use CPU
 def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
-    vq_model.ensure_whisper(device)  # Change this to use the defined device
     wav, sr = torchaudio.load(audio_path)
     if sr != 16000:
         wav = torchaudio.functional.resample(wav, sr, 16000)
@@ -59,53 +56,50 @@ def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
     result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
     return f'<|reserved_special_token_69|><|sound_start|>{result}<|sound_end|>'
-@spaces.CPU  # Change this to use CPU
 def text_to_audio_file(text):
     id = str(uuid.uuid4())
     temp_file = f"./user_audio/{id}_temp_audio.wav"
-    text = text
     text_split = "_".join(text.lower().split(" "))
     if text_split[-1] == ".":
         text_split = text_split[:-1]
-    tts = TTSProcessor(device)  # Change this to use the defined device
     tts.convert_text_to_audio_file(text, temp_file)
     print(f"Saved audio to {temp_file}")
     return temp_file
-@spaces.CPU
 def process_input(audio_file=None):
     for partial_message in process_audio(audio_file):
         yield partial_message
-@spaces.CPU
 def process_transcribe_input(audio_file=None):
     for partial_message in process_audio(audio_file, transcript=True):
         yield partial_message
 class StopOnTokens(StoppingCriteria):
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-        # encode </s> token
-        stop_ids = [tokenizer.eos_token_id, 128009]  # Adjust this based on your model's tokenizer
         for stop_id in stop_ids:
             if input_ids[0][-1] == stop_id:
                 return True
         return False
-@spaces.CPU
 def process_audio(audio_file, transcript=False):
     if audio_file is None:
-            raise ValueError("No audio file provided")
     logging.info(f"Audio file received: {audio_file}")
     logging.info(f"Audio file type: {type(audio_file)}")
-    sound_tokens = audio_to_sound_tokens_whisperspeech_transcribe(audio_file)  if transcript else audio_to_sound_tokens_whisperspeech(audio_file)
     logging.info("Sound tokens generated successfully")
-    # logging.info(f"audio_file: {audio_file.name}")
     messages = [
         {"role": "user", "content": sound_tokens},
     ]
@@ -115,7 +109,7 @@ def process_audio(audio_file, transcript=False):
     input_ids = tokenizer.encode(input_str, return_tensors="pt")
     input_ids = input_ids.to(model.device)
-    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(
         input_ids=input_ids,
         streamer=streamer,
@@ -134,10 +128,7 @@ def process_audio(audio_file, transcript=False):
             break
         partial_message = partial_message.replace("assistant\n\n", "")
         yield partial_message
-# def stop_generation():
-#     # This is a placeholder. Implement actual stopping logic here if needed.
-#     return "Generation stopped.", gr.Button.update(interactive=False)
-# take all the examples from the examples folder
 good_examples = []
 for file in os.listdir("./examples"):
     if file.endswith(".wav"):
@@ -149,6 +140,7 @@ for file in os.listdir("./bad_examples"):
 examples = []
 examples.extend(good_examples)
 examples.extend(bad_examples)
 with gr.Blocks() as iface:
     gr.Markdown("# Llama3.1-S: checkpoint Aug 19, 2024")
     gr.Markdown("Enter text to convert to audio, then submit the audio to generate text or Upload Audio")
@@ -158,8 +150,7 @@ with gr.Blocks() as iface:
         input_type = gr.Radio(["text", "audio"], label="Input Type", value="audio")
         text_input = gr.Textbox(label="Text Input", visible=False)
         audio_input = gr.Audio(label="Audio", type="filepath", visible=True)
-        # audio_output = gr.Audio(label="Converted Audio", type="filepath", visible=False)
     convert_button = gr.Button("Make synthetic audio", visible=False)
     submit_button = gr.Button("Chat with AI using audio")
     transcrip_button = gr.Button("Make Model transcribe the audio")
@@ -169,11 +160,11 @@ with gr.Blocks() as iface:
     def update_visibility(input_type):
         return (gr.update(visible=input_type == "text"),
                 gr.update(visible=input_type == "text"))
     def convert_and_display(text):
         audio_file = text_to_audio_file(text)
         return audio_file
-    def process_example(file_path):
-        return update_visibility("audio")
     input_type.change(
         update_visibility,
         inputs=[input_type],
@@ -198,7 +189,6 @@ with gr.Blocks() as iface:
     )
     gr.Examples(examples, inputs=[audio_input])
 iface.queue()
-iface.launch()
-# launch locally
-# iface.launch(server_name="0.0.0.0")

+import os
+os.environ['NUMPY_EXPERIMENTAL_ARRAY_FUNCTION'] = '0'
 import gradio as gr
 import torch
 import torchaudio
 from whisperspeech.vq_stoks import RQBottleneckTransformer
 from encodec.utils import convert_audio
 from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
 from threading import Thread
 import logging
+from generate_audio import TTSProcessor
 import uuid
+device = "cpu"
 vq_model = RQBottleneckTransformer.load_model(
         "whisper-vq-stoks-medium-en+pl-fixed.model"
     ).to(device)
         llm_int8_has_fp16_weight=False,
     )
 else:
+    model_kwargs["torch_dtype"] = torch.float32
 model = AutoModelForCausalLM.from_pretrained(llm_path, **model_kwargs).to(device)
 def audio_to_sound_tokens_whisperspeech(audio_path):
+    vq_model.ensure_whisper(device)
     wav, sr = torchaudio.load(audio_path)
     if sr != 16000:
         wav = torchaudio.functional.resample(wav, sr, 16000)
     result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
     return f'<|sound_start|>{result}<|sound_end|>'
 def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
+    vq_model.ensure_whisper(device)
     wav, sr = torchaudio.load(audio_path)
     if sr != 16000:
         wav = torchaudio.functional.resample(wav, sr, 16000)
     result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
     return f'<|reserved_special_token_69|><|sound_start|>{result}<|sound_end|>'
 def text_to_audio_file(text):
     """Synthesize ``text`` to a WAV file and return the file's path.

     A new ``TTSProcessor`` is built for the module-level ``device`` and asked
     to write the audio to a uniquely named file under ``./user_audio``.

     Args:
         text: The text to convert to speech.

     Returns:
         The path of the generated ``.wav`` file.

     Raises:
         ValueError: If ``text`` is empty.
     """
     if not text:
         # Empty input used to fail with an opaque IndexError while indexing
         # an empty string; report the actual problem instead.
         raise ValueError("No text provided")
     # Create the output directory up front so a fresh checkout does not fail
     # on the first request.
     os.makedirs("./user_audio", exist_ok=True)
     # A random UUID gives every call its own file name.
     file_id = str(uuid.uuid4())
     temp_file = f"./user_audio/{file_id}_temp_audio.wav"
     tts = TTSProcessor(device)
     tts.convert_text_to_audio_file(text, temp_file)
     print(f"Saved audio to {temp_file}")
     return temp_file
def run_on_cpu(func):
    """Pass-through decorator: the wrapped callable runs exactly as ``func`` does.

    The wrapper keeps ``func``'s metadata and, importantly, its *kind*: when
    ``func`` is a generator function the returned wrapper is a generator
    function too. Code that uses ``inspect.isgeneratorfunction`` to decide
    whether a handler streams its output (NOTE(review): Gradio event handlers
    are believed to be detected this way -- confirm) would otherwise see a
    plain function that merely returns a generator object.

    Args:
        func: The function or generator function to wrap.

    Returns:
        A wrapper with the same call signature and results as ``func``.
    """
    # Local imports keep this helper self-contained.
    import functools
    import inspect

    if inspect.isgeneratorfunction(func):
        @functools.wraps(func)
        def generator_wrapper(*args, **kwargs):
            # Delegate item by item so the wrapper itself stays a generator.
            yield from func(*args, **kwargs)
        return generator_wrapper

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs)
    return wrapper
@run_on_cpu
def process_input(audio_file=None):
    """Yield every partial message that ``process_audio`` produces for ``audio_file``."""
    yield from process_audio(audio_file)
@run_on_cpu
def process_transcribe_input(audio_file=None):
    """Yield every partial message from ``process_audio`` run with ``transcript=True``."""
    yield from process_audio(audio_file, transcript=True)
 class StopOnTokens(StoppingCriteria):
     """Stopping criterion that fires when the latest token is a stop token."""

     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
         # Only the most recent token of the first sequence is inspected.
         newest_token = input_ids[0][-1]
         # 128009 is hard-coded alongside the tokenizer's EOS id; presumably it
         # is the model's end-of-turn token -- TODO confirm against the tokenizer.
         stop_ids = (tokenizer.eos_token_id, 128009)
         return any(newest_token == stop_id for stop_id in stop_ids)
 def process_audio(audio_file, transcript=False):
     if audio_file is None:
+        raise ValueError("No audio file provided")
     logging.info(f"Audio file received: {audio_file}")
     logging.info(f"Audio file type: {type(audio_file)}")
+    sound_tokens = audio_to_sound_tokens_whisperspeech_transcribe(audio_file) if transcript else audio_to_sound_tokens_whisperspeech(audio_file)
     logging.info("Sound tokens generated successfully")
     messages = [
         {"role": "user", "content": sound_tokens},
     ]
     input_ids = tokenizer.encode(input_str, return_tensors="pt")
     input_ids = input_ids.to(model.device)
+    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(
         input_ids=input_ids,
         streamer=streamer,
             break
         partial_message = partial_message.replace("assistant\n\n", "")
         yield partial_message
 good_examples = []
 for file in os.listdir("./examples"):
     if file.endswith(".wav"):
 examples = []
 examples.extend(good_examples)
 examples.extend(bad_examples)
 with gr.Blocks() as iface:
     gr.Markdown("# Llama3.1-S: checkpoint Aug 19, 2024")
     gr.Markdown("Enter text to convert to audio, then submit the audio to generate text or Upload Audio")
         input_type = gr.Radio(["text", "audio"], label="Input Type", value="audio")
         text_input = gr.Textbox(label="Text Input", visible=False)
         audio_input = gr.Audio(label="Audio", type="filepath", visible=True)
     convert_button = gr.Button("Make synthetic audio", visible=False)
     submit_button = gr.Button("Chat with AI using audio")
     transcrip_button = gr.Button("Make Model transcribe the audio")
     def update_visibility(input_type):
         """Return two visibility updates, both visible only when ``input_type`` is "text"."""
         show_for_text = input_type == "text"
         first_update = gr.update(visible=show_for_text)
         second_update = gr.update(visible=show_for_text)
         return (first_update, second_update)
     def convert_and_display(text):
         """Convert ``text`` to speech and return the path of the audio file."""
         return text_to_audio_file(text)
     input_type.change(
         update_visibility,
         inputs=[input_type],
     )
     gr.Examples(examples, inputs=[audio_input])
 iface.queue()
+iface.launch()