Gijs Wijngaard committed on
Commit
b31d71d
·
1 Parent(s): 27a9c0b
Files changed (2) hide show
  1. app.py +7 -7
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,7 +1,9 @@
1
  import gradio as gr
2
- import soundfile as sf
3
  import torch
4
  from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
 
 
 
5
 
6
 
7
  MODEL_ID = "Qwen/Qwen2.5-Omni-7B" if False else "Qwen/Qwen2.5-Omni-7B" # keep explicit string
@@ -15,7 +17,7 @@ model.disable_talker()
15
 
16
  processor = Qwen2_5OmniProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
17
 
18
-
19
  def run_omni(audio_path: str, instruction: str, max_tokens: int = 512) -> str:
20
  if not audio_path:
21
  return "Please upload an audio file."
@@ -39,15 +41,13 @@ def run_omni(audio_path: str, instruction: str, max_tokens: int = 512) -> str:
39
  ]
40
 
41
  text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
42
-
43
- audio, sr = sf.read(audio_path)
44
- audios = [(audio, sr)]
45
 
46
  inputs = processor(
47
  text=text,
48
  audio=audios,
49
- images=[],
50
- videos=[],
51
  return_tensors="pt",
52
  padding=True,
53
  )
 
1
  import gradio as gr
 
2
  import torch
3
  from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
4
+ from qwen_omni_utils import process_mm_info
5
+
6
+ import spaces
7
 
8
 
9
  MODEL_ID = "Qwen/Qwen2.5-Omni-7B" if False else "Qwen/Qwen2.5-Omni-7B" # keep explicit string
 
17
 
18
  processor = Qwen2_5OmniProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
19
 
20
+ @spaces.GPU
21
  def run_omni(audio_path: str, instruction: str, max_tokens: int = 512) -> str:
22
  if not audio_path:
23
  return "Please upload an audio file."
 
41
  ]
42
 
43
  text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
44
+ audios, images, videos = process_mm_info(conversation, use_audio_in_video=False)
 
 
45
 
46
  inputs = processor(
47
  text=text,
48
  audio=audios,
49
+ images=images,
50
+ videos=videos,
51
  return_tensors="pt",
52
  padding=True,
53
  )
requirements.txt CHANGED
@@ -3,4 +3,5 @@ torch>=2.1.0
3
  transformers>=4.43.0
4
  accelerate>=0.30.0
5
  soundfile>=0.12.1
6
-
 
 
3
  transformers>=4.43.0
4
  accelerate>=0.30.0
5
  soundfile>=0.12.1
6
+ spaces
7
+ qwen-omni-utils