Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -39,7 +39,7 @@ footer {
|
|
| 39 |
input_prefixes = {
|
| 40 |
"Image": "(A image file called β has been attached, describe the image content) ",
|
| 41 |
"GIF": "(A GIF file called β has been attached, describe the GIF content) ",
|
| 42 |
-
"Video": "(A video
|
| 43 |
"Audio": "(A audio file called β has been attached, describe the audio content) ",
|
| 44 |
}
|
| 45 |
|
|
@@ -94,42 +94,119 @@ def build_audio_omni(path, prefix, instruction, sr=AUDIO_SR):
|
|
| 94 |
audio_np, _ = librosa.load(path, sr=sr, mono=True)
|
| 95 |
return ["<unit>", audio_np, prefix + instruction]
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
extension = os.path.splitext(input)[1].lower()
|
| 102 |
-
filetype
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
elif filetype == "GIF":
|
| 113 |
-
omni_content = build_gif_omni(input, prefix, instruction)
|
| 114 |
-
elif filetype == "Audio":
|
| 115 |
-
omni_content = build_audio_omni(input, prefix, instruction)
|
| 116 |
-
|
| 117 |
-
sys_msg = repo.get_sys_prompt(mode="omni", language="en")
|
| 118 |
-
msgs = [sys_msg, {"role": "user", "content": omni_content}]
|
| 119 |
-
|
| 120 |
-
params = {
|
| 121 |
-
"msgs": msgs,
|
| 122 |
-
"tokenizer": tokenizer,
|
| 123 |
-
"sampling": sampling,
|
| 124 |
-
"temperature": temperature,
|
| 125 |
-
"top_p": top_p,
|
| 126 |
-
"top_k": top_k,
|
| 127 |
-
"repetition_penalty": repetition_penalty,
|
| 128 |
-
"max_new_tokens": max_tokens,
|
| 129 |
-
"omni_input": True,
|
| 130 |
}
|
| 131 |
-
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
torch.cuda.empty_cache()
|
| 134 |
gc.collect()
|
| 135 |
return output
|
|
|
|
# Per-filetype instruction prefixes prepended to the user's request.
# The "β" placeholder is replaced with the uploaded file's basename
# before the prompt is sent to the model.
input_prefixes = {
    "Image": "(A image file called β has been attached, describe the image content) ",
    "GIF": "(A GIF file called β has been attached, describe the GIF content) ",
    "Video": "(A audio video file called β has been attached, describe the video content and the audio content) ",
    "Audio": "(A audio file called β has been attached, describe the audio content) ",
}
|
| 45 |
|
|
|
|
| 94 |
audio_np, _ = librosa.load(path, sr=sr, mono=True)
|
| 95 |
return ["<unit>", audio_np, prefix + instruction]
|
| 96 |
|
def infer_filetype(ext, mapping=None):
    """Map a lowercase file extension to its filetype key.

    Args:
        ext: extension including the dot, e.g. ".png".
        mapping: optional {filetype: [extensions]} table; defaults to the
            module-level ``filetypes`` table when omitted (backward
            compatible with the original single-argument form).

    Returns:
        The matching key ("Image", "GIF", "Video", "Audio", ...) or None
        when the extension is not listed anywhere.
    """
    # NOTE: the diff rendering had dropped the leading "d" of "def".
    if mapping is None:
        mapping = filetypes
    return next((kind for kind, exts in mapping.items() if ext in exts), None)
| 99 |
+
|
| 100 |
+
|
def uniform_sample(seq, n):
    """Return at most *n* evenly spaced elements of *seq*.

    The stride is ``len(seq) // n`` (never below 1), and the strided
    slice is then truncated to *n* items, so sequences shorter than
    *n* come back unchanged. Works on any sliceable sequence,
    including ``range`` objects.
    """
    stride = len(seq) // n
    if stride < 1:
        stride = 1
    return seq[::stride][:n]
| 104 |
+
|
| 105 |
+
|
def frames_from_video(path):
    """Decode the video at *path* and return up to MAX_FRAMES evenly
    spaced frames as RGB PIL images (decord decodes to RGB arrays)."""
    reader = VideoReader(path, ctx=cpu(0))
    indices = uniform_sample(range(len(reader)), MAX_FRAMES)
    sampled = reader.get_batch(indices).asnumpy()
    return [Image.fromarray(frame.astype("uint8")) for frame in sampled]
| 111 |
+
|
| 112 |
+
|
def audio_from_video(path):
    """Extract the audio track of the video at *path* as a mono float
    waveform sampled at AUDIO_SR.

    Returns:
        A 1-D mono array, or None when the video has no audio track
        (``clip.audio`` is None for silent videos — the original code
        crashed with AttributeError there; downstream
        ``build_omni_input`` already accepts ``audio=None``, see
        build_image_omni).

    The clip is always closed, even if extraction raises.
    """
    clip = VideoFileClip(path)
    try:
        if clip.audio is None:
            return None
        # to_soundarray yields (samples, channels); transpose for to_mono.
        stereo = clip.audio.to_soundarray(fps=AUDIO_SR)
    finally:
        clip.close()
    return librosa.to_mono(stereo.T)
| 118 |
+
|
| 119 |
+
|
def load_audio(path):
    """Load the audio file at *path* as a mono waveform resampled
    to AUDIO_SR; the sample rate librosa reports is discarded."""
    waveform, _sr = librosa.load(path, sr=AUDIO_SR, mono=True)
    return waveform
| 123 |
+
|
| 124 |
+
|
def build_video_omni(path, prefix, instruction):
    """Build the omni model input for a video file: evenly sampled
    frames plus the extracted audio track."""
    return processor.build_omni_input(
        frames=frames_from_video(path),
        audio=audio_from_video(path),
        prefix=prefix,
        instruction=instruction,
        max_frames=MAX_FRAMES,
        sr=AUDIO_SR,
    )
| 136 |
+
|
| 137 |
+
|
def build_image_omni(path, prefix, instruction):
    """Build the omni model input for a single still image (no audio)."""
    rgb = Image.open(path).convert("RGB")
    return processor.build_omni_input(
        frames=[rgb],
        audio=None,
        prefix=prefix,
        instruction=instruction,
    )
| 146 |
+
|
| 147 |
+
|
def build_gif_omni(path, prefix, instruction):
    """Build the omni model input for an animated GIF: decode every
    frame, then keep at most MAX_FRAMES evenly spaced ones (no audio)."""
    gif = Image.open(path)
    decoded = [frame.copy().convert("RGB") for frame in ImageSequence.Iterator(gif)]
    sampled = uniform_sample(decoded, MAX_FRAMES)
    return processor.build_omni_input(
        frames=sampled,
        audio=None,
        prefix=prefix,
        instruction=instruction,
    )
| 158 |
+
|
| 159 |
+
|
def build_audio_omni(path, prefix, instruction):
    """Build the omni model input for a standalone audio file (no frames)."""
    return processor.build_omni_input(
        frames=None,
        audio=load_audio(path),
        prefix=prefix,
        instruction=instruction,
        sr=AUDIO_SR,
    )
| 169 |
+
|
| 170 |
+
|
@spaces.GPU(duration = 60)
def generate(input,
             instruction = DEFAULT_INPUT,
             sampling = False,
             temperature = 0.7,
             top_p = 0.8,
             top_k = 100,
             repetition_penalty = 1.05,
             max_tokens = 512):
    """Describe an uploaded Image/GIF/Video/Audio file with the omni model.

    Returns the model's text output, or a short message when no file was
    given or its extension is unsupported. GPU memory is released after
    each call. (NOTE: the *input* parameter shadows the builtin, but
    renaming it would break keyword callers.)
    """
    if not input:
        return "no input provided."
    ext = os.path.splitext(input)[1].lower()
    kind = infer_filetype(ext)
    if not kind:
        return "unsupported file type."
    name = os.path.basename(input)
    # Substitute the β placeholder with the actual filename.
    user_prefix = input_prefixes[kind].replace("β", name)
    # Dispatch to the per-filetype omni-input builder.
    builders = {
        "Image": build_image_omni,
        "GIF": build_gif_omni,
        "Video": build_video_omni,
        "Audio": build_audio_omni,
    }
    content = builders[kind](input, user_prefix, instruction)
    system_message = repo.get_sys_prompt(mode = "omni", language = "en")
    conversation = [system_message, { "role": "user", "content": content }]
    output = repo.chat(
        msgs = conversation,
        tokenizer = tokenizer,
        sampling = sampling,
        temperature = temperature,
        top_p = top_p,
        top_k = top_k,
        repetition_penalty = repetition_penalty,
        max_new_tokens = max_tokens,
        omni_input = True,
        use_image_id = False,
        max_slice_nums = 2
    )
    # Free GPU memory between Space invocations.
    torch.cuda.empty_cache()
    gc.collect()
    return output
|