Spaces:

Staticaliza
/

Sense

Paused

App Files Community

Staticaliza committed on May 28

Commit

3e7bef2

verified ·

1 Parent(s): c81c545

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -12

app.py CHANGED Viewed

@@ -36,7 +36,7 @@ footer {
 }
 '''
-global_instruction = "Describe the given content with as much keywords and always take a guess."
 input_prefixes = {
     "Image": "A image file called █ has been attached, describe the image content.",
@@ -69,7 +69,11 @@ def frames_from_video(path):
 def audio_from_video(path):
     clip = VideoFileClip(path)
     with tempfile.NamedTemporaryFile(suffix = ".wav", delete = True) as tmp:
-        clip.audio.write_audiofile(tmp.name, codec = "pcm_s16le", fps = AUDIO_SR, verbose = False, logger = None)
         audio_np, _ = librosa.load(tmp.name, sr = AUDIO_SR, mono = True)
     clip.close()
     return audio_np
@@ -79,16 +83,23 @@ def load_audio(path):
     return audio_np
 def build_video_omni(path, instruction):
-    frames   = frames_from_video(path)
-    audio    = audio_from_video(path)
     contents = [instruction]
-    total    = max(len(frames), math.ceil(len(audio) / AUDIO_SR))
-    for i in range(total):
         frame = frames[i] if i < len(frames) else frames[-1]
-        chunk = audio[AUDIO_SR * i : AUDIO_SR * (i + 1)]
         contents.extend(["<unit>", frame, chunk])
-    return contents
 def build_image_omni(path, instruction):
     image = Image.open(path).convert("RGB")
     return [instruction, image]
@@ -127,10 +138,8 @@ def generate(input,
         "Audio": build_audio_omni
     }
-    instruction = f"{global_instruction}\n{prefix}\n{instruction}"
-    omni_content = builder_map[filetype](input, instruction)
-    sys_msg      = repo.get_sys_prompt(mode = "omni", language = "en")
-    msgs         = [sys_msg, { "role": "user", "content": omni_content }]
     print(msgs)

 }
 '''
+global_instruction = "You will analyze video, audio and text input and output your description of the given content with as much keywords and always take a guess."
 input_prefixes = {
     "Image": "A image file called █ has been attached, describe the image content.",
 def audio_from_video(path):
     clip = VideoFileClip(path)
     with tempfile.NamedTemporaryFile(suffix = ".wav", delete = True) as tmp:
+        clip.audio.write_audiofile(tmp.name,
+                                   codec = "pcm_s16le",
+                                   fps   = AUDIO_SR,
+                                   verbose = False,
+                                   logger  = None)
         audio_np, _ = librosa.load(tmp.name, sr = AUDIO_SR, mono = True)
     clip.close()
     return audio_np
     return audio_np
 def build_video_omni(path, instruction):
+    frames = frames_from_video(path)
+    audio  = audio_from_video(path)
     contents = [instruction]
+    audio_secs  = math.ceil(len(audio) / AUDIO_SR)
+    total_units = max(1, min(len(frames), audio_secs))
+    for i in range(total_units):
         frame = frames[i] if i < len(frames) else frames[-1]
+        start = i * AUDIO_SR
+        end   = min((i + 1) * AUDIO_SR, len(audio))
+        chunk = audio[start:end]
+        if chunk.size == 0: break
         contents.extend(["<unit>", frame, chunk])
+    return contents
 def build_image_omni(path, instruction):
     image = Image.open(path).convert("RGB")
     return [instruction, image]
         "Audio": build_audio_omni
     }
+    instruction = f"{prefix}\n{instruction}"
+    msgs         = [{ "role": "user", "content": global_instruction }, { "role": "user", "content": omni_content }]
     print(msgs)