Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -22,7 +22,7 @@ AUDIO_SR = 16000
|
|
| 22 |
|
| 23 |
model_name = "openbmb/MiniCPM-o-2_6"
|
| 24 |
|
| 25 |
-
repo = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="sdpa", torch_dtype=torch.bfloat16).to(DEVICE)
|
| 26 |
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
| 27 |
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
|
| 28 |
|
|
@@ -60,43 +60,29 @@ filetypes = {
|
|
| 60 |
}
|
| 61 |
|
| 62 |
# Functions
|
| 63 |
-
uniform_sample
|
| 64 |
-
|
| 65 |
-
def build_video(filepath):
|
| 66 |
-
vr = VideoReader(filepath, ctx = cpu(0))
|
| 67 |
-
i = uniform_sample(range(len(vr)), MAX_FRAMES)
|
| 68 |
-
batch = vr.get_batch(i).asnumpy()
|
| 69 |
-
frames = [Image.fromarray(frame.astype("uint8")) for frame in batch]
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
image = Image.open(filepath).convert("RGB")
|
| 89 |
-
return image
|
| 90 |
|
| 91 |
-
def
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
frames = uniform_sample(frames, MAX_FRAMES)
|
| 95 |
-
return frames
|
| 96 |
-
|
| 97 |
-
def build_audio(filepath):
|
| 98 |
-
audio, _ = librosa.load(filepath, sr=AUDIO_SR, mono=True)
|
| 99 |
-
return audio
|
| 100 |
|
| 101 |
@spaces.GPU(duration=30)
|
| 102 |
def generate(filepath, input=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):
|
|
|
|
| 22 |
|
| 23 |
model_name = "openbmb/MiniCPM-o-2_6"
|
| 24 |
|
| 25 |
+
# Load the checkpoint in bfloat16 with SDPA attention and move it to DEVICE.
# The init_* keyword arguments ask for the vision and audio components and
# leave TTS off -- presumably interpreted by the model's remote code; TODO confirm.
repo = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="sdpa", torch_dtype=torch.bfloat16, init_vision=True, init_audio=True, init_tts=False).to(DEVICE)
|
| 26 |
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
| 27 |
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
|
| 28 |
|
|
|
|
| 60 |
}
|
| 61 |
|
| 62 |
# Functions
|
| 63 |
+
def uniform_sample(sequence, n):
    """Return at most ``n`` evenly strided items from ``sequence``.

    The stride is ``len(sequence) // n`` (never less than 1), starting at
    index 0, and the strided result is truncated to ``n`` items. When the
    sequence has ``n`` items or fewer, it is returned as a full slice.

    Args:
        sequence: Any sliceable sequence (list, range, ...).
        n: Maximum number of items to keep.

    Returns:
        A slice of ``sequence`` (same type as slicing it yields) holding at
        most ``n`` items; empty when ``n`` is not positive.
    """
    # A non-positive budget would divide by zero (n == 0) or turn the final
    # truncation into a negative slice bound, so hand back an empty slice.
    if n <= 0:
        return sequence[:0]
    stride = max(len(sequence) // n, 1)
    return sequence[::stride][:n]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
+
def build_image(path):
    """Load the image at ``path`` and return it as a one-element list.

    Args:
        path: Filesystem path of the image file.

    Returns:
        A list containing a single RGB ``PIL.Image``.
    """
    # convert() produces an independent in-memory image, so the source file
    # can be closed as soon as the conversion is done.
    with Image.open(path) as source:
        return [source.convert("RGB")]
|
| 66 |
+
|
| 67 |
+
def build_gif(path):
    """Decode an animated image into a bounded list of RGB frames.

    Args:
        path: Filesystem path of the GIF (or other multi-frame image).

    Returns:
        At most ``MAX_FRAMES`` RGB ``PIL.Image`` frames, chosen by
        ``uniform_sample``.
    """
    # Multi-frame files stay open while their frames are being read; the
    # context manager closes the handle once every frame has been copied out.
    with Image.open(path) as animation:
        frames = [
            frame.copy().convert("RGB")
            for frame in ImageSequence.Iterator(animation)
        ]
    return uniform_sample(frames, MAX_FRAMES)
|
| 70 |
+
|
| 71 |
+
def build_video(path):
    """Turn a video file into a flat list of interleaved frame/audio units.

    Frames are picked with ``uniform_sample`` (at most ``MAX_FRAMES``) and
    the audio track is loaded through ``build_audio``. For the i-th picked
    frame, the i-th ``AUDIO_SR``-sample slice of the waveform is attached.

    Args:
        path: Filesystem path of the video file.

    Returns:
        A flat list of the form ``["<unit>", frame, chunk, "<unit>", ...]``
        where ``frame`` is a ``PIL.Image`` and ``chunk`` is a waveform slice.
        Building stops at the first frame for which no audio is left.
    """
    reader = VideoReader(path, ctx=cpu(0))
    picked = uniform_sample(range(len(reader)), MAX_FRAMES)
    pixels = reader.get_batch(picked).asnumpy()
    images = [Image.fromarray(arr.astype("uint8")) for arr in pixels]
    waveform = build_audio(path)[0]

    # NOTE(review): slice i covers AUDIO_SR samples (one second at the load
    # rate), while frame i comes from uniform_sample's stride; the two only
    # line up in time if that stride matches the video's frame rate -- confirm.
    interleaved = []
    for position, image in enumerate(images):
        start = position * AUDIO_SR
        segment = waveform[start:start + AUDIO_SR]
        if segment.size == 0:
            # The audio track is exhausted; drop the remaining frames.
            break
        interleaved += ["<unit>", image, segment]
    return interleaved
|
|
|
|
|
|
|
| 82 |
|
| 83 |
+
def build_audio(path):
    """Load the audio at ``path`` and return it as a one-element list.

    The file is decoded by ``librosa.load`` as mono at ``AUDIO_SR``; the
    sample rate that ``librosa.load`` also returns is discarded.

    Args:
        path: Filesystem path of the audio (or audio-bearing) file.

    Returns:
        A list containing the single loaded waveform.
    """
    waveform = librosa.load(path, sr=AUDIO_SR, mono=True)[0]
    return [waveform]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
@spaces.GPU(duration=30)
|
| 88 |
def generate(filepath, input=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):
|