import gradio as gr
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
from transformers.image_utils import load_image
from threading import Thread
import time
import torch
import spaces
import cv2
import numpy as np
from PIL import Image
from gradio_client import Client, handle_file
import os

# Initialize TTS client with HF token
hf_token = os.getenv("HF_TOKEN")  # Set your HF token as environment variable
tts_client = Client("dofbi/galsenai-xtts-v2-wolof-inference", hf_token=hf_token)
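# If HF_TOKEN is unset, the client falls back to anonymous access, which may be
# rate-limited or rejected depending on the upstream Space's settings.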

def tts(text):
    """Generate TTS using Gradio API client"""
    try:
        result = tts_client.predict(
            text=text,
            audio_reference=handle_file('https://github.com/gradio-app/gradio/raw/main/test/test_files/audio_sample.wav'),
            api_name="/predict"
        )
        print(f"TTS result: {result}")  # Debug print to see what's returned
        
        # Handle different possible return formats
        if isinstance(result, tuple):
            # If result is a tuple, the audio file might be in the first element
            return result[0] if result else None
        elif isinstance(result, str):
            # If result is a string (file path)
            return result
        elif hasattr(result, 'name'):
            # If result is a file object with a name attribute
            return result.name
        else:
            # Try to return the result as-is
            return result
    except Exception as e:
        print(f"TTS API Error: {e}")
        return None
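
# Illustrative usage (the exact return shape depends on the upstream Space):
#   audio_path = tts("Salaam aleekum")  # typically a local audio file path that gr.Audio can play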


def progress_bar_html(label: str) -> str:
    """
    Returns an HTML snippet for a thin progress bar with a label.
    The progress bar is styled as a dark animated bar.
    """
    return f'''
<div style="display: flex; align-items: center;">
    <span style="margin-right: 10px; font-size: 14px;">{label}</span>
    <div style="width: 110px; height: 5px; background-color: #9370DB; border-radius: 2px; overflow: hidden;">
        <div style="width: 100%; height: 100%; background-color: #4B0082; animation: loading 1.5s linear infinite;"></div>
    </div>
</div>
<style>
@keyframes loading {{
    0% {{ transform: translateX(-100%); }}
    100% {{ transform: translateX(100%); }}
}}
</style>
    '''
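
# NOTE: progress_bar_html is not wired into the UI below; it is kept as a
# ready-made status indicator for streaming responses.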

def downsample_video(video_path):
    """
    Downsamples the video to 10 evenly spaced frames.
    Each frame is converted to a PIL Image along with its timestamp.
    """
    vidcap = cv2.VideoCapture(video_path)
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    frames = []
    if total_frames <= 0 or fps <= 0:
        vidcap.release()
        return frames
    # Sample 10 evenly spaced frames.
    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
    for i in frame_indices:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
        success, image = vidcap.read()
        if success:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(image)
            timestamp = round(i / fps, 2)
            frames.append((pil_image, timestamp))
    vidcap.release()
    return frames
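
# Illustrative result for a hypothetical 30 fps, 300-frame clip:
#   downsample_video("clip.mp4") -> [(<PIL.Image>, 0.0), (<PIL.Image>, 1.1), ..., (<PIL.Image>, 9.97)]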

MODEL_ID = "yaya-sy/chvtr" # "kaamd/chtvctr"  # Alternatively: "Qwen/Qwen2.5-VL-3B-Instruct" 
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True, min_pixels=256*28*28, max_pixels=1280*28*28)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16
).to("cuda").eval()
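
# The processor and model load once at import time; bfloat16 roughly halves
# memory vs. fp32. On Hugging Face ZeroGPU Spaces, @spaces.GPU below marks
# model_inference as the function that needs GPU access.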

@spaces.GPU
def model_inference(input_dict, history):
    """Stream a model response for a text / image / video query.

    Queries prefixed with "@video-infer" are treated as video-understanding
    requests; any other uploaded files are handled as images.
    """
    text = input_dict["text"]
    files = input_dict["files"]

    if text.strip().lower().startswith("@video-infer"):
        # Remove the tag from the query.
        text = text[len("@video-infer"):].strip()
        if not files:
            raise gr.Error("Please upload a video file along with your @video-infer query.")  # Fixed: gr.Error syntax
        # Assume the first file is a video.
        video_path = files[0]
        frames = downsample_video(video_path)
        if not frames:
            raise gr.Error("Could not process video.")  # Fixed: gr.Error syntax
        # Build messages: start with the text prompt.
        messages = [
            # {"role": "system", "content": "Answer clearly to the user's requesst. Please do not use numbers, only letters. If you want to answer with a number, convert it to letters. For example, you should not say 'am an 2 xaj' but 'am an Γ±aari xaj.'"},
            {
                "role": "user",
                "content": [{"type": "text", "text": text}]
            }
        ]
        # Append each frame with a timestamp label.
        for image, timestamp in frames:
            messages[0]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
            messages[0]["content"].append({"type": "image", "image": image})
        # Collect only the images from the frames.
        video_images = [image for image, _ in frames]
        # Prepare the prompt.
        prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(
            text=[prompt],
            images=video_images,
            return_tensors="pt",
            padding=True,
        ).to("cuda")
        # Set up streaming generation.
        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
        # do_sample=True is required for temperature/min_p to take effect.
        generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=128, do_sample=True, temperature=2.0, min_p=0.8)
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()
        buffer = ""
        for new_text in streamer:
            buffer += new_text
            time.sleep(0.001)
            yield buffer
        return  # End of the video branch; do not fall through to the image path.

    if len(files) > 1:
        images = [load_image(image) for image in files]
    elif len(files) == 1:
        images = [load_image(files[0])]
    else:
        images = []

    if text == "" and not images:
        raise gr.Error("Please input a query and optionally image(s).")  # Fixed: gr.Error syntax
    if text == "" and images:
        raise gr.Error("Please input a text query along with the image(s).")  # Fixed: gr.Error syntax

    messages = [
        # {"role": "system", "content": "Answer clearly to the user's requesst. Please do not use numbers, only letters. If you want to answer with a number, convert it to letters. For example, you should not say 'am an 2 xaj' but 'am an Γ±aari xaj.'"},
        {
            "role": "user",
            "content": [
                *[{"type": "image", "image": image} for image in images],
                {"type": "text", "text": text},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[prompt],
        images=images if images else None,
        return_tensors="pt",
        padding=True,
    ).to("cuda")
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    # do_sample=True is required for temperature/min_p to take effect.
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=128, do_sample=True, temperature=2.0, min_p=0.8)
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer

    # TTS for the final response is handled by the caller (respond() below);
    # a generator's return value would be discarded by the for-loop consumer,
    # so calling tts() here would only waste an API call.
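
# Illustrative direct call (hypothetical inputs):
#   for partial in model_inference({"text": "Describe this image", "files": ["photo.jpg"]}, []):
#       print(partial)  # the growing response buffer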

# Option 1: Blocks UI with a streaming chatbot and TTS audio output (used below)
with gr.Blocks() as demo:
    gr.Markdown("# Oolel")
    
    chatbot = gr.Chatbot()
    msg = gr.MultimodalTextbox(
        label="Your Request", 
        file_types=["image", "video"], 
        file_count="multiple"
    )
    audio_output = gr.Audio(label="Generated Speech")
    clear = gr.Button("Clear")
    
    def respond(message, chat_history):
        # Add user message to chat history
        bot_message = ""
        chat_history.append([message["text"], ""])
        
        # Stream the response
        for response in model_inference(message, chat_history):
            bot_message = response
            chat_history[-1][1] = bot_message
            yield "", chat_history, None
        
        # Generate audio after streaming is complete
        try:
            if bot_message.strip():  # Only generate TTS if there's actual text
                audio_path = tts(bot_message)
                if audio_path:
                    yield "", chat_history, audio_path
                else:
                    print("TTS returned None or empty result")
                    yield "", chat_history, None
            else:
                yield "", chat_history, None
        except Exception as e:
            print(f"TTS Error: {e}")
            yield "", chat_history, None
    
    msg.submit(respond, [msg, chatbot], [msg, chatbot, audio_output])
    clear.click(lambda: ([], None), outputs=[chatbot, audio_output])
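
    # Generator-based handlers (respond yields partial results) rely on Gradio's
    # queue, which recent Gradio versions enable by default.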

# Option 2: Use ChatInterface without outputs parameter (simpler but no audio)
# demo = gr.ChatInterface(
#     fn=model_inference,
#     description="# oolel-vision-experimental `@video-infer for video understanding`**",
#     fill_height=True,
#     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
#     stop_btn="Stop Generation",
#     multimodal=True,
#     cache_examples=False,
# )

if __name__ == "__main__":
    demo.launch(debug=True)