Spaces:

prithivMLmods
/

Multimodal-VLM-v1.0

Running on Zero

App Files Files Community

prithivMLmods commited on Aug 14

Commit

d48f554

verified ·

1 Parent(s): 06b5ed2

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -50

app.py CHANGED Viewed

@@ -81,11 +81,11 @@ def downsample_video(video_path):
 @spaces.GPU(duration=120)
 def generate_image(model_name: str, text: str, image: Image.Image,
-                   max_new_tokens: int = 1024,
-                   temperature: float = 0.6,
-                   top_p: float = 0.9,
-                   top_k: int = 50,
-                   repetition_penalty: float = 1.2):
     """
     Generate responses using the selected model for image input.
     """
@@ -137,11 +137,11 @@ def generate_image(model_name: str, text: str, image: Image.Image,
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
-                   max_new_tokens: int = 1024,
-                   temperature: float = 0.6,
-                   top_p: float = 0.9,
-                   top_k: int = 50,
-                   repetition_penalty: float = 1.2):
     """
     Generate responses using the selected model for video input.
     """
@@ -231,47 +231,40 @@ css = """
     border-radius: 10px;
     padding: 20px;
 }
-.model-choices .gr-form:nth-child(1) label {
-    background-color: #e0f7fa !important;
-    border: 1px solid #00796b !important;
-    color: #00796b !important;
-}
-.model-choices .gr-form:nth-child(3) label {
-    background-color: #e0f7fa !important;
-    border: 1px solid #00796b !important;
-    color: #00796b !important;
-}
-.model-choices .gr-form:nth-child(2) label {
-    background-color: #fff3e0 !important;
-    border: 1px solid #f57c00 !important;
-    color: #f57c00 !important;
-}
-.model-choices .gr-form:nth-child(4) label {
-    background-color: #fff3e0 !important;
-    border: 1px solid #f57c00 !important;
-    color: #f57c00 !important;
-}
-.model-choices .gr-form:nth-child(1) label::after {
-    content: " (OCR)";
-    font-weight: bold;
-}
-.model-choices .gr-form:nth-child(3) label::after {
-    content: " (OCR)";
-    font-weight: bold;
-}
-.model-choices .gr-form:nth-child(2) label::after {
-    content: " (Reasoning)";
-    font-weight: bold;
 }
-.model-choices .gr-form:nth-child(4) label::after {
-    content: " (Reasoning)";
-    font-weight: bold;
 }
 """
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
-    gr.Markdown("# **[Multimodal VLM v1.0](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
     with gr.Row():
         with gr.Column():
             with gr.Tabs():
@@ -291,28 +284,25 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                         examples=video_examples,
                         inputs=[video_query, video_upload]
                     )
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
         with gr.Column():
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
                 output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
                 with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown(label="(Result.md)")
             model_choice = gr.Radio(
                 choices=["Camel-Doc-OCR-062825", "GLM-4.1V-9B-Thinking", "Megalodon-OCR-Sync-0713", "ViLaSR-7B"],
                 label="Select Model",
-                value="Camel-Doc-OCR-062825",
-                elem_classes="model-choices"
             )
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-v1.0/discussions)")
     # Define the submit button actions
     image_submit.click(fn=generate_image,

 @spaces.GPU(duration=120)
 def generate_image(model_name: str, text: str, image: Image.Image,
+                  max_new_tokens: int = 1024,
+                  temperature: float = 0.6,
+                  top_p: float = 0.9,
+                  top_k: int = 50,
+                  repetition_penalty: float = 1.2):
     """
     Generate responses using the selected model for image input.
     """
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
+                  max_new_tokens: int = 1024,
+                  temperature: float = 0.6,
+                  top_p: float = 0.9,
+                  top_k: int = 50,
+                  repetition_penalty: float = 1.2):
     """
     Generate responses using the selected model for video input.
     """
     border-radius: 10px;
     padding: 20px;
 }
+.model-choice-reasoning {
+    background-color: #2ecc71 !important; /* Green for reasoning models */
+    color: white !important;
+    padding: 5px 10px;
+    border-radius: 5px;
 }
+.model-choice-ocr {
+    background-color: #3498db !important; /* Blue for OCR models */
+    color: white !important;
+    padding: 5px 10px;
+    border-radius: 5px;
 }
 """
+# JavaScript to apply classes to radio button labels
+js_script = """
+<script>
+document.addEventListener('DOMContentLoaded', function() {
+    const labels = document.querySelectorAll('.gr-radio label');
+    labels.forEach(label => {
+        const text = label.textContent.trim();
+        if (text === 'GLM-4.1V-9B-Thinking' || text === 'ViLaSR-7B') {
+            label.classList.add('model-choice-reasoning');
+        } else if (text === 'Camel-Doc-OCR-062825' || text === 'Megalodon-OCR-Sync-0713') {
+            label.classList.add('model-choice-ocr');
+        }
+    });
+});
+</script>
+"""
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
+    gr.Markdown("# Multimodal VLM v1.0")
     with gr.Row():
         with gr.Column():
             with gr.Tabs():
                         examples=video_examples,
                         inputs=[video_query, video_upload]
                     )
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
         with gr.Column():
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
                 output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
                 with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown(label="(Result.md)")
             model_choice = gr.Radio(
                 choices=["Camel-Doc-OCR-062825", "GLM-4.1V-9B-Thinking", "Megalodon-OCR-Sync-0713", "ViLaSR-7B"],
                 label="Select Model",
+                value="Camel-Doc-OCR-062825"
             )
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-v1.0/discussions)")
+            gr.HTML(js_script)  # Inject JavaScript to apply classes
     # Define the submit button actions
     image_submit.click(fn=generate_image,