Spaces:

prithivMLmods
/

DocScope-R1

Running on Zero

App Files Community

prithivMLmods committed on May 30

Commit

1590f58

verified ·

1 Parent(s): 304d9d6

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -7

app.py CHANGED Viewed

@@ -34,7 +34,7 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_M,
     trust_remote_code=True,
     torch_dtype=torch.float16
-).to("cuda").eval()
 # Load DocScope
 MODEL_ID_X = "prithivMLmods/docscopeOCR-7B-050425-exp"
@@ -43,7 +43,16 @@ model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_X,
     trust_remote_code=True,
     torch_dtype=torch.float16
-).to("cuda").eval()
 def downsample_video(video_path):
     """
@@ -82,6 +91,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "docscopeOCR-7B-050425-exp":
         processor = processor_x
         model = model_x
     else:
         yield "Invalid model selected."
         return
@@ -105,7 +117,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         padding=True,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
-    ).to("cuda")
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
@@ -133,6 +145,9 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "docscopeOCR-7B-050425-exp":
         processor = processor_x
         model = model_x
     else:
         yield "Invalid model selected."
         return
@@ -158,7 +173,7 @@ def generate_video(model_name: str, text: str, video_path: str,
         return_tensors="pt",
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
-    ).to("cuda")
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
@@ -222,7 +237,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                         examples=video_examples,
                         inputs=[video_query, video_upload]
                     )
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
@@ -232,9 +246,10 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         with gr.Column():
             output = gr.Textbox(label="Output", interactive=False)
             model_choice = gr.Radio(
-                choices=["Cosmos-Reason1-7B", "docscopeOCR-7B-050425-exp"],
                 label="Select Model",
-            value="Cosmos-Reason1-7B")
     image_submit.click(
         fn=generate_image,

     MODEL_ID_M,
     trust_remote_code=True,
     torch_dtype=torch.float16
+).to(device).eval()
 # Load DocScope
 MODEL_ID_X = "prithivMLmods/docscopeOCR-7B-050425-exp"
     MODEL_ID_X,
     trust_remote_code=True,
     torch_dtype=torch.float16
+).to(device).eval()
+# Load Inkscope-Captions (selected in the UI as "Captions-Mini")
+MODEL_ID_Z = "prithivMLmods/Inkscope-Captions-2B-0526"
+processor_z = AutoProcessor.from_pretrained(MODEL_ID_Z, trust_remote_code=True)
+model_z = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_Z,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to(device).eval()
 def downsample_video(video_path):
     """
     elif model_name == "docscopeOCR-7B-050425-exp":
         processor = processor_x
         model = model_x
+    elif model_name == "Captions-Mini":
+        processor = processor_z
+        model = model_z
     else:
         yield "Invalid model selected."
         return
         padding=True,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
+    ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     elif model_name == "docscopeOCR-7B-050425-exp":
         processor = processor_x
         model = model_x
+    elif model_name == "Captions-Mini":
+        processor = processor_z
+        model = model_z
     else:
         yield "Invalid model selected."
         return
         return_tensors="pt",
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
+    ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
                         examples=video_examples,
                         inputs=[video_query, video_upload]
                     )
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
         with gr.Column():
             output = gr.Textbox(label="Output", interactive=False)
             model_choice = gr.Radio(
+                choices=["Cosmos-Reason1-7B", "docscopeOCR-7B-050425-exp", "Captions-Mini"],
                 label="Select Model",
+                value="Cosmos-Reason1-7B"
+            )
     image_submit.click(
         fn=generate_image,