Update app.py
app.py
CHANGED
@@ -7,11 +7,7 @@ from diffusers import DiffusionPipeline
 import random
 import numpy as np
 import os
-import subprocess
 from qwen_vl_utils import process_vision_info
-from threading import Thread
-import uuid
-import io
 
 # Initialize models
 device = "cuda" if torch.cuda.is_available() else "cpu"
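Context, not part of the commit: a minimal sketch of how the surviving imports and the device line above are typically used to load the pipeline from the hunk header. The FLUX.1-dev checkpoint id and the dtype choice are assumptions taken from the comment removed later in this diff.

import torch
from diffusers import DiffusionPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint id assumed from the removed "# Generate image with FLUX.1-dev"
# comment further down; the real id may differ.
pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
).to(device)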
@@ -36,10 +32,10 @@ enhancer_long = pipeline("summarization", model="gokaygokay/Lamini-Prompt-Enchan
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 1024  # Reduced to prevent memory issues
 
-# Qwen2VL caption function
+# Qwen2VL caption function – updated to request plain text caption instead of JSON
 @spaces.GPU
 def qwen_caption(image):
-    # Convert image to PIL if
+    # Convert image to PIL if needed
     if not isinstance(image, Image.Image):
         image = Image.fromarray(image)
 
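As a standalone illustration of the PIL guard this function relies on (illustrative only, assuming Gradio hands the function a numpy array):

import numpy as np
from PIL import Image

def to_pil(image):
    # Gradio image inputs often arrive as numpy arrays; normalize to PIL
    # exactly as the guard above does.
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    return image

# Quick check with a dummy 64x64 RGB frame:
assert isinstance(to_pil(np.zeros((64, 64, 3), dtype=np.uint8)), Image.Image)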
@@ -48,7 +44,8 @@ def qwen_caption(image):
             "role": "user",
             "content": [
                 {"type": "image", "image": image},
-                {"type": "text", "text": "Generate a detailed and optimized caption for the given image in the form of JSON data {}."},
+                # Removed "in the form of JSON data {}" to get plain text caption
+                {"type": "text", "text": "Generate a detailed and optimized caption for the given image."},
             ],
         }
     ]
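For readers unfamiliar with the Qwen2-VL message format, a hedged sketch of how a message list like the one above is typically consumed; the checkpoint id, the dummy image, and the generation settings are assumptions, since the commit does not show the model-loading code.

from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from qwen_vl_utils import process_vision_info

model_id = "Qwen/Qwen2-VL-2B-Instruct"  # assumed; the Space's checkpoint is not shown
processor = AutoProcessor.from_pretrained(model_id)
model = Qwen2VLForConditionalGeneration.from_pretrained(model_id, device_map="auto")

image = Image.new("RGB", (512, 512))  # stand-in for the user's upload
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Generate a detailed and optimized caption for the given image."},
        ],
    }
]

# The usual Qwen2-VL plumbing: chat template -> vision tensors -> generate.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(text=[text], images=image_inputs, videos=video_inputs,
                   padding=True, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=256)
trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)]
print(processor.batch_decode(trimmed, skip_special_tokens=True)[0])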
@@ -77,7 +74,7 @@ def qwen_caption(image):
 
     return output_text
 
-# Prompt Enhancer function
+# Prompt Enhancer function (unchanged)
 def enhance_prompt(input_prompt):
     result = enhancer_long("Enhance the description: " + input_prompt)
     enhanced_text = result[0]['summary_text']
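A usage sketch for the enhancer. The model id is truncated in the hunk header above, and the "-Long" suffix here is only a guess from the enhancer_long variable name:

from transformers import pipeline

enhancer_long = pipeline("summarization",
                         model="gokaygokay/Lamini-Prompt-Enchance-Long")  # id guessed

result = enhancer_long("Enhance the description: " + "a cat on a windowsill")
print(result[0]["summary_text"])  # the enhanced prompt fed to the image model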
@@ -86,10 +83,8 @@ def enhance_prompt(input_prompt):
 @spaces.GPU(duration=190)
 def process_workflow(image, text_prompt, use_enhancer, seed, randomize_seed, width, height, guidance_scale, num_inference_steps, progress=gr.Progress(track_tqdm=True)):
     if image is not None:
-        # Convert image to PIL if it's not already
         if not isinstance(image, Image.Image):
             image = Image.fromarray(image)
-
         prompt = qwen_caption(image)
         print(prompt)
     else:
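The randomize_seed flag passed into process_workflow usually resolves to a concrete seed along these lines (a sketch; the actual seed-handling lines fall outside the hunks shown):

import random
import numpy as np

MAX_SEED = np.iinfo(np.int32).max

def resolve_seed(seed: int, randomize_seed: bool) -> int:
    # Replace the UI seed with a fresh draw when randomization is on.
    return random.randint(0, MAX_SEED) if randomize_seed else seed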
@@ -103,10 +98,9 @@ def process_workflow(image, text_prompt, use_enhancer, seed, randomize_seed, wid
 
     generator = torch.Generator(device=device).manual_seed(seed)
 
-    #
+    # Clear GPU cache before generating the image
     torch.cuda.empty_cache()
 
-    # Generate image with FLUX.1-dev
     try:
         image = pipe(
             prompt=prompt,
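Around the new cache-clearing comment, the generation call is truncated in the diff; the following is a hedged sketch of the full pattern. pipe is the DiffusionPipeline loaded at the top of app.py, the parameter values mirror the sliders' defaults, and the OOM handling is an assumption:

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
generator = torch.Generator(device=device).manual_seed(42)

if torch.cuda.is_available():
    torch.cuda.empty_cache()  # release cached blocks before a large allocation

try:
    # `pipe` is the DiffusionPipeline initialized at the top of app.py.
    image = pipe(
        prompt="a lighthouse at dawn",
        width=512,
        height=512,
        guidance_scale=3.5,
        num_inference_steps=20,
        generator=generator,
    ).images[0]
except torch.cuda.OutOfMemoryError:
    torch.cuda.empty_cache()  # free what we can, then surface the error
    raise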
@@ -164,10 +158,10 @@ with gr.Blocks(css=custom_css) as demo:
             use_enhancer = gr.Checkbox(label="Use Prompt Enhancer", value=False)
             seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
             randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
-            width = gr.Slider(label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=512)
-            height = gr.Slider(label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=512)
+            width = gr.Slider(label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=512)
+            height = gr.Slider(label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=512)
             guidance_scale = gr.Slider(label="Guidance Scale", minimum=1, maximum=15, step=0.1, value=3.5)
-            num_inference_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=50, step=1, value=20)
+            num_inference_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=50, step=1, value=20)
 
             generate_btn = gr.Button("Generate Image + Prompt Enhanced", elem_classes="submit-btn")
 
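Finally, a hypothetical wiring of these controls to process_workflow; input_image, output_image, and final_prompt are illustrative component names, as the event wiring falls outside this diff:

generate_btn.click(
    fn=process_workflow,
    inputs=[input_image, text_prompt, use_enhancer, seed, randomize_seed,
            width, height, guidance_scale, num_inference_steps],
    outputs=[output_image, final_prompt],  # hypothetical component names
)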