Spaces:

ThongCoder
/

Qwen3-0.6B-Coder

Paused

App Files Community

ThongCoder committed on Sep 7

Commit

15776f9

verified ·

1 Parent(s): 8116812

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -25

app.py CHANGED Viewed

@@ -1,29 +1,48 @@
-from transformers import pipeline
 import gradio as gr
-import time
-# Load model
-pipe = pipeline("text-generation", model="prithivMLmods/rStar-Coder-Qwen3-0.6B")
 history = []
-def chat_fn_stream(user_input):
     global history
     history.append(f"User: {user_input}")
     context = "\n".join(history) + "\nBot:"
-    # Use a generator for streaming
-    for i in range(0, 8192, 20):  # fake streaming in chunks
-        output = pipe(
-            context,
-            max_new_tokens=i+20,
-            do_sample=True,
-            top_p=0.9,
-            return_full_text=False
-        )[0]['generated_text']
-        bot_reply = output.split("Bot:")[-1].strip()
-        yield bot_reply  # stream partial reply
-        time.sleep(0.1)  # small delay to simulate streaming
     history.append(f"Bot: {bot_reply}")
@@ -33,16 +52,11 @@ with gr.Blocks() as demo:
     msg = gr.Textbox(placeholder="Type a message...")
     def respond(user_input, chat_history):
-        bot_reply = ""
-        # Start by adding the user input
-        chat_history.append((user_input, ""))  # empty bot reply for now
-        for partial in chat_fn_stream(user_input):
-            bot_reply = partial
-            # Update the last bot reply
-            chat_history[-1] = (user_input, bot_reply)
             yield chat_history, chat_history
     state = gr.State([])
     msg.submit(respond, [msg, state], [chatbot_ui, state])

 import gradio as gr
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+# Load tokenizer and model
+model_name = "prithivMLmods/rStar-Coder-Qwen3-0.6B"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name)
+model.eval()
+if torch.cuda.is_available():
+    model = model.to("cuda")
 history = []
+def stream_chat(user_input):
     global history
     history.append(f"User: {user_input}")
     context = "\n".join(history) + "\nBot:"
+    # Tokenize input
+    input_ids = tokenizer(context, return_tensors="pt").input_ids
+    if torch.cuda.is_available():
+        input_ids = input_ids.to("cuda")
+    # Generate token by token
+    output_ids = input_ids.clone()
+    bot_reply = ""
+    max_new_tokens = 200  # adjust as needed
+    for _ in range(max_new_tokens):
+        with torch.no_grad():
+            outputs = model(output_ids)
+            next_token_logits = outputs.logits[0, -1, :]
+            next_token = torch.argmax(next_token_logits).unsqueeze(0)
+            output_ids = torch.cat([output_ids, next_token.unsqueeze(0)], dim=1)
+            token_str = tokenizer.decode(next_token)
+            bot_reply += token_str
+            # Yield streaming output
+            yield bot_reply
+            # Stop if EOS token
+            if next_token.item() == tokenizer.eos_token_id:
+                break
     history.append(f"Bot: {bot_reply}")
     msg = gr.Textbox(placeholder="Type a message...")
     def respond(user_input, chat_history):
+        chat_history.append((user_input, ""))
+        for partial in stream_chat(user_input):
+            chat_history[-1] = (user_input, partial)
             yield chat_history, chat_history
     state = gr.State([])
     msg.submit(respond, [msg, state], [chatbot_ui, state])