Spaces:

SkyNetWalker
/

chatCPU

Running

App Files Community

SkyNetWalker committed on Jun 25

Commit

c2ba929

verified ·

1 Parent(s): 36269ba

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -44

app.py CHANGED Viewed

@@ -1,50 +1,23 @@
 import gradio as gr
-import requests
-import json
-# Define the URL for the local Ollama API and the model name
-OLLAMA_API_URL = "http://localhost:11434/api/generate"
-# This must match the name used in `ollama pull` in Dockerfile
-MODEL_NAME = "gemma3_4b_it_qat"
-def generate_text(prompt, max_new_tokens=256, temperature=0.7):
-    """
-    Function to send a prompt to the Ollama API and get a response.
-    """
-    payload = {
-        "model": MODEL_NAME,
-        "prompt": prompt,
-        "stream": False,  # We want the full response at once
-        "options": {
-            "num_predict": max_new_tokens,
-            "temperature": temperature,
-        }
-    }
-    try:
-        # Send a POST request to the Ollama API.
-        # Increased timeout for potentially slow CPU inference.
-        response = requests.post(OLLAMA_API_URL, json=payload, timeout=600) # 10 minutes timeout
-        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)
-        result = response.json()
-        return result.get("response", "No response from model.")
-    except requests.exceptions.RequestException as e:
-        return f"Error communicating with Ollama: {e}"
-# Create the Gradio interface
-iface = gr.Interface(
-    fn=generate_text,
-    inputs=[
-        gr.Textbox(lines=5, label="Enter your prompt", placeholder="Type your message here..."),
-        gr.Slider(minimum=1, maximum=1024, value=256, label="Max New Tokens", info="Maximum number of tokens to generate."),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature", info="Controls randomness in generation. Lower values are less random.")
-    ],
-    outputs="text",
-    title=f"Ollama {MODEL_NAME} on Hugging Face Spaces (CPU-only)",
-    description="Interact with a Gemma 3.4B IT QAT GGUF model served by Ollama on CPU. Please be patient, as CPU inference can be slow."
 )
-# Launch the Gradio application
-# server_name="0.0.0.0" makes it accessible from outside the container.
-# server_port=7860 is the default port for Gradio apps on Hugging Face Spaces.
-if __name__ == "__main__":
-    iface.launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
+import ollama
+# The model name must exactly match what was pulled from Hugging Face
+MODEL_NAME = 'hf.co/unsloth/gemma-3-4b-it-qat-GGUF:Q4_K_M'
+def predict(prompt, history):
+    # The history is not used in this simple example, but is required by the ChatInterface
+    response = ollama.chat(
+        model=MODEL_NAME,
+        messages=[{'role': 'user', 'content': prompt}]
+    )
+    return response['message']['content']
+# Setup the Gradio Chat Interface
+iface = gr.ChatInterface(
+    fn=predict,
+    title="Gemma-3 QAT GGUF Chat",
+    description=f"Chat with the {MODEL_NAME} model via Ollama."
 )
+# Launch the interface
+iface.launch(server_name="0.0.0.0", server_port=7860)