#
# SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
# SPDX-License-Identifier: Apache-2.0
#
import os
from ollama import AsyncClient
import gradio as gr
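# The Ollama server address and API key are read from the environment at
# request time. A minimal setup sketch (placeholder values, not a real
# endpoint):
#
#   export OLLAMA_API_BASE_URL="https://ollama.example.com"
#   export OLLAMA_API_KEY="your-api-key"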
async def playground(
    message,
    history,
    num_ctx,
    temperature,
    repeat_penalty,
    min_p,
    top_k,
    top_p
):
    # Ignore empty or non-string submissions.
    if not isinstance(message, str) or not message.strip():
        yield []
        return
    # Connect to the remote Ollama server; the base URL and API key come
    # from the environment (see the setup note above).
    client = AsyncClient(
        host=os.getenv("OLLAMA_API_BASE_URL"),
        headers={
            "Authorization": f"Bearer {os.getenv('OLLAMA_API_KEY')}"
        }
    )
    # Rebuild the conversation from the Gradio history, keeping only
    # well-formed {"role": ..., "content": ...} entries.
    messages = []
    for item in history:
        if isinstance(item, dict) and "role" in item and "content" in item:
            messages.append({
                "role": item["role"],
                "content": item["content"]
            })
    messages.append({"role": "user", "content": message})
    # Stream the reply, yielding the accumulated text on each chunk so the
    # Gradio chatbot updates incrementally.
    response = ""
    async for part in await client.chat(
        model="gemma3:270m",
        messages=messages,
        options={
            "num_ctx": int(num_ctx),
            "temperature": float(temperature),
            "repeat_penalty": float(repeat_penalty),
            "min_p": float(min_p),
            "top_k": int(top_k),
            "top_p": float(top_p)
        },
        stream=True
    ):
        response += part.get("message", {}).get("content", "")
        yield response
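# A minimal sketch of driving the handler outside Gradio, assuming the same
# environment variables are set; the parameter values mirror the slider
# defaults below and are illustrative only. Each yielded string is the
# accumulated response so far:
#
#   import asyncio
#
#   async def demo():
#       async for text in playground("Hello!", [], 512, 1.0, 1.0, 0.001, 64, 0.95):
#           print(text)
#
#   asyncio.run(demo())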
with gr.Blocks(
    fill_height=True,
    fill_width=True
) as app:
    with gr.Sidebar():
        gr.Markdown("## Ollama Playground by UltimaX Intelligence")
        gr.HTML(
            """
            This Space runs the <b><a href=
            "https://huggingface.co/google/gemma-3-270m"
            target="_blank">Gemma 3 (270M)</a></b> model from
            <b>Google</b>, hosted on a server running <b>Ollama</b> and
            accessed via the <b>Ollama Python SDK</b>.<br><br>
            Official <b>documentation</b> for using Ollama with the
            Python SDK can be found
            <b><a href="https://github.com/ollama/ollama-python"
            target="_blank">here</a></b>.<br><br>
            Gemma 3 (270M) runs entirely on <b>CPU</b>, using only a
            <b>single core</b>. Thanks to its small size, the model can
            operate efficiently on minimal hardware.<br><br>
            The Gemma 3 (270M) model can also be viewed or downloaded
            from the official Ollama website
            <b><a href="https://ollama.com/library/gemma3:270m"
            target="_blank">here</a></b>.<br><br>
            While Gemma 3 has multimodal capabilities, running it on CPU
            with a relatively small parameter count may limit its
            contextual understanding. For this reason, the upload
            functionality has been disabled.<br><br>
            <b>Like this project? You can support me by buying a
            <a href="https://ko-fi.com/hadad" target="_blank">
            coffee</a></b>.
            """
        )
        gr.Markdown("---")
        gr.Markdown("## Model Parameters")
        num_ctx = gr.Slider(
            minimum=512,
            maximum=1024,
            value=512,
            step=128,
            label="Context Length (num_ctx)",
            info="Maximum context window size. Kept small because inference runs on CPU."
        )
        gr.Markdown("")
        temperature = gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=1.0,
            step=0.1,
            label="Temperature",
            info="Controls randomness in generation"
        )
        gr.Markdown("")
        repeat_penalty = gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=1.0,
            step=0.1,
            label="Repeat Penalty",
            info="Penalty for repeating tokens"
        )
        gr.Markdown("")
        min_p = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.001,
            step=0.001,
            label="Min P",
            info="Minimum probability threshold"
        )
        gr.Markdown("")
        top_k = gr.Slider(
            minimum=0,
            maximum=100,
            value=64,
            step=1,
            label="Top K",
            info="Number of top tokens to consider"
        )
        gr.Markdown("")
        top_p = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top P",
            info="Cumulative probability threshold"
        )
    gr.ChatInterface(
        fn=playground,
        additional_inputs=[
            num_ctx,
            temperature,
            repeat_penalty,
            min_p,
            top_k,
            top_p
        ],
        chatbot=gr.Chatbot(
            label="Ollama | Gemma 3 (270M)",
            type="messages",
            show_copy_button=True,
            scale=1
        ),
        type="messages",
        examples=[
            ["Please introduce yourself."],
            ["What caused World War II?"],
            ["Give me a short introduction to large language models."],
            ["Explain quantum computers."]
        ],
        cache_examples=False,
        show_api=False
    )
app.launch(
server_name="0.0.0.0",
pwa=True
)
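# Run locally with `python app.py`; Gradio serves on port 7860 by default,
# and server_name="0.0.0.0" exposes it on all interfaces (as on HF Spaces).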