# Spaces: Paused
| from fastapi import FastAPI, HTTPException, Request | |
| from fastapi.responses import JSONResponse | |
| from llama_cpp import Llama | |
| import gradio as gr | |
# ASGI application object; handed to uvicorn when the module runs as a script.
app = FastAPI()

# Load the GGUF model once at import time; the chat handler reuses this
# module-level instance. The constructor is llama_cpp's `Llama` (imported
# above) -- gradio (`gr`) exposes no `Llama` class, so `gr.Llama(...)` would
# fail with AttributeError as soon as the module is imported.
llm = Llama(model_path="model.gguf", n_ctx=4000, n_threads=2, chat_format="chatml")
async def chat_post(request: Request):
    """Answer one chat message with the module-level model and return JSON.

    The request body is read as a JSON object with these keys:
        message: the new user message (required, non-empty string).
        history: optional list of ``(user_prompt, bot_response)`` pairs.
        temperature: optional sampling temperature (default 0.3).
        max_tokens: optional cap on generated tokens (default 512).

    Returns:
        JSONResponse with content ``{"response": <full generated text>}``.

    Raises:
        HTTPException: status 400 when the body is not valid JSON, is not a
            JSON object, lacks a usable ``message``, or carries a ``history``
            that is not a list of two-item pairs.
    """
    # NOTE(review): no route decorator is visible on this handler from here --
    # confirm it is actually registered on `app`.
    try:
        data = await request.json()
    except ValueError as err:  # json.JSONDecodeError is a ValueError subclass
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON."
        ) from err
    if not isinstance(data, dict):
        raise HTTPException(
            status_code=400, detail="Request body must be a JSON object."
        )

    message = data.get("message")
    if not isinstance(message, str) or not message:
        raise HTTPException(
            status_code=400, detail="'message' must be a non-empty string."
        )
    history = data.get("history") or []
    temperature = data.get("temperature", 0.3)
    max_tokens = data.get("max_tokens", 512)

    system_prompt = "You are OpenChat, a useful AI assistant."
    formatted_prompt = [{"role": "system", "content": system_prompt}]
    try:
        for user_prompt, bot_response in history:
            formatted_prompt.append({"role": "user", "content": user_prompt})
            formatted_prompt.append({"role": "assistant", "content": bot_response})
    except (TypeError, ValueError) as err:
        # Raised when history is not iterable or an entry is not a 2-item pair.
        raise HTTPException(
            status_code=400,
            detail="'history' must be a list of [user, assistant] pairs.",
        ) from err
    formatted_prompt.append({"role": "user", "content": message})

    stream_response = llm.create_chat_completion(
        messages=formatted_prompt,
        temperature=temperature,
        max_tokens=max_tokens,
        stream=True,
    )

    # The reply goes out as a single JSON document, so gather every streamed
    # fragment and join once. The previous code awaited an async generator,
    # which raises TypeError before any response can be built.
    pieces = []
    for chunk in stream_response:
        delta = chunk["choices"][0]["delta"]
        piece = delta.get("content")
        if piece:
            pieces.append(piece)

    return JSONResponse(content={"response": "".join(pieces)})
async def chat_get():
    """Return a static hint telling the client to use POST for chatting."""
    hint = "Send a POST request to this endpoint to chat."
    return {"message": hint}
if __name__ == "__main__":
    # uvicorn is imported only on the script path, so importing this module
    # from elsewhere does not pull it in.
    import uvicorn

    # Listen on every interface, port 8000.
    server_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **server_options)