import os
import httpx
import json
import time
import asyncio
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
from typing import List, Dict, Any, Optional, Union, Literal
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
REPLICATE_API_TOKEN = os.getenv("REPLICATE_API_TOKEN")
if not REPLICATE_API_TOKEN:
raise ValueError("REPLICATE_API_TOKEN environment variable not set.")
# FastAPI Init
app = FastAPI(title="Replicate to OpenAI Compatibility Layer", version="9.2.2 (Spacing Fixed)")
# --- Pydantic Models ---
class ModelCard(BaseModel):
id: str; object: str = "model"; created: int = Field(default_factory=lambda: int(time.time())); owned_by: str = "replicate"
class ModelList(BaseModel):
object: str = "list"; data: List[ModelCard] = []
class ChatMessage(BaseModel):
    role: Literal["system", "user", "assistant", "tool"]; content: Optional[Union[str, List[Dict[str, Any]]]] = None; name: Optional[str] = None; tool_calls: Optional[List[Any]] = None
class FunctionDefinition(BaseModel):
name: str; description: Optional[str] = None; parameters: Optional[Dict[str, Any]] = None
class ToolDefinition(BaseModel):
type: Literal["function"]; function: FunctionDefinition
class FunctionCall(BaseModel):
name: str; arguments: str
class ToolCall(BaseModel):
id: str; type: Literal["function"] = "function"; function: FunctionCall
class ChatCompletionRequest(BaseModel):
model: str; messages: List[ChatMessage]; temperature: Optional[float] = 0.7; top_p: Optional[float] = 1.0
max_tokens: Optional[int] = None; stream: Optional[bool] = False; stop: Optional[Union[str, List[str]]] = None
tools: Optional[List[ToolDefinition]] = None; tool_choice: Optional[Union[str, Dict[str, Any]]] = None
functions: Optional[List[FunctionDefinition]] = None; function_call: Optional[Union[str, Dict[str, str]]] = None
class Choice(BaseModel):
index: int; message: ChatMessage; finish_reason: Optional[str] = None
class Usage(BaseModel):
prompt_tokens: int; completion_tokens: int; total_tokens: int; inference_time: Optional[float] = None
class ChatCompletion(BaseModel):
id: str; object: str = "chat.completion"; created: int; model: str; choices: List[Choice]; usage: Usage
class DeltaMessage(BaseModel):
role: Optional[str] = None; content: Optional[str] = None; tool_calls: Optional[List[ToolCall]] = None
class ChoiceDelta(BaseModel):
index: int; delta: DeltaMessage; finish_reason: Optional[str] = None
class ChatCompletionChunk(BaseModel):
id: str; object: str = "chat.completion.chunk"; created: int; model: str; choices: List[ChoiceDelta]; usage: Optional[Usage] = None
# --- Supported Models ---
SUPPORTED_MODELS = {
"llama3-8b-instruct": "meta/meta-llama-3-8b-instruct",
"claude-3-haiku-20240307": "anthropic/claude-3-haiku-20240307", # Example of another common model
"claude-3-sonnet-20240229": "anthropic/claude-3-sonnet-20240229",
"llava-13b": "yorickvp/llava-13b:e272157381e2a3bf12df3a8edd1f38d1dbd736bbb7437277c8b34175f8fce358"
}
# --- Core Logic ---
def format_messages_for_replicate(messages: List[ChatMessage], functions: Optional[List[FunctionDefinition]] = None) -> Dict[str, Any]:
    """Flatten an OpenAI-style message list into a single prompt string, plus optional system prompt and image URL."""
    prompt_parts = []
    system_prompt = None
    image_input = None
if functions:
functions_text = "You have access to the following tools. Use them if required to answer the user's question.\n\n"
for func in functions:
functions_text += f"- Function: {func.name}\n"
if func.description: functions_text += f" Description: {func.description}\n"
if func.parameters: functions_text += f" Parameters: {json.dumps(func.parameters)}\n"
prompt_parts.append(functions_text)
for msg in messages:
if msg.role == "system":
system_prompt = str(msg.content)
elif msg.role == "assistant":
if msg.tool_calls:
tool_calls_text = "\nTool calls:\n"
for tool_call in msg.tool_calls:
tool_calls_text += f"- {tool_call.function.name}({tool_call.function.arguments})\n"
prompt_parts.append(f"Assistant: {tool_calls_text}")
else:
prompt_parts.append(f"Assistant: {msg.content}")
elif msg.role == "tool":
prompt_parts.append(f"Tool Response: {msg.content}")
elif msg.role == "user":
user_text_content = ""
if isinstance(msg.content, list):
for item in msg.content:
if item.get("type") == "text":
user_text_content += item.get("text", "")
elif item.get("type") == "image_url":
image_url_data = item.get("image_url", {})
image_input = image_url_data.get("url")
else:
user_text_content = str(msg.content)
prompt_parts.append(f"User: {user_text_content}")
prompt_parts.append("Assistant:") # Let the model generate the space after this
return {
"prompt": "\n\n".join(prompt_parts),
"system_prompt": system_prompt,
"image": image_input
}
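# Illustrative example of the flattening above (the message text is made up):
#
#   format_messages_for_replicate([ChatMessage(role="system", content="Be brief."),
#                                  ChatMessage(role="user", content="Hi")])
#   # -> {"prompt": "User: Hi\n\nAssistant:", "system_prompt": "Be brief.", "image": None}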
def parse_function_call(content: str) -> Optional[Dict[str, Any]]:
    """Best-effort extraction of a {"name": ..., "arguments": ...} function call from raw model output."""
    try:
        if "function_call" in content or ("name" in content and "arguments" in content):
            start = content.find("{")
            end = content.rfind("}") + 1
            if start != -1 and end > start:
                json_str = content[start:end]
                parsed = json.loads(json_str)
                if "name" in parsed and "arguments" in parsed:
                    # OpenAI clients expect arguments as a JSON-encoded string, not a dict.
                    if not isinstance(parsed["arguments"], str):
                        parsed["arguments"] = json.dumps(parsed["arguments"])
                    return parsed
    except Exception:
        pass
    return None
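# Illustrative example of the shape parse_function_call looks for (tool name and
# arguments are made up; with the normalization above, arguments come back as a string):
#
#   parse_function_call('Call: {"name": "get_weather", "arguments": {"city": "Paris"}}')
#   # -> {"name": "get_weather", "arguments": '{"city": "Paris"}'}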
async def stream_replicate_response(replicate_model_id: str, input_payload: dict, request_id: str):
    """Create a streaming Replicate prediction and re-emit its SSE output as OpenAI chat.completion.chunk events."""
    url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
    headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}
    start_time = time.time()
    # Rough estimate (~4 characters per token); Replicate does not report usage for streams.
    prompt_tokens = len(input_payload.get("prompt", "")) // 4
    completion_tokens = 0
async with httpx.AsyncClient(timeout=300.0) as client:
try:
response = await client.post(url, headers=headers, json={"input": input_payload, "stream": True})
response.raise_for_status()
prediction = response.json()
stream_url = prediction.get("urls", {}).get("stream")
if not stream_url:
yield f"data: {json.dumps({'error': {'message': 'Model did not return a stream URL.'}})}\n\n"
return
except httpx.HTTPStatusError as e:
error_details = e.response.text
try: error_details = e.response.json().get("detail", error_details)
except json.JSONDecodeError: pass
yield f"data: {json.dumps({'error': {'message': f'Upstream Error: {error_details}', 'type': 'replicate_error'}})}\n\n"
return
try:
async with client.stream("GET", stream_url, headers={"Accept": "text/event-stream"}, timeout=None) as sse:
current_event = None
accumulated_content = ""
first_token = True
async for line in sse.aiter_lines():
if not line: continue
if line.startswith("event:"):
current_event = line[len("event:"):].strip()
elif line.startswith("data:") and current_event == "output":
raw_data = line[5:].strip()
if not raw_data: continue
try:
content_token = json.loads(raw_data)
except (json.JSONDecodeError, TypeError):
content_token = raw_data
                        # Handle the leading space some models emit by stripping only
                        # the first non-empty token of the entire stream.
if first_token:
content_token = content_token.lstrip()
# Only flip the flag if we've actually processed a token with content.
if content_token:
first_token = False
accumulated_content += content_token
completion_tokens += 1
function_call = parse_function_call(accumulated_content)
if function_call:
tool_call = ToolCall(id=f"call_{int(time.time())}", function=FunctionCall(name=function_call["name"], arguments=function_call["arguments"]))
chunk = ChatCompletionChunk(id=request_id, created=int(time.time()), model=replicate_model_id, choices=[ChoiceDelta(index=0, delta=DeltaMessage(tool_calls=[tool_call]), finish_reason=None)])
yield f"data: {chunk.json()}\n\n"
else:
# Only yield a chunk if there is content to send.
if content_token:
chunk = ChatCompletionChunk(id=request_id, created=int(time.time()), model=replicate_model_id, choices=[ChoiceDelta(index=0, delta=DeltaMessage(content=content_token), finish_reason=None)])
yield f"data: {chunk.json()}\n\n"
elif current_event == "done":
end_time = time.time()
usage = Usage(prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=prompt_tokens + completion_tokens, inference_time=round(end_time - start_time, 3))
usage_chunk = ChatCompletionChunk(id=request_id, created=int(time.time()), model=replicate_model_id, choices=[ChoiceDelta(index=0, delta=DeltaMessage(), finish_reason="stop")], usage=usage)
yield f"data: {usage_chunk.json()}\n\n"
break
except httpx.ReadTimeout:
yield f"data: {json.dumps({'error': {'message': 'Stream timed out.', 'type': 'timeout_error'}})}\n\n"
return
yield "data: [DONE]\n\n"
# --- Endpoints ---
@app.get("/v1/models")
async def list_models():
return ModelList(data=[ModelCard(id=k) for k in SUPPORTED_MODELS.keys()])
@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest):
if request.model not in SUPPORTED_MODELS:
raise HTTPException(status_code=404, detail=f"Model not found. Available models: {list(SUPPORTED_MODELS.keys())}")
replicate_model_id = SUPPORTED_MODELS[request.model]
formatted = format_messages_for_replicate(request.messages, request.functions)
replicate_input = {
"prompt": formatted["prompt"],
"max_new_tokens": request.max_tokens or 512,
"temperature": request.temperature or 0.7,
"top_p": request.top_p or 1.0
}
if formatted["system_prompt"]: replicate_input["system_prompt"] = formatted["system_prompt"]
if formatted["image"]: replicate_input["image"] = formatted["image"]
request_id = f"chatcmpl-{int(time.time())}"
if request.stream:
return StreamingResponse(
stream_replicate_response(replicate_model_id, replicate_input, request_id),
media_type="text/event-stream"
)
    # Non-streaming response
    url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
    # "Prefer: wait" asks Replicate to hold the request open until the prediction completes,
    # so the output is available directly in the create response.
    headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json", "Prefer": "wait"}
    start_time = time.time()
    async with httpx.AsyncClient() as client:
        try:
            resp = await client.post(url, headers=headers, json={"input": replicate_input}, timeout=300.0)
            resp.raise_for_status()
            pred = resp.json()
            output = "".join(pred.get("output") or [])
            output = output.strip()  # Clean up any leading/trailing whitespace
            end_time = time.time()
            # Rough estimates (~4 characters per token).
            prompt_tokens = len(replicate_input.get("prompt", "")) // 4
            completion_tokens = len(output) // 4
tool_calls = None
finish_reason = "stop"
message_content = output
function_call = parse_function_call(output)
if function_call:
tool_calls = [ToolCall(id=f"call_{int(time.time())}", function=FunctionCall(name=function_call["name"], arguments=function_call["arguments"]))]
finish_reason = "tool_calls"
message_content = None # OpenAI standard: content is null when tool_calls are present
return ChatCompletion(
id=request_id,
created=int(time.time()),
model=request.model,
choices=[Choice(
index=0,
message=ChatMessage(role="assistant", content=message_content, tool_calls=tool_calls),
finish_reason=finish_reason
)],
usage=Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
inference_time=round(end_time - start_time, 3)
)
)
except httpx.HTTPStatusError as e:
raise HTTPException(status_code=e.response.status_code, detail=f"Error from Replicate API: {e.response.text}")
except Exception as e:
raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}")
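# Example client usage (a sketch, assuming the server is running locally on port 8000 and
# the official openai Python package is installed; the prompt below is illustrative):
#
#   from openai import OpenAI
#   client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")
#   resp = client.chat.completions.create(
#       model="llama3-8b-instruct",
#       messages=[{"role": "user", "content": "Say hello in one sentence."}],
#   )
#   print(resp.choices[0].message.content)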
@app.get("/")
async def root():
return {"message": "Replicate to OpenAI Compatibility Layer API", "version": "9.2.2"}
@app.middleware("http")
async def add_performance_headers(request, call_next):
start_time = time.time()
response = await call_next(request)
process_time = time.time() - start_time
response.headers["X-Process-Time"] = str(round(process_time, 3))
response.headers["X-API-Version"] = "9.2.2"
return response
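# Local development entry point (a minimal sketch; assumes uvicorn is installed and the
# host/port below are illustrative defaults):
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)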