Update main.py
main.py (CHANGED)
@@ -3,7 +3,7 @@ import httpx
 import json
 import time
 import asyncio
-from fastapi import FastAPI, …
+from fastapi import FastAPI, HTTPException
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel, Field
 from typing import List, Dict, Any, Optional, Union, Literal
@@ -18,15 +18,10 @@ REPLICATE_API_TOKEN = os.getenv("REPLICATE_API_TOKEN")
 if not REPLICATE_API_TOKEN:
     raise ValueError("REPLICATE_API_TOKEN environment variable not set.")
 
-# *** THE FIX IS HERE ***
-# Reduced from 1.0 to 0.05 for smoother, more frequent streaming updates.
-# This makes the polling fast enough to appear like real-time token streaming.
-POLLING_INTERVAL_SECONDS = 0.05
-
 # --- FastAPI App Initialization ---
 app = FastAPI(
     title="Replicate to OpenAI Compatibility Layer",
-    version="…
+    version="2.0.0 (Native Streaming & Context Fixed)",
 )
 
 # --- Pydantic Models for OpenAI Compatibility ---
@@ -73,122 +68,92 @@ SUPPORTED_MODELS = {
 
 # --- Helper Functions ---
 
-def format_tools_for_prompt(tools: List[Tool]) -> str:
-    if not tools:
-        return ""
-    prompt = "You have access to the following tools. To use a tool, respond with a JSON object in the following format:\n"
-    prompt += '{"type": "tool_call", "name": "tool_name", "arguments": {"arg_name": "value"}}\n\n'
-    prompt += "Available tools:\n"
-    for tool in tools:
-        prompt += json.dumps(tool.function.dict(), indent=2) + "\n"
-    return prompt
-
 def prepare_replicate_input(request: OpenAIChatCompletionRequest) -> Dict[str, Any]:
-    …
-    for …
-        if message.role == "system":
-            system_prompt += str(message.content) + "\n"
-        elif message.role == "user":
-            content = message.content
-            if isinstance(content, list):
-                for item in content:
-                    if item.get("type") == "text":
-                        prompt_parts.append(f"User: {item.get('text', '')}")
-                    elif item.get("type") == "image_url":
-                        image_url = item.get("image_url", {}).get("url")
-            else:
-                prompt_parts.append(f"User: {str(content)}")
-        elif message.role == "assistant":
-            prompt_parts.append(f"Assistant: {str(message.content)}")
-
-    if request.tools:
-        tool_prompt = format_tools_for_prompt(request.tools)
-        system_prompt += "\n" + tool_prompt
-
-    # Add final turn for the assistant to respond
-    prompt_parts.append("Assistant:")
-
-    input_data["prompt"] = "\n".join(prompt_parts)
-    if system_prompt:
-        input_data["system_prompt"] = system_prompt
-    if image_url:
-        input_data["image"] = image_url
+    """
+    Prepares the input payload for Replicate's chat models.
+    This now correctly passes the messages array for context.
+    """
+    # Convert Pydantic message objects to a list of dictionaries
+    messages_for_replicate = [msg.dict() for msg in request.messages]
+
+    payload = {
+        "messages": messages_for_replicate
+    }
+
+    # Add other compatible parameters
+    if request.max_tokens is not None:
+        payload["max_new_tokens"] = request.max_tokens
     if request.temperature is not None:
-        …
+        payload["temperature"] = request.temperature
     if request.top_p is not None:
-        …
+        payload["top_p"] = request.top_p
+
+    # Vision support: Find image URL in the last user message if present
+    last_user_message = next((m for m in reversed(request.messages) if m.role == 'user'), None)
+    if last_user_message and isinstance(last_user_message.content, list):
+        for item in last_user_message.content:
+            if item.get("type") == "image_url":
+                payload["image"] = item.get("image_url", {}).get("url")
+                # Reformat messages to be a simple prompt string for vision models if needed,
+                # as some might not support the `messages` format with images.
+                # For Claude Haiku, a prompt string is more reliable with images.
+                if "claude" in request.model:
+                    text_prompts = [item.get('text', '') for item in last_user_message.content if item.get('type') == 'text']
+                    payload["prompt"] = " ".join(text_prompts)
+                    del payload["messages"]
+                break
+
+    return payload
 
-async def …
+async def stream_replicate_native_sse(model_id: str, payload: dict):
     """
-    …
-    Yields raw JSON strings for EventSourceResponse to handle.
+    Connects to Replicate's native SSE stream for true token-by-token streaming.
     """
     url = f"https://api.replicate.com/v1/models/{model_id}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}
 
     async with httpx.AsyncClient(timeout=300) as client:
-        prediction …
+        # 1. Create the prediction to get the stream URL
         try:
-            …
+            # Add stream=True to the outer payload for Replicate
+            response = await client.post(url, headers=headers, json={"input": payload, "stream": True})
             response.raise_for_status()
             prediction = response.json()
-            …
+            stream_url = prediction.get("urls", {}).get("stream")
 
-            if not …
-                error_detail = prediction.get("detail", "Failed to …
-                …
-                yield json.dumps(error_chunk)
+            if not stream_url:
+                error_detail = prediction.get("detail", "Failed to get stream URL.")
+                yield json.dumps({"error": {"message": error_detail}})
                 return
         except httpx.HTTPStatusError as e:
-            …
-            yield json.dumps(error_chunk)
+            yield json.dumps({"error": {"message": e.response.text}})
             return
 
-        …
-                "id": prediction["id"], "object": "chat.completion.chunk", "created": int(time.time()), "model": model_id,
-                "choices": [{"index": 0, "delta": {"content": new_chunk_text}, "finish_reason": None}]
-            }
-            yield json.dumps(chunk)
-            previous_output = current_output
-
-        except Exception as e:
-            error_chunk = {"error": {"message": f"Polling error: {str(e)}", "type": "internal_error", "code": 500}}
-            yield json.dumps(error_chunk)
-            break
-
+        # 2. Connect to the SSE stream and yield OpenAI-compatible chunks
+        try:
+            async with client.stream("GET", stream_url, headers={"Accept": "text/event-stream"}) as sse:
+                sse.raise_for_status()
+                current_event = ""
+                async for line in sse.aiter_lines():
+                    if line.startswith("event:"):
+                        current_event = line[len("event:"):].strip()
+                    elif line.startswith("data:"):
+                        data = line[len("data:"):].strip()
+                        if current_event == "output":
+                            chunk = {
+                                "id": prediction["id"], "object": "chat.completion.chunk", "created": int(time.time()), "model": model_id,
+                                "choices": [{"index": 0, "delta": {"content": json.loads(data)}, "finish_reason": None}]
+                            }
+                            yield json.dumps(chunk)
+                        elif current_event == "done":
+                            break  # Exit loop when done event is received
+        except Exception as e:
+            yield json.dumps({"error": {"message": f"Streaming error: {str(e)}"}})
+
+        # 3. Send the final DONE chunk
         done_chunk = {
            "id": prediction["id"], "object": "chat.completion.chunk", "created": int(time.time()), "model": model_id,
-            "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"…
+            "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]
         }
         yield json.dumps(done_chunk)
         yield "[DONE]"
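For reference, the rewritten prepare_replicate_input maps an OpenAI-style request body to a Replicate input payload roughly as sketched below. This is a standalone illustration with plain dicts and a placeholder model key, not part of the commit; the real function operates on the app's Pydantic models.

# Sketch of the request -> payload mapping (plain dicts, placeholder
# model key "some-model-key"; the app itself uses its Pydantic models).
openai_request = {
    "model": "some-model-key",
    "messages": [
        {"role": "system", "content": "You are concise."},
        {"role": "user", "content": "Summarize SSE in one sentence."},
    ],
    "max_tokens": 128,
    "temperature": 0.2,
    "top_p": 0.9,
}

# Expected value sent as the prediction's "input":
expected_payload = {
    "messages": openai_request["messages"],  # full history, preserving context
    "max_new_tokens": 128,                   # renamed from OpenAI's max_tokens
    "temperature": 0.2,
    "top_p": 0.9,
}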
@@ -210,7 +175,7 @@ async def create_chat_completion(request: OpenAIChatCompletionRequest):
     replicate_input = prepare_replicate_input(request)
 
     if request.stream:
-        return EventSourceResponse(…
+        return EventSourceResponse(stream_replicate_native_sse(replicate_model_id, replicate_input))
 
     # Synchronous request
     url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
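A quick way to exercise the streaming path end to end is a minimal SSE client like the sketch below. It assumes the app is served locally on port 8000, that the route is the OpenAI-style /v1/chat/completions, and that "some-model-key" exists in SUPPORTED_MODELS; all three are assumptions not shown in this diff.

import json
import httpx

# Hypothetical smoke test: local server on port 8000, OpenAI-style route,
# and a valid SUPPORTED_MODELS key are all assumed.
body = {
    "model": "some-model-key",
    "messages": [{"role": "user", "content": "Count to five."}],
    "stream": True,
}
with httpx.stream("POST", "http://localhost:8000/v1/chat/completions", json=body, timeout=60) as r:
    for line in r.iter_lines():
        if not line.startswith("data:"):
            continue
        data = line[len("data:"):].strip()
        if data == "[DONE]":
            break
        event = json.loads(data)
        if "error" in event:  # error chunks from the generator carry no "choices"
            raise RuntimeError(event["error"]["message"])
        print(event["choices"][0]["delta"].get("content", ""), end="", flush=True)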
@@ -224,18 +189,9 @@ async def create_chat_completion(request: OpenAIChatCompletionRequest):
 
     output = "".join(prediction.get("output", []))
 
-    try:
-        tool_call_data = json.loads(output)
-        if tool_call_data.get("type") == "tool_call":
-            message_content, tool_calls = None, [{"id": f"call_{int(time.time())}", "type": "function", "function": {"name": tool_call_data["name"], "arguments": json.dumps(tool_call_data["arguments"])}}]
-        else:
-            message_content, tool_calls = output, None
-    except (json.JSONDecodeError, TypeError):
-        message_content, tool_calls = output, None
-
     return JSONResponse(content={
         "id": prediction["id"], "object": "chat.completion", "created": int(time.time()), "model": model_key,
-        "choices": [{"index": 0, "message": {"role": "assistant", "content": …
+        "choices": [{"index": 0, "message": {"role": "assistant", "content": output}, "finish_reason": "stop"}],
         "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
     })
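The new generator's event handling can be pictured with canned input: fed the lines below, it emits one OpenAI-style chunk per output event and stops at done. The data values are sample text only, JSON-encoded to match the json.loads call in the handler above; the exact wire format of Replicate's stream is not shown in this diff.

import json

# Canned SSE lines (sample data, not captured from Replicate).
sample_lines = [
    'event: output',
    'data: "Hel"',
    'event: output',
    'data: "lo!"',
    'event: done',
    'data: {}',
]

current_event = ""
for line in sample_lines:
    if line.startswith("event:"):
        current_event = line[len("event:"):].strip()
    elif line.startswith("data:"):
        data = line[len("data:"):].strip()
        if current_event == "output":
            print(json.loads(data), end="")  # prints: Hello!
        elif current_event == "done":
            break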