Update main.py
main.py
CHANGED
@@ -20,7 +20,7 @@ if not REPLICATE_API_TOKEN:
 # --- FastAPI App Initialization ---
 app = FastAPI(
     title="Replicate to OpenAI Compatibility Layer",
-    version="2.
+    version="2.2.0 (Stable Streaming)",
 )
 
 # --- Pydantic Models ---
@@ -45,14 +45,9 @@ SUPPORTED_MODELS = {
 # --- Helper Functions ---
 
 def prepare_replicate_input(request: OpenAIChatCompletionRequest) -> Dict[str, Any]:
-    """
-    Prepares the input payload for Replicate, handling model-specific formats.
-    """
+    """Prepares the input payload for Replicate, handling model-specific formats."""
     payload = {}
 
-    # *** THIS IS THE CRITICAL FIX ***
-    # Claude models on Replicate require a single 'prompt' string.
-    # We must convert the 'messages' array into a formatted string.
     if "claude" in request.model:
         prompt_parts = []
         system_prompt = None
@@ -72,6 +67,9 @@ def prepare_replicate_input(request: OpenAIChatCompletionRequest) -> Dict[str, A
                 prompt_parts.append(f"User: {msg.content}")
             elif msg.role == "assistant":
                 prompt_parts.append(f"Assistant: {msg.content}")
+
+        # Add final turn for the assistant to respond
+        prompt_parts.append("Assistant:")
 
         payload["prompt"] = "\n".join(prompt_parts)
         if system_prompt:
@@ -79,11 +77,9 @@ def prepare_replicate_input(request: OpenAIChatCompletionRequest) -> Dict[str, A
         if image_url:
             payload["image"] = image_url
 
-    #
-    else:
+    else:  # Llama-3 and other standard chat models
         payload["messages"] = [msg.dict() for msg in request.messages]
 
-    # Add common parameters
     if request.max_tokens is not None:
         payload["max_new_tokens"] = request.max_tokens
     if request.temperature is not None:
@@ -126,7 +122,10 @@ async def stream_replicate_native_sse(model_id: str, payload: dict):
                 current_event = line[len("event:"):].strip()
             elif line.startswith("data:"):
                 data = line[len("data:"):].strip()
-
+
+                # *** THIS IS THE CRITICAL FIX ***
+                # Only process non-empty data for 'output' events
+                if data and current_event == "output":
                     chunk = {
                         "id": prediction["id"], "object": "chat.completion.chunk", "created": int(time.time()), "model": model_id,
                         "choices": [{"index": 0, "delta": {"content": json.loads(data)}, "finish_reason": None}]