Update main.py
main.py CHANGED
@@ -20,7 +20,7 @@ if not REPLICATE_API_TOKEN:
 # --- FastAPI App Initialization ---
 app = FastAPI(
     title="Replicate to OpenAI Compatibility Layer",
-    version="2.
+    version="2.3.0 (Definitive Streaming Fix)",
 )
 
 # --- Pydantic Models ---
@@ -52,41 +52,30 @@ def prepare_replicate_input(request: OpenAIChatCompletionRequest) -> Dict[str, A
         prompt_parts = []
         system_prompt = None
         image_url = None
-
         for msg in request.messages:
             if msg.role == "system":
                 system_prompt = str(msg.content)
             elif msg.role == "user":
-                if isinstance(msg.content, list):
+                if isinstance(msg.content, list):
                     for item in msg.content:
                         if item.get("type") == "text":
                             prompt_parts.append(f"User: {item.get('text', '')}")
                         elif item.get("type") == "image_url":
                             image_url = item.get("image_url", {}).get("url")
-                else:
+                else:
                     prompt_parts.append(f"User: {msg.content}")
             elif msg.role == "assistant":
                 prompt_parts.append(f"Assistant: {msg.content}")
-
-        # Add final turn for the assistant to respond
         prompt_parts.append("Assistant:")
-
         payload["prompt"] = "\n".join(prompt_parts)
-        if system_prompt:
-            payload["system_prompt"] = system_prompt
-        if image_url:
-            payload["image"] = image_url
-
-    else: # Llama-3 and other standard chat models
+        if system_prompt: payload["system_prompt"] = system_prompt
+        if image_url: payload["image"] = image_url
+    else:
         payload["messages"] = [msg.dict() for msg in request.messages]
 
-    if request.max_tokens is not None:
-        payload["max_new_tokens"] = request.max_tokens
-    if request.temperature is not None:
-        payload["temperature"] = request.temperature
-    if request.top_p is not None:
-        payload["top_p"] = request.top_p
-
+    if request.max_tokens is not None: payload["max_new_tokens"] = request.max_tokens
+    if request.temperature is not None: payload["temperature"] = request.temperature
+    if request.top_p is not None: payload["top_p"] = request.top_p
     return payload
 
 async def stream_replicate_native_sse(model_id: str, payload: dict):
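For reference, the rewritten prepare_replicate_input now produces one of two payload shapes. A minimal sketch of both, under stated assumptions: the branch condition that selects the prompt/image path (a vision-style model) sits outside this hunk, and every concrete value below is a placeholder.

# Illustrative only: field names are taken from the hunk above, values are made up.
# Vision-style branch: flattened transcript plus optional system prompt and image.
vision_payload = {
    "prompt": "User: What is in this picture?\nAssistant:",
    "system_prompt": "You are a helpful assistant.",  # set only if a system message was sent
    "image": "https://example.com/cat.png",           # set only if an image_url part was sent
    "max_new_tokens": 256,                            # mapped from OpenAI's max_tokens
}

# Standard chat branch (e.g. Llama-3): messages forwarded as-is.
chat_payload = {
    "messages": [{"role": "user", "content": "Hello!"}],
    "temperature": 0.7,
    "top_p": 0.9,
}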
@@ -95,6 +84,7 @@ async def stream_replicate_native_sse(model_id: str, payload: dict):
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}
 
     async with httpx.AsyncClient(timeout=300) as client:
+        prediction = None
         try:
             response = await client.post(url, headers=headers, json={"input": payload, "stream": True})
             response.raise_for_status()
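The new prediction = None initialization matters because the done_chunk emitted at the end of this generator (see the last hunk of this function below) reads prediction["id"]; on any path where the prediction was never created, the name would otherwise be unbound. A minimal sketch of the hazard, using hypothetical stand-in functions rather than the real request flow:

def finish_without_guard(fail: bool = True):
    try:
        if fail:
            raise RuntimeError("POST failed")  # stand-in for a failed request
        prediction = {"id": "abc"}
    except RuntimeError:
        pass
    return prediction["id"]  # UnboundLocalError when fail=True

def finish_with_guard(fail: bool = True):
    prediction = None  # mirrors the guard added above
    try:
        if fail:
            raise RuntimeError("POST failed")
        prediction = {"id": "abc"}
    except RuntimeError:
        pass
    return prediction["id"] if prediction else "unknown"

assert finish_with_guard() == "unknown"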
@@ -106,11 +96,8 @@ async def stream_replicate_native_sse(model_id: str, payload: dict):
                 yield json.dumps({"error": {"message": error_detail}})
                 return
         except httpx.HTTPStatusError as e:
-            try:
-                error_body = e.response.json()
-                yield json.dumps({"error": {"message": json.dumps(error_body)}})
-            except json.JSONDecodeError:
-                yield json.dumps({"error": {"message": e.response.text}})
+            try: yield json.dumps({"error": {"message": json.dumps(e.response.json())}})
+            except: yield json.dumps({"error": {"message": e.response.text}})
             return
 
         try:
@@ -123,21 +110,27 @@ async def stream_replicate_native_sse(model_id: str, payload: dict):
                     elif line.startswith("data:"):
                         data = line[len("data:"):].strip()
 
-                        if current_event == "output":
-                            content = json.loads(data)
-                            chunk = {
-                                "id": prediction["id"], "object": "chat.completion.chunk", "created": int(time.time()), "model": model_id,
-                                "choices": [{"index": 0, "delta": {"content": content}, "finish_reason": None}]
-                            }
-                            yield json.dumps(chunk)
+                        if current_event == "output":
+                            # *** THIS IS THE DEFINITIVE FIX ***
+                            # Wrap the JSON parsing in a try-except block to gracefully
+                            # handle empty or malformed data lines without crashing.
+                            try:
+                                content = json.loads(data)
+                                chunk = {
+                                    "id": prediction["id"], "object": "chat.completion.chunk", "created": int(time.time()), "model": model_id,
+                                    "choices": [{"index": 0, "delta": {"content": content}, "finish_reason": None}]
+                                }
+                                yield json.dumps(chunk)
+                            except json.JSONDecodeError:
+                                # This will silently ignore any non-JSON data, like empty strings.
+                                pass
                         elif current_event == "done":
                             break
         except Exception as e:
             yield json.dumps({"error": {"message": f"Streaming error: {str(e)}"}})
 
         done_chunk = {
-            "id": prediction["id"], "object": "chat.completion.chunk", "created": int(time.time()), "model": model_id,
+            "id": prediction["id"] if prediction else "unknown", "object": "chat.completion.chunk", "created": int(time.time()), "model": model_id,
             "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]
         }
         yield json.dumps(done_chunk)
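Context for the fix: per the commit's own comment, the SSE stream can deliver "output" events whose data line is empty or malformed, and json.loads("") raises json.JSONDecodeError, which previously escaped to the outer except and aborted the stream. A standalone sketch of the guarded parse, mirroring the logic above:

import json

def parse_output_data(data: str):
    # Mirrors the guarded parsing above: return the decoded content,
    # or None for empty/malformed data lines instead of raising.
    try:
        return json.loads(data)
    except json.JSONDecodeError:
        return None

assert parse_output_data('"Hello"') == "Hello"  # normal token chunk
assert parse_output_data("") is None            # empty data line no longer crashes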
@@ -160,7 +153,6 @@ async def create_chat_completion(request: OpenAIChatCompletionRequest):
     if request.stream:
         return EventSourceResponse(stream_replicate_native_sse(replicate_model_id, replicate_input))
 
-    # Synchronous request
     url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json", "Prefer": "wait=120"}
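With both paths patched, the endpoint should be callable by any OpenAI-style client. A hedged usage sketch, assuming the server runs locally on port 8000 and create_chat_completion is mounted at the conventional /v1/chat/completions route (the route decorator is not shown in this diff); the model id and all values are placeholders:

import httpx

resp = httpx.post(
    "http://localhost:8000/v1/chat/completions",  # assumed host and route
    json={
        "model": "meta/meta-llama-3-8b-instruct",  # placeholder model id
        "messages": [{"role": "user", "content": "Say hi in one word."}],
        "max_tokens": 16,
        "stream": False,  # synchronous path; upstream waits via Prefer: wait=120
    },
    timeout=150,  # allow for the upstream 120s wait
)
print(resp.json())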