Update main.py
main.py
CHANGED
@@ -16,7 +16,7 @@ if not REPLICATE_API_TOKEN:
     raise ValueError("REPLICATE_API_TOKEN environment variable not set.")
 
 # FastAPI Init
-app = FastAPI(title="Replicate to OpenAI Compatibility Layer", version="7.
+app = FastAPI(title="Replicate to OpenAI Compatibility Layer", version="7.1.0 (Streaming Space Fix)")
 
 # --- Pydantic Models ---
 class ModelCard(BaseModel):
@@ -39,51 +39,41 @@ SUPPORTED_MODELS = {
 # --- Core Logic ---
 def prepare_replicate_input(request: OpenAIChatCompletionRequest) -> Dict[str, Any]:
     """
-    Formats the input for
-
-
+    Formats the input for Replicate's API, flattening the message history into a
+    single 'prompt' string and handling images separately. This is the required
+    format for all their current chat/vision models.
     """
     payload = {}
-
     prompt_parts = []
     system_prompt = None
     image_input = None
 
     for msg in request.messages:
         if msg.role == "system":
-            # Extract system prompt; it will be a separate parameter.
             system_prompt = str(msg.content)
         elif msg.role == "assistant":
             prompt_parts.append(f"Assistant: {msg.content}")
         elif msg.role == "user":
             user_text_content = ""
             if isinstance(msg.content, list):
-                # Handle multimodal (vision) input from OpenAI format
                 for item in msg.content:
                     if item.get("type") == "text":
                         user_text_content += item.get("text", "")
                     elif item.get("type") == "image_url":
                         image_url_data = item.get("image_url", {})
-                        # The 'image' parameter is used by Claude, Llava, etc., on Replicate
                        image_input = image_url_data.get("url")
             else:
                 user_text_content = str(msg.content)
-
             prompt_parts.append(f"User: {user_text_content}")
 
-    # The final "Assistant:" turn prompts the model for a response.
     prompt_parts.append("Assistant:")
-
-    # All models on Replicate's API expect a single 'prompt' string.
     payload["prompt"] = "\n\n".join(prompt_parts)
 
     if system_prompt:
         payload["system_prompt"] = system_prompt
-
     if image_input:
         payload["image"] = image_input
 
-    # Map common OpenAI parameters to Replicate equivalents
     if request.max_tokens: payload["max_new_tokens"] = request.max_tokens
     if request.temperature: payload["temperature"] = request.temperature
     if request.top_p: payload["top_p"] = request.top_p
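For context on what this function emits, here is a minimal sketch of the flattening logic above, run on an illustrative message list (the message contents are made up, not from this commit):

    # Mirrors prepare_replicate_input's flattening; messages are illustrative.
    messages = [
        {"role": "system", "content": "You are terse."},
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hi."},
        {"role": "user", "content": "What is 2+2?"},
    ]

    prompt_parts = []
    system_prompt = None
    for msg in messages:
        if msg["role"] == "system":
            system_prompt = str(msg["content"])  # sent as payload["system_prompt"]
        elif msg["role"] == "assistant":
            prompt_parts.append(f"Assistant: {msg['content']}")
        elif msg["role"] == "user":
            prompt_parts.append(f"User: {msg['content']}")
    prompt_parts.append("Assistant:")  # the final turn cues the model to respond

    print("\n\n".join(prompt_parts))
    # User: Hello
    #
    # Assistant: Hi.
    #
    # User: What is 2+2?
    #
    # Assistant: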
@@ -91,7 +81,7 @@ def prepare_replicate_input(request: OpenAIChatCompletionRequest) -> Dict[str, Any]:
     return payload
 
 async def stream_replicate_sse(replicate_model_id: str, input_payload: dict):
-    """Handles the full streaming lifecycle
+    """Handles the full streaming lifecycle with robust token parsing."""
     url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}
 
@@ -102,20 +92,17 @@ async def stream_replicate_sse(replicate_model_id: str, input_payload: dict):
             prediction = response.json()
             stream_url = prediction.get("urls", {}).get("stream")
             prediction_id = prediction.get("id", "stream-unknown")
-
             if not stream_url:
-
-
-
+                yield json.dumps({"error": {"message": "Model did not return a stream URL."}})
+                return
         except httpx.HTTPStatusError as e:
-
-
-
-
-
-
-
-            return
+            error_details = e.response.text
+            try:
+                error_json = e.response.json()
+                error_details = error_json.get("detail", error_details)
+            except json.JSONDecodeError: pass
+            yield json.dumps({"error": {"message": f"Upstream Error: {error_details}", "type": "replicate_error"}})
+            return
 
         try:
             async with client.stream("GET", stream_url, headers={"Accept": "text/event-stream"}, timeout=None) as sse:
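For reference, the rewritten except branch surfaces upstream failures to the client as a single JSON error event instead of returning silently; a small standalone sketch of the envelope it yields (the upstream "detail" text is invented):

    import json

    # Hypothetical Replicate error body; the real "detail" text will vary.
    upstream = {"detail": "Input validation failed: prompt is required."}

    error_details = upstream.get("detail", "")
    print(json.dumps({"error": {"message": f"Upstream Error: {error_details}",
                                "type": "replicate_error"}}))
    # {"error": {"message": "Upstream Error: Input validation failed: prompt is required.", "type": "replicate_error"}}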
@@ -126,12 +113,29 @@ async def stream_replicate_sse(replicate_model_id: str, input_payload: dict):
                     elif line.startswith("data:"):
                         data = line[len("data:"):].strip()
                         if current_event == "output":
-
+                            # --- START OF STREAMING FIX ---
+                            # Replicate streams tokens that can be plain text or JSON-encoded strings.
+                            # We need to robustly parse them to preserve spaces correctly.
+                            content_token = ""
+                            try:
+                                # Attempt to parse data as JSON. This handles tokens like "\" Hello\""
+                                decoded_data = json.loads(data)
+                                if isinstance(decoded_data, str):
+                                    content_token = decoded_data
+                                else:
+                                    # It's some other JSON type, convert to string
+                                    content_token = str(decoded_data)
+                            except json.JSONDecodeError:
+                                # It's not valid JSON, so it's a plain text token.
+                                content_token = data
+
+                            if content_token:
                                 chunk = {
                                     "id": prediction_id, "object": "chat.completion.chunk", "created": int(time.time()), "model": replicate_model_id,
-                                    "choices": [{"index": 0, "delta": {"content":
+                                    "choices": [{"index": 0, "delta": {"content": content_token}, "finish_reason": None}]
                                 }
                                 yield json.dumps(chunk)
+                            # --- END OF STREAMING FIX ---
                         elif current_event == "done":
                             break
         except httpx.ReadTimeout:
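To see why the JSON-decode pass in this hunk fixes the spacing, consider a standalone version of the same parse (token values are illustrative). The .strip() on the raw data line eats a leading space on a bare token, but JSON quoting protects it, and the decode recovers it:

    import json

    def parse_token(data: str) -> str:
        # Same strategy as the fix: JSON-decode when possible, else plain text.
        try:
            decoded = json.loads(data)
            return decoded if isinstance(decoded, str) else str(decoded)
        except json.JSONDecodeError:
            return data

    print(repr(parse_token('" world"')))  # ' world'  (leading space preserved)
    print(repr(parse_token('world')))     # 'world'   (plain token passes through)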
@@ -148,12 +152,10 @@ async def stream_replicate_sse(replicate_model_id: str, input_payload: dict):
 # --- Endpoints ---
 @app.get("/v1/models")
 async def list_models():
-    """Lists the currently supported models."""
     return ModelList(data=[ModelCard(id=k) for k in SUPPORTED_MODELS.keys()])
 
 @app.post("/v1/chat/completions")
 async def create_chat_completion(request: OpenAIChatCompletionRequest):
-    """Handles chat completion requests, streaming or non-streaming."""
     if request.model not in SUPPORTED_MODELS:
         raise HTTPException(status_code=404, detail=f"Model not found. Available models: {list(SUPPORTED_MODELS.keys())}")
 
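Finally, a hedged end-to-end check of the two endpoints, assuming the app is served locally on port 8000, that the generator above is wrapped as a proper SSE response elsewhere in main.py, and that the model id below exists in SUPPORTED_MODELS (which is defined outside the hunks shown):

    # Usage sketch via the standard openai client; base_url, port, and model id
    # are assumptions, not taken from this commit.
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")

    print([m.id for m in client.models.list().data])  # GET /v1/models

    stream = client.chat.completions.create(
        model="meta/meta-llama-3-8b-instruct",  # hypothetical SUPPORTED_MODELS entry
        messages=[{"role": "user", "content": "Say hello."}],
        stream=True,  # exercises stream_replicate_sse
    )
    for chunk in stream:
        if chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="")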