Spaces:

rkihacker
/

R2OAI

Paused

App Files Files Community

rkihacker committed on Oct 21

Commit

e014ad9

verified ·

1 Parent(s): 415ec30

Update main.py

Browse files

Files changed (1) hide show

main.py +105 -74

main.py CHANGED Viewed

@@ -16,7 +16,7 @@ if not REPLICATE_API_TOKEN:
     raise ValueError("REPLICATE_API_TOKEN environment variable not set.")
 # FastAPI Init
-app = FastAPI(title="Replicate to OpenAI Compatibility Layer", version="4.0.0 (Docs Compliant)")
 # --- Pydantic Models ---
 class ModelCard(BaseModel):
@@ -33,43 +33,43 @@ class OpenAIChatCompletionRequest(BaseModel):
 SUPPORTED_MODELS = {
     "llama3-8b-instruct": "meta/meta-llama-3-8b-instruct",
     "claude-4.5-haiku": "anthropic/claude-4.5-haiku"
 }
 # --- Core Logic ---
-def prepare_replicate_input(request: OpenAIChatCompletionRequest, replicate_model_id: str) -> Dict[str, Any]:
-    """Formats the input specifically for the requested Replicate model."""
     payload = {}
-    # Claude on Replicate strictly requires a 'prompt' string, not 'messages' array.
-    if "anthropic/claude" in replicate_model_id:
-        prompt_parts = []
-        system_prompt = None
-        for msg in request.messages:
-            if msg.role == "system":
-                 # Extract system prompt if present
-                system_prompt = str(msg.content)
-            elif msg.role == "user":
-                # Handle both simple string content and list content (for potential future vision support)
-                content = msg.content
-                if isinstance(content, list):
-                     text_parts = [item.get("text", "") for item in content if item.get("type") == "text"]
-                     content = " ".join(text_parts)
-                prompt_parts.append(f"User: {content}")
-            elif msg.role == "assistant":
-                prompt_parts.append(f"Assistant: {msg.content}")
-        # Standard Claude prompting convention
-        prompt_parts.append("Assistant:")
-        payload["prompt"] = "\n\n".join(prompt_parts)
-        if system_prompt:
-             payload["system_prompt"] = system_prompt
-    # Llama 3 and others often support the 'messages' array natively.
-    else:
-         # Convert Pydantic models to pure dicts
-         payload["prompt"] = [msg.dict() for msg in request.messages]
     # Map common OpenAI parameters to Replicate equivalents
     if request.max_tokens: payload["max_new_tokens"] = request.max_tokens
     if request.temperature: payload["temperature"] = request.temperature
     if request.top_p: payload["top_p"] = request.top_p
@@ -78,85 +78,116 @@ def prepare_replicate_input(request: OpenAIChatCompletionRequest, replicate_mode
 async def stream_replicate_sse(replicate_model_id: str, input_payload: dict):
     """Handles the full streaming lifecycle using standard Replicate endpoints."""
-    # 1. Start Prediction specifically at the named model endpoint
     url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}
     async with httpx.AsyncClient(timeout=60.0) as client:
         try:
-            # Explicitly request stream=True in the body, though often implicit
             response = await client.post(url, headers=headers, json={"input": input_payload, "stream": True})
             response.raise_for_status()
             prediction = response.json()
             stream_url = prediction.get("urls", {}).get("stream")
-            prediction_id = prediction.get("id")
             if not stream_url:
                  yield json.dumps({"error": {"message": "Model did not return a stream URL."}})
                  return
         except httpx.HTTPStatusError as e:
-             yield json.dumps({"error": {"message": e.response.text, "type": "upstream_error"}})
              return
-        # 2. Connect to the provided Stream URL
-        async with client.stream("GET", stream_url, headers={"Accept": "text/event-stream"}, timeout=None) as sse:
-            current_event = None
-            async for line in sse.aiter_lines():
-                if line.startswith("event:"):
-                    current_event = line[len("event:"):].strip()
-                elif line.startswith("data:"):
-                    data = line[len("data:"):].strip()
-                    if current_event == "output":
-                        # CRITICAL: Wrap in try/except to ignore empty keep-alive lines that crash standard parsers
-                        try:
-                            # Replicate sometimes sends raw strings, sometimes JSON.
-                            # For chat models, it's usually a raw string token.
-                            # We try to load as JSON first, if it fails, use raw data.
-                            try:
-                                content = json.loads(data)
-                            except json.JSONDecodeError:
-                                content = data
-                            if content: # Ensure we don't send empty chunks
                                 chunk = {
                                     "id": prediction_id, "object": "chat.completion.chunk", "created": int(time.time()), "model": replicate_model_id,
-                                    "choices": [{"index": 0, "delta": {"content": content}, "finish_reason": None}]
                                 }
                                 yield json.dumps(chunk)
-                        except Exception:
-                            pass # Safely ignore malformed lines
-                    elif current_event == "done":
-                        break
-    # 3. Send final [DONE] event
-    yield json.dumps({"id": prediction_id, "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]})
     yield "[DONE]"
 # --- Endpoints ---
 @app.get("/v1/models")
 async def list_models():
     return ModelList(data=[ModelCard(id=k) for k in SUPPORTED_MODELS.keys()])
 @app.post("/v1/chat/completions")
 async def create_chat_completion(request: OpenAIChatCompletionRequest):
     if request.model not in SUPPORTED_MODELS:
-        raise HTTPException(404, f"Model not found. Available: {list(SUPPORTED_MODELS.keys())}")
     replicate_id = SUPPORTED_MODELS[request.model]
-    replicate_input = prepare_replicate_input(request, replicate_id)
     if request.stream:
-        return EventSourceResponse(stream_replicate_sse(replicate_id, replicate_input))
     # Non-streaming fallback
     url = f"https://api.replicate.com/v1/models/{replicate_id}/predictions"
-    headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json", "Prefer": "wait=60"}
     async with httpx.AsyncClient() as client:
-        resp = await client.post(url, headers=headers, json={"input": replicate_input})
-        if resp.is_error: raise HTTPException(resp.status_code, resp.text)
-        pred = resp.json()
-        output = "".join(pred.get("output", []))
-        return {"id": pred["id"], "choices": [{"message": {"role": "assistant", "content": output}, "finish_reason": "stop"}]}

     raise ValueError("REPLICATE_API_TOKEN environment variable not set.")
 # FastAPI Init
+app = FastAPI(title="Replicate to OpenAI Compatibility Layer", version="4.1.0 (Context Fixed)")
 # --- Pydantic Models ---
 class ModelCard(BaseModel):
 SUPPORTED_MODELS = {
     "llama3-8b-instruct": "meta/meta-llama-3-8b-instruct",
     "claude-4.5-haiku": "anthropic/claude-4.5-haiku"
+    # You can add more models here
 }
 # --- Core Logic ---
+def prepare_replicate_input(request: OpenAIChatCompletionRequest) -> Dict[str, Any]:
+    """
+    Formats the input for Replicate API, preserving the conversational context.
+    """
     payload = {}
+    # --- CONTEXT FIX START ---
+    # Modern chat models on Replicate (like Llama 3 and Claude 4.5) expect
+    # the 'messages' array directly, just like OpenAI.
+    # We no longer need to flatten the conversation into a single prompt string.
+    # Extract system prompt if it exists, as some models take it as a separate parameter.
+    messages_for_payload = []
+    system_prompt = None
+    for msg in request.messages:
+        if msg.role == "system":
+            # Claude and some other models prefer a dedicated system_prompt field.
+            system_prompt = str(msg.content)
+        else:
+            # Handle user/assistant roles. Convert Pydantic model to a standard dict.
+            messages_for_payload.append(msg.dict())
+    # The main input for conversation is the 'messages' array.
+    payload["messages"] = messages_for_payload
+    # Add system_prompt to the payload if it was found.
+    if system_prompt:
+         payload["system_prompt"] = system_prompt
+    # --- CONTEXT FIX END ---
     # Map common OpenAI parameters to Replicate equivalents
+    # Note: Replicate's parameter for max tokens is often 'max_new_tokens'
     if request.max_tokens: payload["max_new_tokens"] = request.max_tokens
     if request.temperature: payload["temperature"] = request.temperature
     if request.top_p: payload["top_p"] = request.top_p
 async def stream_replicate_sse(replicate_model_id: str, input_payload: dict):
     """Stream a Replicate prediction as OpenAI-style chat completion chunks.

     Creates a streaming prediction for ``replicate_model_id``, follows the
     stream URL returned by Replicate and re-emits every ``output`` event as a
     ``chat.completion.chunk`` object, then a final ``stop`` chunk and the
     literal ``[DONE]`` sentinel.

     Args:
         replicate_model_id: Replicate ``owner/name`` model identifier.
         input_payload: Value sent as the prediction's ``input`` field.

     Yields:
         JSON-encoded strings, one per chunk, followed by ``"[DONE]"``. When
         something goes wrong a single ``{"error": ...}`` object is yielded and
         the generator stops without emitting ``[DONE]``.
     """
     url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}

     async with httpx.AsyncClient(timeout=60.0) as client:
         # 1. Start the prediction and obtain its stream URL.
         try:
             response = await client.post(url, headers=headers, json={"input": input_payload, "stream": True})
             response.raise_for_status()
             prediction = response.json()
         except httpx.HTTPStatusError as e:
             error_details = e.response.text
             try:
                 error_json = e.response.json()
             except ValueError:
                 error_json = None  # Body is not JSON; keep the raw text.
             # Only a JSON object can carry a "detail" field.
             if isinstance(error_json, dict):
                 error_details = error_json.get("detail", error_details)
             yield json.dumps({"error": {"message": f"Upstream Error: {error_details}", "type": "replicate_error"}})
             return
         except httpx.RequestError as e:
             # Connection/timeout failures never produce a response object.
             yield json.dumps({"error": {"message": f"Upstream Error: {e}", "type": "replicate_error"}})
             return

         # "urls" may be present but null, so do not rely on the .get default.
         stream_url = (prediction.get("urls") or {}).get("stream")
         prediction_id = prediction.get("id", "stream-unknown")
         if not stream_url:
             yield json.dumps({"error": {"message": "Model did not return a stream URL."}})
             return

         # 2. Read the Server-Sent Events stream.
         try:
             async with client.stream("GET", stream_url, headers={"Accept": "text/event-stream"}, timeout=None) as sse:
                 current_event = None
                 data_lines = []
                 async for raw_line in sse.aiter_lines():
                     # Some httpx versions keep the line terminator; drop it.
                     line = raw_line.rstrip("\r\n")

                     if line:
                         if line.startswith(":"):
                             continue  # SSE comment / keep-alive.
                         field, _, value = line.partition(":")
                         # SSE strips exactly ONE leading space. Stripping all
                         # whitespace would glue tokens such as " world"
                         # onto the previous word.
                         if value.startswith(" "):
                             value = value[1:]
                         if field == "event":
                             current_event = value
                         elif field == "data":
                             data_lines.append(value)
                         continue

                     # A blank line terminates the current event: dispatch it.
                     # Multiple data lines belong to one payload, joined by "\n".
                     text = "\n".join(data_lines)
                     event, current_event, data_lines = current_event, None, []

                     if event == "output":
                         if text:  # Ensure we don't send empty chunks
                             chunk = {
                                 "id": prediction_id, "object": "chat.completion.chunk", "created": int(time.time()), "model": replicate_model_id,
                                 "choices": [{"index": 0, "delta": {"content": text}, "finish_reason": None}]
                             }
                             yield json.dumps(chunk)
                     elif event == "error":
                         # Surface prediction failures instead of reporting a normal stop.
                         yield json.dumps({"error": {"message": f"Upstream Error: {text}", "type": "replicate_error"}})
                         return
                     elif event == "done":
                         # The 'done' event signals the end of the stream.
                         break
         except httpx.ReadTimeout:
             yield json.dumps({"error": {"message": "Stream timed out.", "type": "timeout_error"}})
             return
         except httpx.HTTPError as e:
             # Any other transport/protocol failure while reading the stream.
             yield json.dumps({"error": {"message": f"Upstream Error: {e}", "type": "replicate_error"}})
             return

     # 3. Send the final termination chunk in OpenAI format
     final_chunk = {
         "id": prediction_id, "object": "chat.completion.chunk", "created": int(time.time()), "model": replicate_model_id,
         "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]
     }
     yield json.dumps(final_chunk)
     # OpenAI-compatible clients expect a final "[DONE]" message.
     yield "[DONE]"
 # --- Endpoints ---
 @app.get("/v1/models")
 async def list_models():
     """Return every model alias this proxy exposes, wrapped in a ModelList."""
     cards = [ModelCard(id=alias) for alias in SUPPORTED_MODELS]
     return ModelList(data=cards)
 @app.post("/v1/chat/completions")
 async def create_chat_completion(request: OpenAIChatCompletionRequest):
     """Handle an OpenAI-style chat completion request, streaming or not.

     Args:
         request: Parsed request body; ``request.model`` must be a key of
             ``SUPPORTED_MODELS``.

     Returns:
         An ``EventSourceResponse`` when ``request.stream`` is truthy, otherwise
         a dict shaped like an OpenAI ``chat.completion`` object.

     Raises:
         HTTPException: 404 for an unknown model; the upstream status code when
             Replicate answers with an HTTP error; 502 when Replicate cannot be
             reached, reports a prediction error, or returns no output.
     """
     if request.model not in SUPPORTED_MODELS:
         raise HTTPException(status_code=404, detail=f"Model not found. Available models: {list(SUPPORTED_MODELS.keys())}")
     replicate_id = SUPPORTED_MODELS[request.model]
     replicate_input = prepare_replicate_input(request)

     if request.stream:
         # Return a streaming response
         return EventSourceResponse(stream_replicate_sse(replicate_id, replicate_input), media_type="text/event-stream")

     # Non-streaming fallback: ask Replicate to hold the request open until done.
     url = f"https://api.replicate.com/v1/models/{replicate_id}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json", "Prefer": "wait=120"} # Increased wait time
     async with httpx.AsyncClient() as client:
         try:
             # Client timeout is slightly above the server-side wait window.
             resp = await client.post(url, headers=headers, json={"input": replicate_input}, timeout=130.0)
             resp.raise_for_status()
         except httpx.HTTPStatusError as e:
             raise HTTPException(status_code=e.response.status_code, detail=f"Error from Replicate API: {e.response.text}") from e
         except httpx.RequestError as e:
             # Connection failures and timeouts carry no upstream status code.
             raise HTTPException(status_code=502, detail=f"Error contacting Replicate API: {e}") from e

     pred = resp.json()
     if pred.get("error"):
         raise HTTPException(status_code=502, detail=f"Error from Replicate API: {pred['error']}")

     # "output" is null while a prediction is unfinished or after it failed;
     # joining None would raise TypeError and surface as an opaque 500.
     raw_output = pred.get("output")
     if raw_output is None:
         raise HTTPException(
             status_code=502,
             detail=f"Replicate prediction returned no output (status: {pred.get('status')})",
         )
     if isinstance(raw_output, str):
         output = raw_output
     else:
         # The output of chat models is typically a list of strings (tokens)
         output = "".join(str(part) for part in raw_output)

     return {
         "id": pred.get("id"),
         "object": "chat.completion",
         "created": int(time.time()),
         "model": request.model,
         "choices": [{
             "index": 0,
             "message": {"role": "assistant", "content": output},
             "finish_reason": "stop"
         }],
         "usage": { # Placeholder usage object
             "prompt_tokens": 0,
             "completion_tokens": 0,
             "total_tokens": 0
         }
     }