Update main.py
main.py (CHANGED)
@@ -20,7 +20,7 @@ if not REPLICATE_API_TOKEN:
 # --- FastAPI App Initialization ---
 app = FastAPI(
     title="Replicate to OpenAI Compatibility Layer",
-    version="
+    version="3.0.0 (Production Grade)",
 )
 
 # --- Pydantic Models ---
@@ -36,57 +36,66 @@ class ChatMessage(BaseModel):
 class OpenAIChatCompletionRequest(BaseModel):
     model: str; messages: List[ChatMessage]; temperature: Optional[float] = 0.7; top_p: Optional[float] = 1.0; max_tokens: Optional[int] = None; stream: Optional[bool] = False
 
-# --- Model Mapping ---
+# --- Model Mapping with Explicit Version Hashes (Inspired by LiteLLM) ---
 SUPPORTED_MODELS = {
-    "llama3-8b-instruct":
+    "llama3-8b-instruct": {
+        "id": "meta/meta-llama-3-8b-instruct",
+        "version": "02741d1be9a932e6566058d4c92ab80332f143003b5a874f63c9b743e4f3583c",
+        "input_type": "messages"
+    },
+    "claude-4.5-haiku": {
+        "id": "anthropic/claude-4.5-haiku",
+        "version": "311c5ff9b9f71c9ebd401b34a41ce604a8b735def3a4aad56f671302b5c56784",
+        "input_type": "prompt"
+    }
 }
 
 # --- Helper Functions ---
 
-def
-    """
+def build_replicate_request_body(request: OpenAIChatCompletionRequest, model_details: dict) -> dict:
+    """Builds the complete request body, including the crucial version hash."""
+    input_payload = {}
+
+    # Handle model-specific input format (prompt vs messages)
+    if model_details["input_type"] == "prompt":
         prompt_parts = []
         system_prompt = None
-        image_url = None
         for msg in request.messages:
             if msg.role == "system":
                 system_prompt = str(msg.content)
             elif msg.role == "user":
-                for item in msg.content:
-                    if item.get("type") == "text":
-                        prompt_parts.append(f"User: {item.get('text', '')}")
-                    elif item.get("type") == "image_url":
-                        image_url = item.get("image_url", {}).get("url")
-                else:
-                    prompt_parts.append(f"User: {msg.content}")
+                prompt_parts.append(f"User: {msg.content}")
             elif msg.role == "assistant":
                 prompt_parts.append(f"Assistant: {msg.content}")
-        prompt_parts.append("Assistant:")
-        if system_prompt:
+        prompt_parts.append("Assistant:")  # Cue the model to respond
+        input_payload["prompt"] = "\n".join(prompt_parts)
+        if system_prompt: input_payload["system_prompt"] = system_prompt
+    else:  # "messages"
+        input_payload["messages"] = [msg.dict() for msg in request.messages]
+
-    if request.max_tokens is not None:
-    if request.temperature is not None:
-    if request.top_p is not None:
+    # Add common parameters
+    if request.max_tokens is not None: input_payload["max_new_tokens"] = request.max_tokens
+    if request.temperature is not None: input_payload["temperature"] = request.temperature
+    if request.top_p is not None: input_payload["top_p"] = request.top_p
+
+    return {
+        "version": model_details["version"],
+        "input": input_payload
+    }
+
+async def stream_replicate_native_sse(model_id: str, request_body: dict):
+    """Connects to Replicate's native SSE stream for true token-by-token streaming."""
+    # Note: We call the generic predictions endpoint when providing a version hash.
+    url = "https://api.replicate.com/v1/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}
 
+    # Add stream=True to the request body
+    request_body["stream"] = True
+
     async with httpx.AsyncClient(timeout=300) as client:
         prediction = None
         try:
-            response = await client.post(url, headers=headers, json=
+            response = await client.post(url, headers=headers, json=request_body)
             response.raise_for_status()
             prediction = response.json()
             stream_url = prediction.get("urls", {}).get("stream")
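For reference (not part of the diff): with the mapping above, a request routed to the "prompt"-style claude-4.5-haiku entry should yield a Replicate prediction body shaped roughly like the sketch below. The prompt text and sampling values are illustrative; only the version/input envelope and key names come from build_replicate_request_body.

# Illustrative return value of build_replicate_request_body() for a "prompt"-type model
example_body = {
    "version": "311c5ff9b9f71c9ebd401b34a41ce604a8b735def3a4aad56f671302b5c56784",
    "input": {
        "prompt": "User: Say hi\nAssistant:",
        "system_prompt": "You are concise.",  # only present if a system message was sent
        "max_new_tokens": 128,
        "temperature": 0.7,
        "top_p": 1.0
    }
}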
@@ -109,11 +118,7 @@ async def stream_replicate_native_sse(model_id: str, payload: dict):
                 current_event = line[len("event:"):].strip()
             elif line.startswith("data:"):
                 data = line[len("data:"):].strip()
                 if current_event == "output":
-                    # *** THIS IS THE DEFINITIVE FIX ***
-                    # Wrap the JSON parsing in a try-except block to gracefully
-                    # handle empty or malformed data lines without crashing.
                     try:
                         content = json.loads(data)
                         chunk = {
@@ -122,7 +127,7 @@ async def stream_replicate_native_sse(model_id: str, payload: dict):
                         }
                         yield json.dumps(chunk)
                     except json.JSONDecodeError:
-                        #
+                        # Silently ignore malformed or empty data lines
                         pass
                 elif current_event == "done":
                     break
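Also for reference: a minimal way to exercise the streaming path end to end. The base URL and the OpenAI-style /v1/chat/completions route are assumptions for illustration, not shown in this diff; each SSE "data:" line carries one JSON chunk emitted by the generator above.

# Sketch of a streaming client against the compatibility layer (assumed URL/route)
import json
import httpx

payload = {
    "model": "llama3-8b-instruct",
    "messages": [{"role": "user", "content": "Hello"}],
    "stream": True,
}
with httpx.stream("POST", "http://localhost:8000/v1/chat/completions", json=payload, timeout=None) as resp:
    for line in resp.iter_lines():
        if line.startswith("data:"):
            print(json.loads(line[len("data:"):].strip()))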
@@ -147,18 +152,19 @@ async def create_chat_completion(request: OpenAIChatCompletionRequest):
     if model_key not in SUPPORTED_MODELS:
         raise HTTPException(status_code=404, detail=f"Model not found. Supported models: {list(SUPPORTED_MODELS.keys())}")
 
+    model_details = SUPPORTED_MODELS[model_key]
+    replicate_request_body = build_replicate_request_body(request, model_details)
 
     if request.stream:
-        return EventSourceResponse(stream_replicate_native_sse(
+        return EventSourceResponse(stream_replicate_native_sse(model_details["id"], replicate_request_body))
 
+    # Synchronous request
+    url = "https://api.replicate.com/v1/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json", "Prefer": "wait=120"}
 
     async with httpx.AsyncClient(timeout=150) as client:
         try:
-            response = await client.post(url, headers=headers, json=
+            response = await client.post(url, headers=headers, json=replicate_request_body)
             response.raise_for_status()
             prediction = response.json()
             output = "".join(prediction.get("output", []))
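Finally, the synchronous path can be sanity-checked with an ordinary OpenAI-shaped request; again the base URL and route are assumptions for illustration.

# Sketch of a blocking (non-streaming) request against the compatibility layer
import httpx

payload = {
    "model": "claude-4.5-haiku",
    "messages": [
        {"role": "system", "content": "You are concise."},
        {"role": "user", "content": "Say hi"},
    ],
}
resp = httpx.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=150)
resp.raise_for_status()
print(resp.json())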