rkihacker committed
Commit 67e24f8 · verified · 1 Parent(s): 5e2bd86

Update main.py

Files changed (1)
  1. main.py +66 -66
main.py CHANGED
@@ -22,7 +22,7 @@ if not SERVER_API_KEY:
     raise ValueError("SERVER_API_KEY environment variable not set. This is required to protect your server.")
 
 # FastAPI Init
-app = FastAPI(title="Replicate to OpenAI Compatibility Layer", version="9.3.0 (Streaming Fix)")
+app = FastAPI(title="Replicate to OpenAI Compatibility Layer", version="9.2.8 (Raw Output Fix)")
 
 # --- Authentication ---
 security = HTTPBearer()
@@ -52,7 +52,7 @@ class ModelList(BaseModel):
 
 class ChatMessage(BaseModel):
     role: Literal["system", "user", "assistant", "tool"]
-    content: Union[str, List[Dict[str, Any]], None] # Allow content to be None for tool calls
+    content: Union[str, List[Dict[str, Any]]]
     name: Optional[str] = None
     tool_calls: Optional[List[Any]] = None
 
@@ -127,37 +127,33 @@ class ChatCompletionChunk(BaseModel):
 # --- Supported Models ---
 SUPPORTED_MODELS = {
     "llama3-8b-instruct": "meta/meta-llama-3-8b-instruct",
-    "claude-4.5-haiku": "anthropic/claude-3-haiku-20240307",
-    "claude-4.5-sonnet": "anthropic/claude-3.5-sonnet-20240620", # Updated to correct model ID
+    "claude-4.5-haiku": "anthropic/claude-4.5-haiku",
+    "claude-4.5-sonnet": "anthropic/claude-4.5-sonnet",
     "llava-13b": "yorickvp/llava-13b:e272157381e2a3bf12df3a8edd1f38d1dbd736bbb7437277c8b34175f8fce358"
 }
 
 # --- Core Logic ---
 
 def generate_request_id() -> str:
-    """Generates a unique request ID."""
-    return f"chatcmpl-{secrets.token_hex(16)}"
+    """Generates a unique request ID in the user-specified format."""
+    return f"gen-{int(time.time())}-{secrets.token_hex(8)}"
 
 def format_messages_for_replicate(messages: List[ChatMessage], functions: Optional[List[FunctionDefinition]] = None) -> Dict[str, Any]:
     prompt_parts = []
     system_prompt = None
     image_input = None
 
-    # Handle functions/tools if provided
-    tools_prompt_section = ""
     if functions:
-        tools_prompt_section += "You have access to the following tools. Use them if required to answer the user's question.\n\n"
+        functions_text = "You have access to the following tools. Use them if required to answer the user's question.\n\n"
         for func in functions:
-            tools_prompt_section += f"- Function: {func.name}\n"
-            if func.description: tools_prompt_section += f" Description: {func.description}\n"
-            if func.parameters: tools_prompt_section += f" Parameters: {json.dumps(func.parameters)}\n"
-        tools_prompt_section += "\nTo call a function, respond with a JSON object like this: {\"name\": \"function_name\", \"arguments\": {\"arg1\": \"value1\"}}\n"
+            functions_text += f"- Function: {func.name}\n"
+            if func.description: functions_text += f" Description: {func.description}\n"
+            if func.parameters: functions_text += f" Parameters: {json.dumps(func.parameters)}\n"
+        prompt_parts.append(functions_text)
 
     for msg in messages:
         if msg.role == "system":
             system_prompt = str(msg.content)
-            if tools_prompt_section:
-                system_prompt += "\n\n" + tools_prompt_section
         elif msg.role == "assistant":
             if msg.tool_calls:
                 tool_calls_text = "\nTool calls:\n"
@@ -181,12 +177,9 @@ def format_messages_for_replicate(messages: List[ChatMessage], functions: Option
             user_text_content = str(msg.content)
             prompt_parts.append(f"User: {user_text_content}")
 
-    if not system_prompt and tools_prompt_section:
-        system_prompt = tools_prompt_section
-
-    prompt_parts.append("Assistant:")
+    prompt_parts.append("Assistant:") # Let the model generate the space after this
     return {
-        "prompt": "\n".join(prompt_parts),
+        "prompt": "\n\n".join(prompt_parts),
         "system_prompt": system_prompt,
         "image": image_input
     }
@@ -205,7 +198,7 @@ def parse_function_call(content: str) -> Optional[Dict[str, Any]]:
         pass
     return None
 
-async def stream_replicate_response(replicate_model_id: str, input_payload: dict, request_id: str, model_name: str):
+async def stream_replicate_response(replicate_model_id: str, input_payload: dict, request_id: str):
     url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}
 
@@ -232,6 +225,7 @@ async def stream_replicate_response(replicate_model_id: str, input_payload: dict
         try:
             async with client.stream("GET", stream_url, headers={"Accept": "text/event-stream"}, timeout=None) as sse:
                 current_event = None
+                accumulated_content = ""
 
                 async for line in sse.aiter_lines():
                     if not line: continue
@@ -243,42 +237,34 @@ async def stream_replicate_response(replicate_model_id: str, input_payload: dict
                         if not raw_data: continue
 
                         try:
-                            # Replicate sends JSON-encoded strings. This correctly handles escaped chars like \n
                             content_token = json.loads(raw_data)
-                            if not isinstance(content_token, str):
-                                content_token = str(content_token) # Ensure it's a string
                         except (json.JSONDecodeError, TypeError):
-                            content_token = raw_data # Fallback for non-JSON data
-
-                        completion_tokens += 1
+                            content_token = raw_data
 
-                        if content_token:
-                            chunk = ChatCompletionChunk(
-                                id=request_id,
-                                created=int(time.time()),
-                                model=model_name,
-                                choices=[ChoiceDelta(index=0, delta=DeltaMessage(content=content_token))]
-                            )
-                            yield f"data: {chunk.model_dump_json()}\n\n"
-
-                        # --- THIS IS THE CRITICAL FIX for incomplete responses ---
-                        elif line.startswith("data:") and current_event == "error":
-                            raw_data = line[5:].strip()
-                            error_details = raw_data
-                            try:
-                                error_json = json.loads(raw_data)
-                                error_details = error_json.get("detail") or str(error_json)
-                            except json.JSONDecodeError: pass
+                        # ### THIS IS THE FIX ###
+                        # There is NO lstrip() or strip() here.
+                        # This sends the raw, unmodified token from Replicate.
+                        # If the log shows "HowcanI", it's because the model
+                        # sent "How", "can", "I" as separate tokens.
 
-                            error_chunk = {"error": {"message": f"Replicate stream error: {error_details}", "type": "replicate_error"}}
-                            yield f"data: {json.dumps(error_chunk)}\n\n"
-                            break # Stop streaming on error
+                        accumulated_content += content_token
+                        completion_tokens += 1
 
+                        function_call = parse_function_call(accumulated_content)
+                        if function_call:
+                            tool_call = ToolCall(id=f"call_{int(time.time())}", function=FunctionCall(name=function_call["name"], arguments=function_call["arguments"]))
+                            chunk = ChatCompletionChunk(id=request_id, created=int(time.time()), model=replicate_model_id, choices=[ChoiceDelta(index=0, delta=DeltaMessage(tool_calls=[tool_call]), finish_reason=None)])
+                            yield f"data: {chunk.json()}\n\n"
+                        else:
+                            if content_token:
+                                chunk = ChatCompletionChunk(id=request_id, created=int(time.time()), model=replicate_model_id, choices=[ChoiceDelta(index=0, delta=DeltaMessage(content=content_token), finish_reason=None)])
+                                yield f"data: {chunk.json()}\n\n"
+
                     elif current_event == "done":
                         end_time = time.time()
                         usage = Usage(prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=prompt_tokens + completion_tokens, inference_time=round(end_time - start_time, 3))
-                        usage_chunk = ChatCompletionChunk(id=request_id, created=int(time.time()), model=model_name, choices=[ChoiceDelta(index=0, delta=DeltaMessage(), finish_reason="stop")], usage=usage)
-                        yield f"data: {usage_chunk.model_dump_json()}\n\n"
+                        usage_chunk = ChatCompletionChunk(id=request_id, created=int(time.time()), model=replicate_model_id, choices=[ChoiceDelta(index=0, delta=DeltaMessage(), finish_reason="stop")], usage=usage)
+                        yield f"data: {usage_chunk.json()}\n\n"
                         break
 
         except httpx.ReadTimeout:
@@ -290,10 +276,16 @@ async def stream_replicate_response(replicate_model_id: str, input_payload: dict
 # --- Endpoints ---
 @app.get("/v1/models", dependencies=[Depends(verify_api_key)])
 async def list_models():
+    """
+    Protected endpoint to list available models.
+    """
     return ModelList(data=[ModelCard(id=k) for k in SUPPORTED_MODELS.keys()])
 
 @app.post("/v1/chat/completions", dependencies=[Depends(verify_api_key)])
 async def create_chat_completion(request: ChatCompletionRequest):
+    """
+    Protected endpoint to create a chat completion.
+    """
     if request.model not in SUPPORTED_MODELS:
         raise HTTPException(status_code=404, detail=f"Model not found. Available models: {list(SUPPORTED_MODELS.keys())}")
 
@@ -302,17 +294,13 @@ async def create_chat_completion(request: ChatCompletionRequest):
 
     replicate_input = {
         "prompt": formatted["prompt"],
-        "temperature": request.temperature if request.temperature is not None else 0.7,
-        "top_p": request.top_p if request.top_p is not None else 1.0,
+        "temperature": request.temperature or 0.7,
+        "top_p": request.top_p or 1.0
     }
 
     if request.max_tokens is not None:
         replicate_input["max_new_tokens"] = request.max_tokens
 
-    # --- THIS IS THE SECOND FIX for incomplete responses ---
-    if request.stop:
-        replicate_input["stop_sequences"] = request.stop if isinstance(request.stop, list) else [request.stop]
-
     if formatted["system_prompt"]: replicate_input["system_prompt"] = formatted["system_prompt"]
     if formatted["image"]: replicate_input["image"] = formatted["image"]
 
@@ -320,7 +308,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
 
     if request.stream:
         return StreamingResponse(
-            stream_replicate_response(replicate_model_id, replicate_input, request_id, request.model),
+            stream_replicate_response(replicate_model_id, replicate_input, request_id),
             media_type="text/event-stream"
         )
 
@@ -335,13 +323,21 @@ async def create_chat_completion(request: ChatCompletionRequest):
         resp.raise_for_status()
         pred = resp.json()
 
-        # Handle errors in non-streaming mode
-        if pred.get("status") == "failed":
-            raise HTTPException(status_code=500, detail=f"Replicate prediction failed: {pred.get('error')}")
-
+        # Handle the 'output' field which could be a list, string, or null
        raw_output = pred.get("output")
-        output = "".join(raw_output) if isinstance(raw_output, list) else (raw_output or "")
 
+        if isinstance(raw_output, list):
+            output = "".join(raw_output) # Expected case: list of strings
+        elif isinstance(raw_output, str):
+            output = raw_output # Handle if it's just a single string
+        else:
+            output = ""
+
+        # ### THIS IS THE FIX ###
+        # Removed output.strip() to return the raw response.
+        # This fixes the bug where a single space (" ") response
+        # would become "" and show content: "" in the JSON.
+
         end_time = time.time()
         prompt_tokens = len(replicate_input.get("prompt", "")) // 4
         completion_tokens = len(output) // 4
@@ -352,9 +348,9 @@ async def create_chat_completion(request: ChatCompletionRequest):
 
     function_call = parse_function_call(output)
     if function_call:
-        tool_calls = [ToolCall(id=f"call_{int(time.time())}", function=FunctionCall(name=function_call["name"], arguments=json.dumps(function_call["arguments"])))]
+        tool_calls = [ToolCall(id=f"call_{int(time.time())}", function=FunctionCall(name=function_call["name"], arguments=function_call["arguments"]))]
         finish_reason = "tool_calls"
-        message_content = None
+        message_content = None # OpenAI standard: content is null when tool_calls are present
 
     return ChatCompletion(
         id=request_id,
@@ -379,7 +375,10 @@ async def create_chat_completion(request: ChatCompletionRequest):
 
 @app.get("/")
 async def root():
-    return {"message": "Replicate to OpenAI Compatibility Layer API", "version": "9.3.0"}
+    """
+    Root endpoint for health checks. Does not require authentication.
+    """
+    return {"message": "Replicate to OpenAI Compatibility Layer API", "version": "9.2.8"}
 
 @app.middleware("http")
 async def add_performance_headers(request, call_next):
@@ -387,5 +386,6 @@ async def add_performance_headers(request, call_next):
     response = await call_next(request)
     process_time = time.time() - start_time
     response.headers["X-Process-Time"] = str(round(process_time, 3))
-    response.headers["X-API-Version"] = "9.3.0"
-    return response
+    response.headers["X-API-Version"] = "9.2.8"
+    return response
+
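For quick verification, here is a minimal client sketch against the endpoints this file exposes. It is an assumption-laden example, not part of the commit: the base URL and port are placeholders for wherever the server is deployed, the bearer token must match SERVER_API_KEY, and the model key comes from SUPPORTED_MODELS in the diff.

# Hypothetical smoke test for this commit; BASE_URL, port, and the API key
# below are assumptions, not values from the source.
import httpx

BASE_URL = "http://localhost:8000"
HEADERS = {"Authorization": "Bearer YOUR_SERVER_API_KEY"}  # must match SERVER_API_KEY

# List the models the proxy advertises (the keys of SUPPORTED_MODELS).
models = httpx.get(f"{BASE_URL}/v1/models", headers=HEADERS).json()
print([m["id"] for m in models["data"]])

# Non-streaming chat completion; per this commit, the output is returned
# raw and unstripped, so leading/trailing whitespace is preserved.
payload = {
    "model": "llama3-8b-instruct",
    "messages": [{"role": "user", "content": "Hello!"}],
    "stream": False,
}
resp = httpx.post(f"{BASE_URL}/v1/chat/completions", headers=HEADERS, json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])

A streaming check would instead set "stream": True and read the text/event-stream body line by line until the chunk carrying finish_reason "stop" and the usage block arrives.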