Update main.py
main.py (changed)
@@ -22,7 +22,7 @@ if not SERVER_API_KEY:
     raise ValueError("SERVER_API_KEY environment variable not set. This is required to protect your server.")
 
 # FastAPI Init
-app = FastAPI(title="Replicate to OpenAI Compatibility Layer", version="9.
+app = FastAPI(title="Replicate to OpenAI Compatibility Layer", version="9.3.0 (Streaming Fix)")
 
 # --- Authentication ---
 security = HTTPBearer()
@@ -52,7 +52,7 @@ class ModelList(BaseModel):
 
 class ChatMessage(BaseModel):
     role: Literal["system", "user", "assistant", "tool"]
-    content: Union[str, List[Dict[str, Any]]]
+    content: Union[str, List[Dict[str, Any]], None]  # Allow content to be None for tool calls
     name: Optional[str] = None
     tool_calls: Optional[List[Any]] = None
 
@@ -127,33 +127,37 @@ class ChatCompletionChunk(BaseModel):
 # --- Supported Models ---
 SUPPORTED_MODELS = {
     "llama3-8b-instruct": "meta/meta-llama-3-8b-instruct",
-    "claude-4.5-haiku": "anthropic/claude-
-    "claude-4.5-sonnet": "anthropic/claude-
+    "claude-4.5-haiku": "anthropic/claude-3-haiku-20240307",
+    "claude-4.5-sonnet": "anthropic/claude-3.5-sonnet-20240620",  # Updated to correct model ID
     "llava-13b": "yorickvp/llava-13b:e272157381e2a3bf12df3a8edd1f38d1dbd736bbb7437277c8b34175f8fce358"
 }
 
 # --- Core Logic ---
 
 def generate_request_id() -> str:
-    """Generates a unique request ID
-    return f"
+    """Generates a unique request ID."""
+    return f"chatcmpl-{secrets.token_hex(16)}"
 
 def format_messages_for_replicate(messages: List[ChatMessage], functions: Optional[List[FunctionDefinition]] = None) -> Dict[str, Any]:
     prompt_parts = []
     system_prompt = None
     image_input = None
 
+    # Handle functions/tools if provided
+    tools_prompt_section = ""
     if functions:
-
+        tools_prompt_section += "You have access to the following tools. Use them if required to answer the user's question.\n\n"
         for func in functions:
-
-            if func.description:
-            if func.parameters:
-
+            tools_prompt_section += f"- Function: {func.name}\n"
+            if func.description: tools_prompt_section += f" Description: {func.description}\n"
+            if func.parameters: tools_prompt_section += f" Parameters: {json.dumps(func.parameters)}\n"
+        tools_prompt_section += "\nTo call a function, respond with a JSON object like this: {\"name\": \"function_name\", \"arguments\": {\"arg1\": \"value1\"}}\n"
 
     for msg in messages:
         if msg.role == "system":
             system_prompt = str(msg.content)
+            if tools_prompt_section:
+                system_prompt += "\n\n" + tools_prompt_section
         elif msg.role == "assistant":
             if msg.tool_calls:
                 tool_calls_text = "\nTool calls:\n"
@@ -177,9 +181,12 @@ def format_messages_for_replicate(messages: List[ChatMessage], functions: Option
             user_text_content = str(msg.content)
             prompt_parts.append(f"User: {user_text_content}")
 
-
+    if not system_prompt and tools_prompt_section:
+        system_prompt = tools_prompt_section
+
+    prompt_parts.append("Assistant:")
     return {
-        "prompt": "\n
+        "prompt": "\n".join(prompt_parts),
         "system_prompt": system_prompt,
         "image": image_input
     }
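
For orientation, a minimal sketch of what the reworked formatter returns for a plain two-turn exchange (assuming main.py's ChatMessage and format_messages_for_replicate are importable; the messages are illustrative):

from main import ChatMessage, format_messages_for_replicate

messages = [
    ChatMessage(role="system", content="You are a helpful assistant."),
    ChatMessage(role="user", content="What is the capital of France?"),
]
formatted = format_messages_for_replicate(messages)
# With the new trailing cue, formatted["prompt"] is roughly:
#   "User: What is the capital of France?\nAssistant:"
# formatted["system_prompt"] == "You are a helpful assistant."
# formatted["image"] is None (no image content was supplied)
print(formatted["prompt"])
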
@@ -198,7 +205,7 @@ def parse_function_call(content: str) -> Optional[Dict[str, Any]]:
         pass
     return None
 
-async def stream_replicate_response(replicate_model_id: str, input_payload: dict, request_id: str):
+async def stream_replicate_response(replicate_model_id: str, input_payload: dict, request_id: str, model_name: str):
     url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}
 
@@ -225,7 +232,6 @@ async def stream_replicate_response(replicate_model_id: str, input_payload: dict
         try:
             async with client.stream("GET", stream_url, headers={"Accept": "text/event-stream"}, timeout=None) as sse:
                 current_event = None
-                accumulated_content = ""
 
                 async for line in sse.aiter_lines():
                     if not line: continue
@@ -237,34 +243,42 @@ async def stream_replicate_response(replicate_model_id: str, input_payload: dict
                         if not raw_data: continue
 
                         try:
+                            # Replicate sends JSON-encoded strings. This correctly handles escaped chars like \n
                             content_token = json.loads(raw_data)
+                            if not isinstance(content_token, str):
+                                content_token = str(content_token)  # Ensure it's a string
                         except (json.JSONDecodeError, TypeError):
-                            content_token = raw_data
-
-                            # ### THIS IS THE FIX ###
-                            # There is NO lstrip() or strip() here.
-                            # This sends the raw, unmodified token from Replicate.
-                            # If the log shows "HowcanI", it's because the model
-                            # sent "How", "can", "I" as separate tokens.
-
-                        accumulated_content += content_token
+                            content_token = raw_data  # Fallback for non-JSON data
+
                         completion_tokens += 1
 
-
-
-
-
-
-
-
-
-                        yield f"data: {chunk.json()}\n\n"
+                        if content_token:
+                            chunk = ChatCompletionChunk(
+                                id=request_id,
+                                created=int(time.time()),
+                                model=model_name,
+                                choices=[ChoiceDelta(index=0, delta=DeltaMessage(content=content_token))]
+                            )
+                            yield f"data: {chunk.model_dump_json()}\n\n"
 
+                    # --- THIS IS THE CRITICAL FIX for incomplete responses ---
+                    elif line.startswith("data:") and current_event == "error":
+                        raw_data = line[5:].strip()
+                        error_details = raw_data
+                        try:
+                            error_json = json.loads(raw_data)
+                            error_details = error_json.get("detail") or str(error_json)
+                        except json.JSONDecodeError: pass
+
+                        error_chunk = {"error": {"message": f"Replicate stream error: {error_details}", "type": "replicate_error"}}
+                        yield f"data: {json.dumps(error_chunk)}\n\n"
+                        break  # Stop streaming on error
+
                     elif current_event == "done":
                         end_time = time.time()
                         usage = Usage(prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=prompt_tokens + completion_tokens, inference_time=round(end_time - start_time, 3))
-                        usage_chunk = ChatCompletionChunk(id=request_id, created=int(time.time()), model=
-                        yield f"data: {usage_chunk.
+                        usage_chunk = ChatCompletionChunk(id=request_id, created=int(time.time()), model=model_name, choices=[ChoiceDelta(index=0, delta=DeltaMessage(), finish_reason="stop")], usage=usage)
+                        yield f"data: {usage_chunk.model_dump_json()}\n\n"
                         break
 
             except httpx.ReadTimeout:
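
To sanity-check the streaming path end to end, a small client-side sketch (illustrative: it assumes the server runs locally on port 8000 and that SERVER_API_KEY is exported in the environment; the payload follows the OpenAI-style schema this endpoint accepts):

import json
import os

import httpx

payload = {
    "model": "llama3-8b-instruct",
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "stream": True,
}
headers = {"Authorization": f"Bearer {os.environ['SERVER_API_KEY']}"}

with httpx.Client(timeout=None) as client:
    with client.stream("POST", "http://localhost:8000/v1/chat/completions", json=payload, headers=headers) as resp:
        for line in resp.iter_lines():
            if not line.startswith("data: "):
                continue  # skip keep-alives and blank lines
            chunk = json.loads(line[len("data: "):])
            if "error" in chunk:
                raise RuntimeError(chunk["error"]["message"])
            delta = chunk.get("choices", [{}])[0].get("delta", {})
            if delta.get("content"):
                print(delta["content"], end="", flush=True)
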
@@ -276,16 +290,10 @@ async def stream_replicate_response(replicate_model_id: str, input_payload: dict
 # --- Endpoints ---
 @app.get("/v1/models", dependencies=[Depends(verify_api_key)])
 async def list_models():
-    """
-    Protected endpoint to list available models.
-    """
     return ModelList(data=[ModelCard(id=k) for k in SUPPORTED_MODELS.keys()])
 
 @app.post("/v1/chat/completions", dependencies=[Depends(verify_api_key)])
 async def create_chat_completion(request: ChatCompletionRequest):
-    """
-    Protected endpoint to create a chat completion.
-    """
     if request.model not in SUPPORTED_MODELS:
         raise HTTPException(status_code=404, detail=f"Model not found. Available models: {list(SUPPORTED_MODELS.keys())}")
 
@@ -294,13 +302,17 @@ async def create_chat_completion(request: ChatCompletionRequest):
 
     replicate_input = {
         "prompt": formatted["prompt"],
-        "temperature": request.temperature
-        "top_p": request.top_p
+        "temperature": request.temperature if request.temperature is not None else 0.7,
+        "top_p": request.top_p if request.top_p is not None else 1.0,
     }
 
     if request.max_tokens is not None:
         replicate_input["max_new_tokens"] = request.max_tokens
 
+    # --- THIS IS THE SECOND FIX for incomplete responses ---
+    if request.stop:
+        replicate_input["stop_sequences"] = request.stop if isinstance(request.stop, list) else [request.stop]
+
     if formatted["system_prompt"]: replicate_input["system_prompt"] = formatted["system_prompt"]
     if formatted["image"]: replicate_input["image"] = formatted["image"]
 
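
As a quick illustration of the new stop handling in isolation (values are made up), both the OpenAI-style string form and list form end up as a Replicate stop_sequences list:

# Illustrative only: mirrors the normalization added above.
for stop in ("\nUser:", ["\nUser:", "</s>"], None):
    replicate_input = {}
    if stop:
        replicate_input["stop_sequences"] = stop if isinstance(stop, list) else [stop]
    print(replicate_input)
# -> {'stop_sequences': ['\nUser:']}
# -> {'stop_sequences': ['\nUser:', '</s>']}
# -> {}
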
@@ -308,7 +320,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
 
     if request.stream:
         return StreamingResponse(
-            stream_replicate_response(replicate_model_id, replicate_input, request_id),
+            stream_replicate_response(replicate_model_id, replicate_input, request_id, request.model),
             media_type="text/event-stream"
         )
 
@@ -323,21 +335,13 @@ async def create_chat_completion(request: ChatCompletionRequest):
     resp.raise_for_status()
     pred = resp.json()
 
-    # Handle
+    # Handle errors in non-streaming mode
+    if pred.get("status") == "failed":
+        raise HTTPException(status_code=500, detail=f"Replicate prediction failed: {pred.get('error')}")
+
     raw_output = pred.get("output")
+    output = "".join(raw_output) if isinstance(raw_output, list) else (raw_output or "")
 
-    if isinstance(raw_output, list):
-        output = "".join(raw_output)  # Expected case: list of strings
-    elif isinstance(raw_output, str):
-        output = raw_output  # Handle if it's just a single string
-    else:
-        output = ""
-
-    # ### THIS IS THE FIX ###
-    # Removed output.strip() to return the raw response.
-    # This fixes the bug where a single space (" ") response
-    # would become "" and show content: "" in the JSON.
-
     end_time = time.time()
     prompt_tokens = len(replicate_input.get("prompt", "")) // 4
     completion_tokens = len(output) // 4
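
The replacement one-liner above collapses the old list/str/None branches; a tiny standalone check of its behaviour (inputs are made up):

# Illustrative only: Replicate's "output" may be a list of strings, a single string, or missing.
for raw_output in (["Hel", "lo", "!"], "Hello!", None):
    output = "".join(raw_output) if isinstance(raw_output, list) else (raw_output or "")
    print(repr(output))
# -> 'Hello!'
# -> 'Hello!'
# -> ''
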
@@ -348,9 +352,9 @@ async def create_chat_completion(request: ChatCompletionRequest):
 
     function_call = parse_function_call(output)
     if function_call:
-        tool_calls = [ToolCall(id=f"call_{int(time.time())}", function=FunctionCall(name=function_call["name"], arguments=function_call["arguments"]))]
+        tool_calls = [ToolCall(id=f"call_{int(time.time())}", function=FunctionCall(name=function_call["name"], arguments=json.dumps(function_call["arguments"])))]
         finish_reason = "tool_calls"
-        message_content = None
+        message_content = None
 
     return ChatCompletion(
         id=request_id,
@@ -375,10 +379,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
 
 @app.get("/")
 async def root():
-    """
-    Root endpoint for health checks. Does not require authentication.
-    """
-    return {"message": "Replicate to OpenAI Compatibility Layer API", "version": "9.2.8"}
+    return {"message": "Replicate to OpenAI Compatibility Layer API", "version": "9.3.0"}
 
 @app.middleware("http")
 async def add_performance_headers(request, call_next):
@@ -386,5 +387,5 @@ async def add_performance_headers(request, call_next):
     response = await call_next(request)
     process_time = time.time() - start_time
     response.headers["X-Process-Time"] = str(round(process_time, 3))
-    response.headers["X-API-Version"] = "9.
+    response.headers["X-API-Version"] = "9.3.0"
     return response
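
A minimal non-streaming smoke test against a deployed instance might look like this (base URL and environment variable are assumptions; the response shape follows the OpenAI-style ChatCompletion model defined in this file):

import os

import httpx

resp = httpx.post(
    "http://localhost:8000/v1/chat/completions",
    headers={"Authorization": f"Bearer {os.environ['SERVER_API_KEY']}"},
    json={
        "model": "llama3-8b-instruct",
        "messages": [{"role": "user", "content": "Give me one fun fact."}],
        "max_tokens": 128,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
print(resp.headers.get("X-API-Version"), resp.headers.get("X-Process-Time"))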