Spaces:

rkihacker
/

R2OAI

Paused

App Files Files Community

rkihacker committed 28 days ago

Commit

0f99721

verified ·

1 Parent(s): 580cccc

Update main.py

Browse files

Files changed (1) hide show

main.py +43 -109

main.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import os
 import httpx
 import json
@@ -17,7 +16,7 @@ if not REPLICATE_API_TOKEN:
     raise ValueError("REPLICATE_API_TOKEN environment variable not set.")
 # FastAPI Init
-app = FastAPI(title="Replicate to OpenAI Compatibility Layer", version="9.2.1 (Spacing Fixed)")
 # --- Pydantic Models ---
 class ModelCard(BaseModel):
@@ -25,7 +24,7 @@ class ModelCard(BaseModel):
 class ModelList(BaseModel):
     object: str = "list"; data: List[ModelCard] = []
 class ChatMessage(BaseModel):
-    role: Literal["system", "user", "assistant", "tool"]; content: Union[str, List[Dict[str, Any]]]; name: Optional[str] = None
 class FunctionDefinition(BaseModel):
     name: str; description: Optional[str] = None; parameters: Optional[Dict[str, Any]] = None
 class ToolDefinition(BaseModel):
@@ -57,33 +56,30 @@ class ChatCompletionChunk(BaseModel):
 # --- Supported Models ---
 SUPPORTED_MODELS = {
     "llama3-8b-instruct": "meta/meta-llama-3-8b-instruct",
-    "claude-4.5-haiku": "anthropic/claude-4.5-haiku",
-    "claude-4.5-sonnet": "anthropic/claude-4.5-sonnet",
     "llava-13b": "yorickvp/llava-13b:e272157381e2a3bf12df3a8edd1f38d1dbd736bbb7437277c8b34175f8fce358"
 }
 # --- Core Logic ---
 def format_messages_for_replicate(messages: List[ChatMessage], functions: Optional[List[FunctionDefinition]] = None) -> Dict[str, Any]:
-    """Convert OpenAI messages to Replicate-compatible format with function calling support."""
     prompt_parts = []
     system_prompt = None
     image_input = None
-    # Add functions to system prompt if provided
     if functions:
-        functions_text = "\n\nAvailable functions:\n"
         for func in functions:
-            functions_text += f"- {func.name}: {func.description or 'No description'}\n"
-            if func.parameters:
-                functions_text += f"  Parameters: {json.dumps(func.parameters)}\n"
         prompt_parts.append(functions_text)
     for msg in messages:
         if msg.role == "system":
             system_prompt = str(msg.content)
         elif msg.role == "assistant":
-            # Handle tool calls in assistant messages
-            if hasattr(msg, 'tool_calls') and msg.tool_calls:
                 tool_calls_text = "\nTool calls:\n"
                 for tool_call in msg.tool_calls:
                     tool_calls_text += f"- {tool_call.function.name}({tool_call.function.arguments})\n"
@@ -91,7 +87,6 @@ def format_messages_for_replicate(messages: List[ChatMessage], functions: Option
             else:
                 prompt_parts.append(f"Assistant: {msg.content}")
         elif msg.role == "tool":
-            # Handle tool responses
             prompt_parts.append(f"Tool Response: {msg.content}")
         elif msg.role == "user":
             user_text_content = ""
@@ -106,8 +101,7 @@ def format_messages_for_replicate(messages: List[ChatMessage], functions: Option
                 user_text_content = str(msg.content)
             prompt_parts.append(f"User: {user_text_content}")
-    # Fix: Don't add trailing space, let model decide spacing
-    prompt_parts.append("Assistant:")
     return {
         "prompt": "\n\n".join(prompt_parts),
         "system_prompt": system_prompt,
@@ -115,11 +109,8 @@ def format_messages_for_replicate(messages: List[ChatMessage], functions: Option
     }
 def parse_function_call(content: str) -> Optional[Dict[str, Any]]:
-    """Parse function call from model response."""
     try:
-        # Look for JSON-like function call patterns
         if "function_call" in content or ("name" in content and "arguments" in content):
-            # Extract JSON part
             start = content.find("{")
             end = content.rfind("}") + 1
             if start != -1 and end > start:
@@ -132,7 +123,6 @@ def parse_function_call(content: str) -> Optional[Dict[str, Any]]:
     return None
 async def stream_replicate_response(replicate_model_id: str, input_payload: dict, request_id: str):
-    """Stream response with full OpenAI compatibility including tool calls."""
     url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}
@@ -151,9 +141,7 @@ async def stream_replicate_response(replicate_model_id: str, input_payload: dict
                 return
         except httpx.HTTPStatusError as e:
             error_details = e.response.text
-            try:
-                error_json = e.response.json()
-                error_details = error_json.get("detail", error_details)
             except json.JSONDecodeError: pass
             yield f"data: {json.dumps({'error': {'message': f'Upstream Error: {error_details}', 'type': 'replicate_error'}})}\n\n"
             return
@@ -172,80 +160,39 @@ async def stream_replicate_response(replicate_model_id: str, input_payload: dict
                     elif line.startswith("data:") and current_event == "output":
                         raw_data = line[5:].strip()
                         if not raw_data: continue
-                        content_token = ""
                         try:
                             content_token = json.loads(raw_data)
                         except (json.JSONDecodeError, TypeError):
                             content_token = raw_data
-                        # Fix: Handle spacing properly - don't prepend space to first token
                         if first_token:
                             content_token = content_token.lstrip()
-                            first_token = False
                         accumulated_content += content_token
                         completion_tokens += 1
-                        # Check for function calls in accumulated content
                         function_call = parse_function_call(accumulated_content)
                         if function_call:
-                            # Send tool call chunk
-                            tool_call = ToolCall(
-                                id=f"call_{int(time.time())}",
-                                function=FunctionCall(
-                                    name=function_call["name"],
-                                    arguments=function_call["arguments"]
-                                )
-                            )
-                            chunk = ChatCompletionChunk(
-                                id=request_id,
-                                created=int(time.time()),
-                                model=replicate_model_id,
-                                choices=[ChoiceDelta(
-                                    index=0,
-                                    delta=DeltaMessage(tool_calls=[tool_call]),
-                                    finish_reason=None
-                                )]
-                            )
                             yield f"data: {chunk.json()}\n\n"
                         else:
-                            # Send regular content chunk
-                            chunk = ChatCompletionChunk(
-                                id=request_id,
-                                created=int(time.time()),
-                                model=replicate_model_id,
-                                choices=[ChoiceDelta(
-                                    index=0,
-                                    delta=DeltaMessage(content=content_token),
-                                    finish_reason=None
-                                )]
-                            )
-                            yield f"data: {chunk.json()}\n\n"
                     elif current_event == "done":
-                        # Send final usage chunk
                         end_time = time.time()
-                        inference_time = end_time - start_time
-                        usage = Usage(
-                            prompt_tokens=prompt_tokens,
-                            completion_tokens=completion_tokens,
-                            total_tokens=prompt_tokens + completion_tokens,
-                            inference_time=round(inference_time, 3)
-                        )
-                        usage_chunk = ChatCompletionChunk(
-                            id=request_id,
-                            created=int(time.time()),
-                            model=replicate_model_id,
-                            choices=[ChoiceDelta(
-                                index=0,
-                                delta=DeltaMessage(),
-                                finish_reason="stop"
-                            )],
-                            usage=usage
-                        )
                         yield f"data: {usage_chunk.json()}\n\n"
                         break
@@ -265,7 +212,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
     if request.model not in SUPPORTED_MODELS:
         raise HTTPException(status_code=404, detail=f"Model not found. Available models: {list(SUPPORTED_MODELS.keys())}")
-    # Format messages for Replicate
     formatted = format_messages_for_replicate(request.messages, request.functions)
     replicate_input = {
         "prompt": formatted["prompt"],
@@ -273,21 +220,19 @@ async def create_chat_completion(request: ChatCompletionRequest):
         "temperature": request.temperature or 0.7,
         "top_p": request.top_p or 1.0
     }
-    if formatted["system_prompt"]:
-        replicate_input["system_prompt"] = formatted["system_prompt"]
-    if formatted["image"]:
-        replicate_input["image"] = formatted["image"]
     request_id = f"chatcmpl-{int(time.time())}"
     if request.stream:
         return StreamingResponse(
-            stream_replicate_response(SUPPORTED_MODELS[request.model], replicate_input, request_id),
             media_type="text/event-stream"
         )
     # Non-streaming response
-    url = f"https://api.replicate.com/v1/models/{SUPPORTED_MODELS[request.model]}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}
     start_time = time.time()
@@ -298,27 +243,21 @@ async def create_chat_completion(request: ChatCompletionRequest):
             pred = resp.json()
             output = "".join(pred.get("output", []))
-            # Fix: Clean up leading/trailing whitespace
-            output = output.strip()
-            # Calculate timing and tokens
             end_time = time.time()
-            inference_time = end_time - start_time
             prompt_tokens = len(replicate_input.get("prompt", "")) // 4
             completion_tokens = len(output) // 4
-            # Parse function call if present
             tool_calls = None
             function_call = parse_function_call(output)
             if function_call:
-                tool_call = ToolCall(
-                    id=f"call_{int(time.time())}",
-                    function=FunctionCall(
-                        name=function_call["name"],
-                        arguments=function_call["arguments"]
-                    )
-                )
-                tool_calls = [tool_call]
             return ChatCompletion(
                 id=request_id,
@@ -326,18 +265,14 @@ async def create_chat_completion(request: ChatCompletionRequest):
                 model=request.model,
                 choices=[Choice(
                     index=0,
-                    message=ChatMessage(
-                        role="assistant",
-                        content=output if not function_call else None,
-                        tool_calls=tool_calls
-                    ),
-                    finish_reason="tool_calls" if function_call else "stop"
                 )],
                 usage=Usage(
                     prompt_tokens=prompt_tokens,
                     completion_tokens=completion_tokens,
                     total_tokens=prompt_tokens + completion_tokens,
-                    inference_time=round(inference_time, 3)
                 )
             )
         except httpx.HTTPStatusError as e:
@@ -347,14 +282,13 @@ async def create_chat_completion(request: ChatCompletionRequest):
 @app.get("/")
 async def root():
-    return {"message": "Replicate to OpenAI Compatibility Layer API", "version": "9.2.1"}
-# Performance optimization middleware
 @app.middleware("http")
 async def add_performance_headers(request, call_next):
     start_time = time.time()
     response = await call_next(request)
     process_time = time.time() - start_time
     response.headers["X-Process-Time"] = str(round(process_time, 3))
-    response.headers["X-API-Version"] = "9.2.1"
     return response

 import os
 import httpx
 import json
     raise ValueError("REPLICATE_API_TOKEN environment variable not set.")
 # FastAPI Init
+app = FastAPI(title="Replicate to OpenAI Compatibility Layer", version="9.2.2 (Spacing Fixed)")
 # --- Pydantic Models ---
 class ModelCard(BaseModel):
 class ModelList(BaseModel):
     object: str = "list"; data: List[ModelCard] = []
 class ChatMessage(BaseModel):
+    role: Literal["system", "user", "assistant", "tool"]; content: Union[str, List[Dict[str, Any]]]; name: Optional[str] = None; tool_calls: Optional[List[Any]] = None
 class FunctionDefinition(BaseModel):
     name: str; description: Optional[str] = None; parameters: Optional[Dict[str, Any]] = None
 class ToolDefinition(BaseModel):
 # --- Supported Models ---
 SUPPORTED_MODELS = {
     "llama3-8b-instruct": "meta/meta-llama-3-8b-instruct",
+    "claude-3-haiku-20240307": "anthropic/claude-3-haiku-20240307", # Example of another common model
+    "claude-3-sonnet-20240229": "anthropic/claude-3-sonnet-20240229",
     "llava-13b": "yorickvp/llava-13b:e272157381e2a3bf12df3a8edd1f38d1dbd736bbb7437277c8b34175f8fce358"
 }
 # --- Core Logic ---
 def format_messages_for_replicate(messages: List[ChatMessage], functions: Optional[List[FunctionDefinition]] = None) -> Dict[str, Any]:
     prompt_parts = []
     system_prompt = None
     image_input = None
     if functions:
+        functions_text = "You have access to the following tools. Use them if required to answer the user's question.\n\n"
         for func in functions:
+            functions_text += f"- Function: {func.name}\n"
+            if func.description: functions_text += f"  Description: {func.description}\n"
+            if func.parameters: functions_text += f"  Parameters: {json.dumps(func.parameters)}\n"
         prompt_parts.append(functions_text)
     for msg in messages:
         if msg.role == "system":
             system_prompt = str(msg.content)
         elif msg.role == "assistant":
+            if msg.tool_calls:
                 tool_calls_text = "\nTool calls:\n"
                 for tool_call in msg.tool_calls:
                     tool_calls_text += f"- {tool_call.function.name}({tool_call.function.arguments})\n"
             else:
                 prompt_parts.append(f"Assistant: {msg.content}")
         elif msg.role == "tool":
             prompt_parts.append(f"Tool Response: {msg.content}")
         elif msg.role == "user":
             user_text_content = ""
                 user_text_content = str(msg.content)
             prompt_parts.append(f"User: {user_text_content}")
+    prompt_parts.append("Assistant:") # Let the model generate the space after this
     return {
         "prompt": "\n\n".join(prompt_parts),
         "system_prompt": system_prompt,
     }
 def parse_function_call(content: str) -> Optional[Dict[str, Any]]:
     try:
         if "function_call" in content or ("name" in content and "arguments" in content):
             start = content.find("{")
             end = content.rfind("}") + 1
             if start != -1 and end > start:
     return None
 async def stream_replicate_response(replicate_model_id: str, input_payload: dict, request_id: str):
     url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}
                 return
         except httpx.HTTPStatusError as e:
             error_details = e.response.text
+            try: error_details = e.response.json().get("detail", error_details)
             except json.JSONDecodeError: pass
             yield f"data: {json.dumps({'error': {'message': f'Upstream Error: {error_details}', 'type': 'replicate_error'}})}\n\n"
             return
                     elif line.startswith("data:") and current_event == "output":
                         raw_data = line[5:].strip()
                         if not raw_data: continue
                         try:
                             content_token = json.loads(raw_data)
                         except (json.JSONDecodeError, TypeError):
                             content_token = raw_data
+                        # ### MAJOR FIX HERE ###
+                        # This logic robustly handles the leading space by only stripping
+                        # the very first non-empty token of the entire stream.
                         if first_token:
                             content_token = content_token.lstrip()
+                            # Only flip the flag if we've actually processed a token with content.
+                            if content_token:
+                                first_token = False
                         accumulated_content += content_token
                         completion_tokens += 1
                         function_call = parse_function_call(accumulated_content)
                         if function_call:
+                            tool_call = ToolCall(id=f"call_{int(time.time())}", function=FunctionCall(name=function_call["name"], arguments=function_call["arguments"]))
+                            chunk = ChatCompletionChunk(id=request_id, created=int(time.time()), model=replicate_model_id, choices=[ChoiceDelta(index=0, delta=DeltaMessage(tool_calls=[tool_call]), finish_reason=None)])
                             yield f"data: {chunk.json()}\n\n"
                         else:
+                            # Only yield a chunk if there is content to send.
+                            if content_token:
+                                chunk = ChatCompletionChunk(id=request_id, created=int(time.time()), model=replicate_model_id, choices=[ChoiceDelta(index=0, delta=DeltaMessage(content=content_token), finish_reason=None)])
+                                yield f"data: {chunk.json()}\n\n"
                     elif current_event == "done":
                         end_time = time.time()
+                        usage = Usage(prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=prompt_tokens + completion_tokens, inference_time=round(end_time - start_time, 3))
+                        usage_chunk = ChatCompletionChunk(id=request_id, created=int(time.time()), model=replicate_model_id, choices=[ChoiceDelta(index=0, delta=DeltaMessage(), finish_reason="stop")], usage=usage)
                         yield f"data: {usage_chunk.json()}\n\n"
                         break
     if request.model not in SUPPORTED_MODELS:
         raise HTTPException(status_code=404, detail=f"Model not found. Available models: {list(SUPPORTED_MODELS.keys())}")
+    replicate_model_id = SUPPORTED_MODELS[request.model]
     formatted = format_messages_for_replicate(request.messages, request.functions)
     replicate_input = {
         "prompt": formatted["prompt"],
         "temperature": request.temperature or 0.7,
         "top_p": request.top_p or 1.0
     }
+    if formatted["system_prompt"]: replicate_input["system_prompt"] = formatted["system_prompt"]
+    if formatted["image"]: replicate_input["image"] = formatted["image"]
     request_id = f"chatcmpl-{int(time.time())}"
     if request.stream:
         return StreamingResponse(
+            stream_replicate_response(replicate_model_id, replicate_input, request_id),
             media_type="text/event-stream"
         )
     # Non-streaming response
+    url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}
     start_time = time.time()
             pred = resp.json()
             output = "".join(pred.get("output", []))
+            output = output.strip() # Clean up any leading/trailing whitespace
             end_time = time.time()
             prompt_tokens = len(replicate_input.get("prompt", "")) // 4
             completion_tokens = len(output) // 4
             tool_calls = None
+            finish_reason = "stop"
+            message_content = output
             function_call = parse_function_call(output)
             if function_call:
+                tool_calls = [ToolCall(id=f"call_{int(time.time())}", function=FunctionCall(name=function_call["name"], arguments=function_call["arguments"]))]
+                finish_reason = "tool_calls"
+                message_content = None # OpenAI standard: content is null when tool_calls are present
             return ChatCompletion(
                 id=request_id,
                 model=request.model,
                 choices=[Choice(
                     index=0,
+                    message=ChatMessage(role="assistant", content=message_content, tool_calls=tool_calls),
+                    finish_reason=finish_reason
                 )],
                 usage=Usage(
                     prompt_tokens=prompt_tokens,
                     completion_tokens=completion_tokens,
                     total_tokens=prompt_tokens + completion_tokens,
+                    inference_time=round(end_time - start_time, 3)
                 )
             )
         except httpx.HTTPStatusError as e:
 @app.get("/")
 async def root():
+    return {"message": "Replicate to OpenAI Compatibility Layer API", "version": "9.2.2"}
 @app.middleware("http")
 async def add_performance_headers(request, call_next):
     start_time = time.time()
     response = await call_next(request)
     process_time = time.time() - start_time
     response.headers["X-Process-Time"] = str(round(process_time, 3))
+    response.headers["X-API-Version"] = "9.2.2"
     return response