Spaces:

RWKV-Red-Team
/

RWKV-LatestSpace

Running on T4

App Files Files Community

sparkleman committed on Mar 12

Commit

94c4923

1 Parent(s): 664ff1c

UPDATE: Remove <think> tag in content & handle EOS token

Browse files

Files changed (2) hide show

app.py +24 -9
utils.py +27 -2

app.py CHANGED Viewed

@@ -44,6 +44,8 @@ from fastapi import FastAPI, HTTPException
 from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
 from api_types import (
     ChatMessage,
@@ -54,7 +56,7 @@ from api_types import (
     ChatCompletionChoice,
     ChatCompletionMessage,
 )
-from utils import cleanMessages, parse_think_response
 class ModelStorage:
@@ -159,6 +161,7 @@ app.add_middleware(
     allow_methods=["*"],
     allow_headers=["*"],
 )
 async def runPrefill(
@@ -185,7 +188,6 @@ def generate(
     out,
     model_tokens: List[int],
     model_state,
-    stops=["\n\n"],
     max_tokens=2048,
 ):
     args = PIPELINE_ARGS(
@@ -212,18 +214,29 @@ def generate(
             out, temperature=args.temperature, top_p=args.top_p
         )
         out, model_state = MODEL_STORAGE[request.model].model.forward(
             [token], model_state
         )
-        model_tokens.append(token)
-        out_tokens.append(token)
         if token in request.stop_tokens:
             yield {
                 "content": "",
                 "tokens": out_tokens[out_last:],
-                "finish_reason": "stop",
                 "state": model_state,
             }
@@ -231,6 +244,8 @@ def generate(
             gc.collect()
             return
         for xxx in occurrence:
             occurrence[xxx] *= request.penalty_decay
         occurrence[token] = 1 + (occurrence[token] if token in occurrence else 0)
@@ -243,13 +258,13 @@ def generate(
         output_cache.append(tmp)
         output_cache_str = "".join(output_cache)
-        for stop_words in stops:
             if stop_words in output_cache_str:
                 yield {
                     "content": tmp.replace(stop_words, ""),
                     "tokens": out_tokens[out_last:],
-                    "finish_reason": "stop",
                     "state": model_state,
                 }
@@ -365,7 +380,7 @@ async def chatResponseStream(
     createTimestamp = int(time.time())
     prompt = (
-        f"{cleanMessages(request.messages)}\n\nAssistant:{' <think' if enableReasoning else ''}"
         if request.prompt == None
         else request.prompt.strip()
     )
@@ -415,7 +430,7 @@ async def chatResponseStream(
         buffer.append("<think")
         streamConfig = {
-            "isChecking": False,
             "fullTextCursor": 0,
             "in_think": False,
             "cacheStr": "",

 from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
+from fastapi.middleware.gzip import GZipMiddleware
 from api_types import (
     ChatMessage,
     ChatCompletionChoice,
     ChatCompletionMessage,
 )
+from utils import cleanMessages, parse_think_response, remove_nested_think_tags_stack
 class ModelStorage:
     allow_methods=["*"],
     allow_headers=["*"],
 )
+app.add_middleware(GZipMiddleware, minimum_size=1000, compresslevel=5)
 async def runPrefill(
     out,
     model_tokens: List[int],
     model_state,
     max_tokens=2048,
 ):
     args = PIPELINE_ARGS(
             out, temperature=args.temperature, top_p=args.top_p
         )
+        if token == 0 and token in request.stop_tokens:
+            yield {
+                "content": "",
+                "tokens": out_tokens[out_last:],
+                "finish_reason": "stop:token:0",
+                "state": model_state,
+            }
+            del out
+            gc.collect()
+            return
         out, model_state = MODEL_STORAGE[request.model].model.forward(
             [token], model_state
         )
+        model_tokens.append(token)
         if token in request.stop_tokens:
             yield {
                 "content": "",
                 "tokens": out_tokens[out_last:],
+                "finish_reason": f"stop:token:{token}",
                 "state": model_state,
             }
             gc.collect()
             return
+        out_tokens.append(token)
         for xxx in occurrence:
             occurrence[xxx] *= request.penalty_decay
         occurrence[token] = 1 + (occurrence[token] if token in occurrence else 0)
         output_cache.append(tmp)
         output_cache_str = "".join(output_cache)
+        for stop_words in request.stop:
             if stop_words in output_cache_str:
                 yield {
                     "content": tmp.replace(stop_words, ""),
                     "tokens": out_tokens[out_last:],
+                    "finish_reason": f"stop:words:{stop_words}",
                     "state": model_state,
                 }
     createTimestamp = int(time.time())
     prompt = (
+        f"{cleanMessages(request.messages,enableReasoning)}\n\nAssistant:{' <think' if enableReasoning else ''}"
         if request.prompt == None
         else request.prompt.strip()
     )
         buffer.append("<think")
         streamConfig = {
+            "isChecking": False,  # check whether is <think> tag
             "fullTextCursor": 0,
             "in_think": False,
             "cacheStr": "",

utils.py CHANGED Viewed

@@ -24,12 +24,37 @@ def parse_think_response(full_response: str):
     return reasoning_content, content
-def cleanMessages(messages: List[ChatMessage]):
     promptStrList = []
     for message in messages:
         content = message.content.strip()
         content = re.sub(r"\n+", "\n", content)
-        promptStrList.append(f"{message.role.strip()}: {content}")
     return "\n\n".join(promptStrList)

     return reasoning_content, content
def cleanMessages(messages: List[ChatMessage], removeThinkingContent: bool = False):
    """Render chat messages as one ``role: content`` prompt string.

    Each message's content is stripped and every run of newlines is
    collapsed to a single newline; the rendered messages are then joined
    with a blank line between them.

    Args:
        messages: Messages to render, in order. Each must expose ``role``
            and ``content`` string attributes.
        removeThinkingContent: When true, ``<think>...</think>`` regions are
            removed from messages whose role is ``Assistant`` (via
            ``remove_nested_think_tags_stack``).

    Returns:
        The joined prompt text; an empty string when ``messages`` is empty.
    """
    promptStrList = []
    for message in messages:
        # Normalize the role once so the "Assistant" check and the rendered
        # prefix always agree (the role may carry surrounding whitespace).
        role = message.role.strip()
        content = re.sub(r"\n+", "\n", message.content.strip())
        if removeThinkingContent and role == "Assistant":
            content = remove_nested_think_tags_stack(content)
        promptStrList.append(f"{role}: {content}")
    return "\n\n".join(promptStrList)
def remove_nested_think_tags_stack(text):
    """Return ``text`` with every ``<think>...</think>`` region removed.

    Regions may nest; characters are kept only while no ``<think>`` is open.

    Edge cases:
        * A ``</think>`` with no matching opener is copied through unchanged.
        * An unclosed ``<think>`` drops everything that follows it.

    Args:
        text: The string to filter.

    Returns:
        The filtered string (empty when ``text`` is empty).
    """
    open_tag = "<think>"
    close_tag = "</think>"

    depth = 0  # number of currently open <think> tags
    kept = []  # pieces are collected and joined once to avoid quadratic +=
    pos = 0
    end = len(text)
    while pos < end:
        if text.startswith(open_tag, pos):
            depth += 1
            pos += len(open_tag)
        elif text.startswith(close_tag, pos):
            if depth:
                depth -= 1
            else:
                # Unmatched closer: preserve it verbatim.
                kept.append(close_tag)
            pos += len(close_tag)
        else:
            if not depth:
                kept.append(text[pos])
            pos += 1
    return "".join(kept)