update
- components/llm/common.py +2 -1
- components/llm/deepinfra_api.py +48 -30
- routes/llm.py +29 -7
components/llm/common.py
CHANGED
@@ -72,8 +72,9 @@ class LlmApi:
 class Message(BaseModel):
     role: str
     content: str
-    searchResults: str
+    searchResults: Optional[str] = ''
     searchEntities: Optional[List[str]] = []
+    reasoning: Optional[str] = ''
 
 class ChatRequest(BaseModel):
     history: List[Message]
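For reference, a minimal sketch of how the updated model behaves; the field names come from the diff above, while the imports and example values are assumptions:

from typing import List, Optional
from pydantic import BaseModel

class Message(BaseModel):
    role: str
    content: str
    searchResults: Optional[str] = ''   # was a required str before this commit
    searchEntities: Optional[List[str]] = []
    reasoning: Optional[str] = ''       # new field added by this commit

class ChatRequest(BaseModel):
    history: List[Message]

# Clients that never send searchResults or reasoning keep working,
# because both fields now default to an empty string.
req = ChatRequest(history=[Message(role="user", content="hello")])
print(req.history[0].searchResults)  # ''
print(req.history[0].reasoning)      # ''

Making searchResults optional with an empty-string default also matters for routes/llm.py below, where the collapse helper calls .strip() on it unconditionally.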
components/llm/deepinfra_api.py
CHANGED
@@ -257,7 +257,7 @@ class DeepInfraApi(LlmApi):
             logging.error(f"Request failed: status code {response.status_code}")
             logging.error(response.text)
 
-    async def predict_chat_stream(self, request: ChatRequest, system_prompt, params: LlmPredictParams, max_retries: int = …
+    async def predict_chat_stream(self, request: ChatRequest, system_prompt, params: LlmPredictParams, max_retries: int = 5, retry_delay: float = 2) -> str:
         """
         Performs a request to the API with streaming output (SSE) support and returns the result.
 
@@ -271,7 +271,9 @@ class DeepInfraApi(LlmApi):
         Returns:
             str: The generated text.
         """
-
+
+        timeout = httpx.Timeout(connect=30.0, read=None, pool=None, write=None, timeout=None)
+        async with httpx.AsyncClient(timeout=timeout) as client:
         request = self.create_chat_request(request, system_prompt, params)
         request["stream"] = True
 
@@ -312,45 +314,61 @@ class DeepInfraApi(LlmApi):
 
 
     async def get_predict_chat_generator(self, request: ChatRequest, system_prompt: str,
-                                         …
+                                         params: LlmPredictParams, max_retries: int = 5, retry_delay: float = 2) -> AsyncGenerator[str, None]:
         """
-        Performs a streaming request to the API and returns tokens as they …
+        Performs a streaming request to the API and yields tokens as they are generated, reconnecting on errors.
 
         Args:
             request (ChatRequest): Chat history.
             system_prompt (str): System prompt.
             params (LlmPredictParams): Prediction parameters.
+            max_retries (int): Maximum number of reconnection attempts.
+            retry_delay (float): Delay between attempts, in seconds.
 
         Yields:
             str: LLM response tokens.
         """
+        print(request.history)
         timeout = httpx.Timeout(connect=30.0, read=None, pool=None, write=None, timeout=None)
-        …
+        attempt = 0
+
+        while attempt < max_retries + 1:
+            try:
+                async with httpx.AsyncClient(timeout=timeout) as client:
+                    request_data = self.create_chat_request(request, system_prompt, params)
+                    request_data["stream"] = True
+
+                    async with client.stream(
+                        "POST",
+                        f"{self.params.url}/v1/openai/chat/completions",
+                        json=request_data,
+                        headers=super().create_headers(),
+                    ) as response:
+                        if response.status_code != 200:
+                            error_content = await response.aread()
+                            raise Exception(f"API error: {error_content.decode('utf-8')}")
+
+                        async for line in response.aiter_lines():
+                            if line.startswith("data: "):
+                                try:
+                                    data = json.loads(line[len("data: "):].strip())
+                                    if data == "[DONE]":
+                                        return  # Finish the generation successfully
+                                    if "choices" in data and data["choices"]:
+                                        token_value = data["choices"][0].get("delta", {}).get("content", "")
+                                        if token_value:
+                                            yield token_value
+                                except json.JSONDecodeError:
+                                    continue
+                return  # Stream processed successfully
+
+            except Exception as e:
+                attempt += 1
+                if attempt == max_retries + 1:
+                    raise Exception(f"predict_chat_stream failed after {max_retries} retries: {str(e)}")
+                # Wait before the next attempt
+                await asyncio.sleep(retry_delay)
+
 
     async def predict(self, prompt: str, system_prompt: str) -> str:
         """
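For context, a minimal consumption sketch for the new retrying generator; llm_api, chat_request, system_prompt and params are placeholders for objects constructed elsewhere and are not part of this diff:

import asyncio

async def demo(llm_api, chat_request, system_prompt, params):
    # Forward tokens as they arrive, exactly as an SSE route would do.
    async for token in llm_api.get_predict_chat_generator(chat_request, system_prompt, params):
        print(token, end="", flush=True)

# asyncio.run(demo(llm_api, chat_request, system_prompt, params))

Because the retry loop restarts the whole request, a reconnect after a mid-stream failure resends the same prompt, so a consumer may receive overlapping output; callers that need exactly-once delivery would have to deduplicate on their side.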
routes/llm.py
CHANGED
@@ -82,12 +82,21 @@ def try_insert_search_results(
         return True
     return False
 
+def try_insert_reasoning(
+    chat_request: ChatRequest, reasoning: str
+):
+    for msg in reversed(chat_request.history):
+        if msg.role == "user":
+            msg.reasoning = reasoning
+
 def collapse_history_to_first_message(chat_request: ChatRequest) -> ChatRequest:
     """
     Collapses the history into the first message and returns a new ChatRequest object.
     Format:
-    <search-results>[Source] - text</search-results>
     role: message text
+    <reasoning>[Source] - text</reasoning>
+    <search-results>[Source] - text</search-results>
+
     """
     if not chat_request.history:
         return ChatRequest(history=[])
@@ -95,12 +104,15 @@ def collapse_history_to_first_message(chat_request: ChatRequest) -> ChatRequest:
     # Collect the history into a single string
     collapsed_content = []
     for msg in chat_request.history:
-        # Add search-results, if present
-        if msg.searchResults:
-            collapsed_content.append(f"<search-results>{msg.searchResults}</search-results>")
         # Add the message text with its role
         if msg.content.strip():
-            collapsed_content.append(f"{msg.role}: {msg.content.strip()}")
+            collapsed_content.append(f"{msg.role.strip()}: {msg.content.strip()}")
+        # Add reasoning, if present
+        if msg.reasoning.strip():
+            collapsed_content.append(f"<reasoning>{msg.reasoning}</reasoning>")
+        # Add search-results, if present
+        if msg.searchResults.strip():
+            collapsed_content.append(f"<search-results>{msg.searchResults}</search-results>")
 
     # Build the final text with line breaks
     new_content = "\n".join(collapsed_content)
@@ -122,9 +134,19 @@ async def sse_generator(request: ChatRequest, llm_api: DeepInfraApi, system_prom
     Generator for streaming the LLM response via SSE.
     """
     try:
-        qe_result = await dialogue_service.get_qe_result(request.history)
+        qe_result = await dialogue_service.get_qe_result(request.history)
+        try_insert_reasoning(request, qe_result.debug_message)
+
+        # qe_debug_event = {
+        #     "event": "debug",
+        #     "data": {
+        #         "text": qe_result.debug_message
+        #     }
+        # }
+        # yield f"data: {json.dumps(qe_debug_event, ensure_ascii=False)}\n\n"
+
         qe_event = {
-            "event": "…
+            "event": "reasoning",
             "data": {
                 "text": qe_result.debug_message
             }