Deepti-AI committed on
Commit
fcbfe7d
·
verified ·
1 Parent(s): 477cf68

Update main.py

Browse files

latency times are now printed

Files changed (1) hide show
  1. main.py +58 -25
main.py CHANGED
@@ -3,6 +3,8 @@ os.environ["TRANSFORMERS_CACHE"] = "/app/.cache/transformers"
3
  os.environ["HF_HOME"] = "/app/.cache/huggingface"
4
 
5
  import uvicorn
 
 
6
  from fastapi import FastAPI, File, UploadFile
7
  from fastapi.responses import StreamingResponse
8
  from fastapi.middleware.cors import CORSMiddleware
@@ -173,8 +175,10 @@ async def compute_similarity(query: str, query_embedding: np.ndarray, chunk_text
173
  return combined_score
174
 
175
  async def retrieve_top_k_hybrid(query, k, sem_weight,syn_weight,bm25):
176
-
177
  query_embedding = model.encode(query)
 
 
178
 
179
  tasks = [
180
 
@@ -192,7 +196,7 @@ async def retrieve_top_k_hybrid(query, k, sem_weight,syn_weight,bm25):
192
 
193
  # print("the retrieved chunks are")
194
  # print(top_results["telugu_chunk"].to_list()[0])
195
-
196
  return top_results["telugu_chunk"].to_list()
197
 
198
 
@@ -319,7 +323,13 @@ def tts_chunk_stream(text_chunk: str, lang: str = "en"):
319
 
320
  async def get_rag_response(user_message_english: str, user_message_telugu: str):
321
  global chat_messages
 
322
  Chunks = await retrieve_top_k_hybrid(user_message_english,15, 0.9, 0.1,bm25)
 
 
 
 
 
323
  context = "======================================================================================================\n".join(map(str,Chunks))
324
  chat_messages.append({"role": "user", "content": f'''
325
  Context : {context}
@@ -330,53 +340,74 @@ async def get_rag_response(user_message_english: str, user_message_telugu: str):
330
 
331
  # --- GPT + TTS async generator with smaller buffer like second code ---
332
  async def gpt_tts_stream(prompt: str,telugu_text: str):
333
- # start_time = time.time()
334
- # print("started gpt_tts_stream",prompt)
335
  global chat_messages
336
  chat_messages = await get_rag_response(prompt,telugu_text)
337
  # print(chat_messages,"chat_messages after getting RAG response")
338
-
339
  # response = openai.ChatCompletion.create(
340
  # model="gpt-4o",
341
  # messages= chat_messages,
342
  # stream=True
343
  # )
344
-
345
  bot_response = ""
346
  buffer = ""
347
  buffer_size = 30
348
-
 
 
 
349
  # ✅ Must use the `with` block for streaming
 
350
  with client.chat.completions.stream(
351
  model="gpt-4o",
352
  messages=chat_messages,
353
  ) as stream:
354
-
355
  for event in stream:
 
 
 
 
 
 
356
  if event.type == "content.delta":
357
  delta = event.delta
358
  bot_response = bot_response + delta
359
  buffer += delta
360
  if len(buffer) >= buffer_size and buffer.endswith((".", "!", ",", "?", "\n", ";", ":")):
361
- # print("the buffer is ")
362
- # print(buffer)
 
 
 
 
363
  # audio_chunks = tts_chunk_stream(buffer)
 
364
  for audio_chunk in tts_chunk_stream(buffer):
365
- # print("chunk",buffer)
 
 
 
 
 
 
366
  yield audio_chunk
367
  buffer = ""
368
-
 
 
369
  elif event.type == "content.done":
370
- fll_response = event.content
371
  # 🧾 model finished — flush whatever is left
372
  if buffer.strip():
373
- # print("the left over message")
 
374
  print(buffer.strip())
375
  for audio_chunk in tts_chunk_stream(buffer):
 
376
  # print("chunk",buffer)
377
  yield audio_chunk
378
- buffer = ""
379
-
 
 
380
 
381
  bot_response = bot_response.strip()
382
  # print("the final bot response :")
@@ -385,9 +416,6 @@ async def gpt_tts_stream(prompt: str,telugu_text: str):
385
  # print(fll_response)
386
  chat_messages.append({"role": "assistant", "content": bot_response})
387
 
388
-
389
-
390
-
391
  # def convert_to_mono16_wav_bytes(audio_bytes: bytes) -> tuple[bytes, int]:
392
  # print("i am inside the mono16 conversion")
393
  # """Convert any uploaded audio (mp3/webm/wav) to mono 16-bit WAV bytes in memory."""
@@ -465,7 +493,6 @@ async def gpt_tts_stream(prompt: str,telugu_text: str):
465
  async def chat_stream(file: UploadFile = File(...)):
466
  start_time = time.time()
467
  audio_bytes = await file.read()
468
- print("audio file read")
469
 
470
  transcription = client.audio.transcriptions.create(
471
  model="gpt-4o-transcribe", # or "gpt-4o-mini-transcribe"
@@ -475,8 +502,12 @@ async def chat_stream(file: UploadFile = File(...)):
475
  )
476
 
477
  telugu_text = transcription.text
 
 
 
 
478
  print(f"the text is : {telugu_text}")
479
- print(f"tts time : {time.time()-start_time}")
480
  start_time = time.time()
481
  translation = client.responses.create(
482
  model="gpt-4o-mini",
@@ -487,12 +518,14 @@ async def chat_stream(file: UploadFile = File(...)):
487
  Give only the english translation, These queries are generally relevant to knee replacement surgery. Make sure you correct minor mistakes and return the user query in a proper english.''')
488
 
489
  english_text = translation.output[0].content[0].text
490
- print(f"translation time {time.time() - start_time}")
491
- print(f"the english text is {english_text}")
492
-
493
- return StreamingResponse(gpt_tts_stream(english_text,telugu_text), media_type="audio/mpeg")
494
 
 
 
 
 
495
 
 
496
 
497
  @app.post("/reset_chat")
498
  async def reset_chat():
 
3
  os.environ["HF_HOME"] = "/app/.cache/huggingface"
4
 
5
  import uvicorn
6
+
7
+
8
  from fastapi import FastAPI, File, UploadFile
9
  from fastapi.responses import StreamingResponse
10
  from fastapi.middleware.cors import CORSMiddleware
 
175
  return combined_score
176
 
177
  async def retrieve_top_k_hybrid(query, k, sem_weight,syn_weight,bm25):
178
+ emb_strt = time.time()
179
  query_embedding = model.encode(query)
180
+ emb_end = time.time()
181
+ print("\n\nTime for Query Embedding", emb_end-emb_strt)
182
 
183
  tasks = [
184
 
 
196
 
197
  # print("the retrieved chunks are")
198
  # print(top_results["telugu_chunk"].to_list()[0])
199
+ print("\n\nRetrieval Time", time.time() - emb_end)
200
  return top_results["telugu_chunk"].to_list()
201
 
202
 
 
323
 
324
  async def get_rag_response(user_message_english: str, user_message_telugu: str):
325
  global chat_messages
326
+ start_time = time.time()
327
  Chunks = await retrieve_top_k_hybrid(user_message_english,15, 0.9, 0.1,bm25)
328
+ end_time = time.time()
329
+ # print(f"Retrieval start time : {start_time}")
330
+ # print(f"Retrieval end time : {end_time}")
331
+ # print(f"Retrieval duration is : {end_time - start_time}")
332
+
333
  context = "======================================================================================================\n".join(map(str,Chunks))
334
  chat_messages.append({"role": "user", "content": f'''
335
  Context : {context}
 
340
 
341
  # --- GPT + TTS async generator with smaller buffer like second code ---
342
  async def gpt_tts_stream(prompt: str,telugu_text: str):
 
 
343
  global chat_messages
344
  chat_messages = await get_rag_response(prompt,telugu_text)
345
  # print(chat_messages,"chat_messages after getting RAG response")
 
346
  # response = openai.ChatCompletion.create(
347
  # model="gpt-4o",
348
  # messages= chat_messages,
349
  # stream=True
350
  # )
 
351
  bot_response = ""
352
  buffer = ""
353
  buffer_size = 30
354
+ count1 = 0
355
+ count2 = 0
356
+ count3 = 0
357
+ count4 = 0
358
  # ✅ Must use the `with` block for streaming
359
+ start_time = time.time()
360
  with client.chat.completions.stream(
361
  model="gpt-4o",
362
  messages=chat_messages,
363
  ) as stream:
 
364
  for event in stream:
365
+ if count1 == 0:
366
+ end_time = time.time()
367
+ # print(f"gpt call start time : {start_time}")
368
+ # print(f"gpt response start time : {end_time}")
369
+ print(f"gpt duration for first token : {end_time - start_time}")
370
+ count1 += 1
371
  if event.type == "content.delta":
372
  delta = event.delta
373
  bot_response = bot_response + delta
374
  buffer += delta
375
  if len(buffer) >= buffer_size and buffer.endswith((".", "!", ",", "?", "\n", ";", ":")):
376
+ if count2 == 0:
377
+ count2 += 1
378
+ end_time = time.time()
379
+ # print(f"gpt response first buffer start time : {end_time}")
380
+ print(f"gpt duration for first buffer : {end_time - start_time}")
381
+ print(buffer)
382
  # audio_chunks = tts_chunk_stream(buffer)
383
+ start_time = time.time()
384
  for audio_chunk in tts_chunk_stream(buffer):
385
+ if count3 == 0:
386
+ count3+=1
387
+ end_time = time.time()
388
+ # print(f"tts start time : {start_time}")
389
+ # print(f"tts response first buffer start time : {end_time}")
390
+ print(f"tts duration for first buffer : {end_time - start_time}")
391
+ # print("chunk",buffer)
392
  yield audio_chunk
393
  buffer = ""
394
+ # audio_chunk = tts_chunk_stream(buffer)
395
+ # yield audio_chunk
396
+ # count+=1
397
  elif event.type == "content.done":
 
398
  # 🧾 model finished — flush whatever is left
399
  if buffer.strip():
400
+ start_time = time.time()
401
+ # print(f"the final response time : {start_time}")
402
  print(buffer.strip())
403
  for audio_chunk in tts_chunk_stream(buffer):
404
+
405
  # print("chunk",buffer)
406
  yield audio_chunk
407
+ # buffer = ""
408
+ # audio_chunk = tts_chunk_stream(buffer)
409
+ start_time = time.time()
410
+ # print(f"the final audio time : {start_time}")
411
 
412
  bot_response = bot_response.strip()
413
  # print("the final bot response :")
 
416
  # print(fll_response)
417
  chat_messages.append({"role": "assistant", "content": bot_response})
418
 
 
 
 
419
  # def convert_to_mono16_wav_bytes(audio_bytes: bytes) -> tuple[bytes, int]:
420
  # print("i am inside the mono16 conversion")
421
  # """Convert any uploaded audio (mp3/webm/wav) to mono 16-bit WAV bytes in memory."""
 
493
  async def chat_stream(file: UploadFile = File(...)):
494
  start_time = time.time()
495
  audio_bytes = await file.read()
 
496
 
497
  transcription = client.audio.transcriptions.create(
498
  model="gpt-4o-transcribe", # or "gpt-4o-mini-transcribe"
 
502
  )
503
 
504
  telugu_text = transcription.text
505
+ end_time = time.time()
506
+ # print(f"stt start time :{start_time}")
507
+ # print(f"stt end time : {end_time}")
508
+ print(f"transcription total time : {end_time-start_time}")
509
  print(f"the text is : {telugu_text}")
510
+
511
  start_time = time.time()
512
  translation = client.responses.create(
513
  model="gpt-4o-mini",
 
518
  Give only the english translation, These queries are generally relevant to knee replacement surgery. Make sure you correct minor mistakes and return the user query in a proper english.''')
519
 
520
  english_text = translation.output[0].content[0].text
521
+ end_time = time.time()
 
 
 
522
 
523
+ # print(f"translation start time :{start_time}")
524
+ # print(f"translation end time : {end_time}")
525
+ print(f"translation total time : {end_time-start_time}")
526
+ print(f"the english text is : {english_text}")
527
 
528
+ return StreamingResponse(gpt_tts_stream(english_text,telugu_text), media_type="audio/mpeg")
529
 
530
  @app.post("/reset_chat")
531
  async def reset_chat():