import logging

from fastapi import FastAPI
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.llms.llama_cpp import LlamaCPP
from transformers import AutoTokenizer
from llama_index.core import set_global_tokenizer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex

logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO
)
logger = logging.getLogger(__name__)
| logger.info("Запускаемся... 🥳🥳🥳") | |
| app = FastAPI() | |
| model_url = "https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF/resolve/main/qwen2.5-7b-instruct-q3_k_m.gguf" | |
| SYSTEM_PROMPT = '' | |
| tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct") | |
| set_global_tokenizer( | |
| AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct").encode | |
| ) | |
| # embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2") | |
| # | |
| # documents = SimpleDirectoryReader("./data/").load_data() | |
def messages_to_prompt(messages):
    # Convert llama-index chat messages into the Qwen chat template
    messages = [{"role": m.role.value, "content": m.content} for m in messages]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt


def completion_to_prompt(completion):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT or "The answer should be accurate, concise, and with a touch of humor."},
        {"role": "user", "content": completion},
    ]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt
llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically
    model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # Qwen2.5 supports a much larger context window, but we keep it small to leave some headroom
    context_window=2046,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # n_gpu_layers=-1 offloads all layers to the GPU; set it to 0 for CPU-only
    model_kwargs={"n_gpu_layers": -1, "num_return_sequences": 1, "no_repeat_ngram_size": 2, "n_threads": 2},
    # transform inputs into the Qwen chat format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
memory = ChatMemoryBuffer.from_defaults(token_limit=3900)

# Build the index; pass the local embed_model explicitly so the default embedding model is not resolved
index = VectorStoreIndex.from_documents([], embed_model=embed_model)

chat_engine = index.as_chat_engine(
    chat_mode="context",
    memory=memory,
    llm=llm,
    context_prompt=(
        "You are a chatbot, able to interact normally.\n"
        "Use the previous chat history, or the context above, to interact with the user and help them."
    )
)
def generate_response(completion_response):
    try:
        response_text = completion_response.text
        return response_text.strip() if response_text else "Empty response"
    except Exception as e:
        logger.error(f"Error while processing the response: {str(e)}")
        return "Generation error"


# NOTE: the route paths below are assumed from the log messages and the standard
# FastAPI Space template; without these decorators FastAPI never exposes the handlers.
@app.get("/")
def greet_json():
    return {"Hello": "World!"}
@app.post("/system-prompt")
async def set_system_prompt(text: str):
    logger.info('post/system-prompt')
    # store the new prompt so completion_to_prompt picks it up
    global SYSTEM_PROMPT
    SYSTEM_PROMPT = text
@app.post("/predict")
async def predict(text: str):
    # Generate a response with the model
    logger.info('post/predict')
    logger.info('REQUEST:')
    logger.info(text)
    # response = llm.complete(text)
    response = chat_engine.chat(text)
    logger.info('RESPONSE:')
    logger.info(response)
    # text_response = generate_response(response)
    # return {"response": text_response}
    # chat() returns an AgentChatResponse object; convert it to a string for the JSON payload
    return {"response": str(response)}