Spaces:

Locutusque
/

Locutusque-Models

Running on Zero

App Files Files Community

Locutusque committed on Aug 28

Commit

7462772

verified ·

1 Parent(s): 2b65eee

Update app.py

Browse files

Files changed (1) hide show

app.py +111 -25

app.py CHANGED Viewed

@@ -1,13 +1,65 @@
 import spaces
 import gradio as gr
-from transformers import pipeline, AutoTokenizer, TextIteratorStreamer
 import torch
 from threading import Thread
 import os
 @spaces.GPU()
-def load_model(model_name):
-    return pipeline("text-generation", model=model_name, device_map="cuda", torch_dtype=torch.bfloat16, trust_remote_code=True, token=os.environ["token"], use_fast=True)
 @spaces.GPU(duration=45)
 def generate(
     message,
@@ -21,27 +73,32 @@ def generate(
     max_new_tokens=256,
 ):
     try:
-        pipe = load_model(model_name)
-        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=os.environ["token"])
-        tokenizer.eos_token = "<|im_end|>"
-        print(tokenizer)
-        pipe.tokenizer = tokenizer
         prompt = f"<|im_start|>system\n{system}<|im_end|>\n"
         for (user_turn, assistant_turn) in history:
             prompt += f"<|im_start|>user\n{user_turn}<|im_end|>\n<|im_start|>assistant\n{assistant_turn}<|im_end|>\n"
         prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
-        streamer = TextIteratorStreamer(pipe.tokenizer, timeout=240.0, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = dict(
-            text_inputs=prompt,
-            streamer=streamer,
-            max_new_tokens=max_new_tokens,
-            do_sample=True,
-            top_p=top_p,
-            min_p=min_p,
-            top_k=top_k,
-            temperature=temperature,
-            num_beams=1,
             repetition_penalty=1.1
         )
@@ -52,28 +109,57 @@ def generate(
         for chunk in streamer:
             outputs.append(chunk)
             yield "".join(outputs)
     except StopAsyncIteration:
         print("Stream stopped unexpectedly.")
         yield "".join(outputs)
     except Exception as e:
         print(f"An error occurred: {e}")
-        yield "An error occurred during generation."
-model_choices = ["Locutusque/Hercules-6.1-Llama-3.1-8B", "Locutusque/liberalis-cogitator-mistral-nemo-2407", "Locutusque/Hercules-6.9-Llama-3.1-8B", "Locutusque/lmsys-llama-3.2-3b", "Locutusque/CollectiveLM-Falcon-3-7B", "Locutusque/StockQwen-2.5-7B"]
-# What are the best options?
 g = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
-        gr.components.Dropdown(choices=model_choices, label="Model", value=model_choices[0], interactive=True),
-        gr.components.Textbox(lines=2, label="System Prompt", value="You are a sentient, super intelligent AI developed by a man named Locutusque."),
         gr.components.Slider(minimum=0, maximum=2, value=0.8, label="Temperature"),
         gr.components.Slider(minimum=0, maximum=1, value=0.95, label="Top p"),
         gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Min P"),
         gr.components.Slider(minimum=0, maximum=100, step=1, value=15, label="Top k"),
-        gr.components.Slider(minimum=1, maximum=8192, step=1, value=1024, label="Max tokens"),
     ],
     title="Locutusque's Language Models",
     description="Try out Locutusque's language models here! Credit goes to Mediocreatmybest for this space. You may also find some experimental preview models that have not been made public here.",
 )
 if __name__ == "__main__":
-    g.launch()

 import spaces
 import gradio as gr
+from transformers import pipeline, AutoTokenizer, TextIteratorStreamer, AutoModelForCausalLM
 import torch
 from threading import Thread
 import os
+# Global dictionaries mapping model name -> preloaded model / tokenizer
+LOADED_MODELS = {}
+LOADED_TOKENIZERS = {}
+def preload_models(model_choices):  # model_choices: iterable of model names; fills LOADED_MODELS / LOADED_TOKENIZERS, returns nothing
+    """Preload all models to CPU at startup"""
+    print("Preloading models to CPU...")
+    for model_name in model_choices:
+        try:
+            print(f"Loading {model_name}...")
+            # Load the weights onto the CPU (device_map="cpu") in bfloat16 to save memory
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                torch_dtype=torch.bfloat16,
+                trust_remote_code=True,
+                token=os.environ.get("token"),  # None when the "token" environment variable is unset
+                device_map="cpu",
+                low_cpu_mem_usage=True
+            )
+            # Load the matching tokenizer with the same access token
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_name,
+                trust_remote_code=True,
+                token=os.environ.get("token")
+            )
+            tokenizer.eos_token = "<|im_end|>"  # every tokenizer gets the same end-of-turn marker that generate() writes into its prompt
+            LOADED_MODELS[model_name] = model
+            LOADED_TOKENIZERS[model_name] = tokenizer
+            print(f"Successfully loaded {model_name}")
+        except Exception as e:  # best-effort: a failed model is logged and skipped, so it never appears in LOADED_MODELS
+            print(f"Failed to load {model_name}: {e}")
 @spaces.GPU()  # NOTE(review): generate() calls this helper and is itself decorated with @spaces.GPU - confirm the nested decoration is intended
+def get_model_pipeline(model_name):  # returns a (pipeline, model) tuple for a preloaded model name
+    """Move selected model to GPU and create pipeline"""
+    if model_name not in LOADED_MODELS:
+        raise ValueError(f"Model {model_name} not found in preloaded models")  # the name was never preloaded, or its preload failed
+    # Move the cached model to the GPU. NOTE(review): torch's Module.to() presumably moves the module in place and returns it, so the LOADED_MODELS entry would be on the GPU too until the caller moves it back - confirm
+    model = LOADED_MODELS[model_name].to("cuda")
+    tokenizer = LOADED_TOKENIZERS[model_name]
+    # Wrap the GPU model and its preloaded tokenizer in a text-generation pipeline
+    pipe = pipeline(
+        "text-generation",
+        model=model,
+        tokenizer=tokenizer,
+        torch_dtype=torch.bfloat16,  # NOTE(review): the model object was already loaded in bfloat16 by preload_models - confirm this argument still has an effect
+        device="cuda"
+    )
+    return pipe, model  # the model is returned too so that generate() can move it back to the CPU afterwards
 @spaces.GPU(duration=45)
 def generate(
     message,
     max_new_tokens=256,
 ):
     try:
+        # Get the pipeline with model on GPU
+        pipe, gpu_model = get_model_pipeline(model_name)
+        # Build the prompt
         prompt = f"<|im_start|>system\n{system}<|im_end|>\n"
         for (user_turn, assistant_turn) in history:
             prompt += f"<|im_start|>user\n{user_turn}<|im_end|>\n<|im_start|>assistant\n{assistant_turn}<|im_end|>\n"
         prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
+        streamer = TextIteratorStreamer(
+            pipe.tokenizer,
+            timeout=240.0,
+            skip_prompt=True,
+            skip_special_tokens=True
+        )
         generation_kwargs = dict(
+            text_inputs=prompt,
+            streamer=streamer,
+            max_new_tokens=max_new_tokens,
+            do_sample=True,
+            top_p=top_p,
+            min_p=min_p,
+            top_k=top_k,
+            temperature=temperature,
+            num_beams=1,
             repetition_penalty=1.1
         )
         for chunk in streamer:
             outputs.append(chunk)
             yield "".join(outputs)
+        # Move model back to CPU after inference to free GPU memory
+        gpu_model.to("cpu")
+        torch.cuda.empty_cache()
     except StopAsyncIteration:
         print("Stream stopped unexpectedly.")
         yield "".join(outputs)
     except Exception as e:
         print(f"An error occurred: {e}")
+        yield f"An error occurred during generation: {str(e)}"
+    finally:
+        # Ensure model is moved back to CPU even if there's an error
+        if 'gpu_model' in locals():
+            gpu_model.to("cpu")
+            torch.cuda.empty_cache()
+# Model names handed to from_pretrained() by preload_models() and offered in the "Model" dropdown
+model_choices = [
+    "Locutusque/Hercules-6.1-Llama-3.1-8B",
+    "Locutusque/liberalis-cogitator-mistral-nemo-2407",
+    "Locutusque/lmsys-llama-3.2-3b"
+]
+# Preload all models to CPU at startup; this runs at import time, before the interface below is built
+preload_models(model_choices)
+# Create the Gradio chat interface; generate() is the chat handler and additional_inputs supplies its extra controls
 g = gr.ChatInterface(
     fn=generate,
     additional_inputs=[  # NOTE(review): presumably listed in the order generate() declares its extra parameters (signature not fully visible here) - confirm
+        gr.components.Dropdown(
+            choices=model_choices,
+            label="Model",
+            value=model_choices[0],  # the first entry of model_choices is the default selection
+            interactive=True
+        ),
+        gr.components.Textbox(
+            lines=2,
+            label="System Prompt",
+            value="You are a sentient, super intelligent AI developed by a man named Locutusque."
+        ),
         gr.components.Slider(minimum=0, maximum=2, value=0.8, label="Temperature"),
         gr.components.Slider(minimum=0, maximum=1, value=0.95, label="Top p"),
         gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Min P"),
         gr.components.Slider(minimum=0, maximum=100, step=1, value=15, label="Top k"),
+        gr.components.Slider(minimum=1, maximum=8192, step=1, value=1024, label="Max tokens"),  # UI default is 1024, while generate() itself defaults max_new_tokens to 256
     ],
     title="Locutusque's Language Models",
     description="Try out Locutusque's language models here! Credit goes to Mediocreatmybest for this space. You may also find some experimental preview models that have not been made public here.",
 )
 if __name__ == "__main__":
+    g.launch()  # start the app only when the file is run as a script