# --- Earlier attempt: plain transformers fp16 load (HQQ quantization left commented out) ---
# import gc
# import gradio as gr
# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM  # , HqqConfig
# # quant_config = HqqConfig(nbits=8, group_size=64)
# MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# print("Loading tokenizer & model…")
# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# # model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to(DEVICE)
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_ID,
#     torch_dtype=torch.float16,
#     # device_map="cuda",
#     # quantization_config=quant_config,
# ).to(DEVICE)
# gc.collect()
#########
# --- Earlier attempt: torchao float8 quantization via TorchAoConfig ---
# import gc
# import torch
# from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
# from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, Float8WeightOnlyConfig
# # quant_config = Float8WeightOnlyConfig()
# quant_config = Float8DynamicActivationFloat8WeightConfig()
# quantization_config = TorchAoConfig(quant_type=quant_config)
# MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_ID,
#     torch_dtype="auto",
#     device_map="auto",
#     quantization_config=quantization_config,
# )
# gc.collect()
#########
# --- Earlier attempt: Unsloth 4-bit (bitsandbytes) load of Llama-3.2-3B-Instruct ---
# from unsloth import FastLanguageModel
# model, tokenizer = FastLanguageModel.from_pretrained(
#     "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
#     max_seq_length=128_000,
#     load_in_4bit=True,
# )
#########
# --- Earlier attempt: Optimum ONNX Runtime export + dynamic quantization (AVX512-VNNI) ---
# import gc
# import gradio as gr
# from transformers import AutoTokenizer
# from optimum.onnxruntime import ORTModelForCausalLM, ORTQuantizer
# from optimum.onnxruntime.configuration import AutoQuantizationConfig
# MODEL_NAME = "HuggingFaceTB/SmolLM3-3B"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# model = ORTModelForCausalLM.from_pretrained(MODEL_NAME, export=True)
# print("Creating quant config")
# qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)
# print("Creating quant config successful")
# print("Creating quantizer")
# quantizer = ORTQuantizer.from_pretrained(model)
# print("Creating quantizer successful")
# # Step 4: perform quantization, saving the output in a new directory
# quantized_model_dir = "./quantized_model"
# print("Starting quantization...")
# quantizer.quantize(save_dir=quantized_model_dir, quantization_config=qconfig)
# print("Quantization was successful. Garbage collecting...")
# del quantizer
# del qconfig
# del model
# # Run garbage collection to release memory held by the quantizer objects
# gc.collect()
# # Step 5: load the quantized ONNX model for inference
# print("Loading quantized ONNX model for inference...")
# model = ORTModelForCausalLM.from_pretrained(quantized_model_dir)
# print("Loading model was successful. Garbage collecting.")
# # Garbage collection again after the final load
# gc.collect()
#########
# --- Earlier attempt: Optimum ONNX Runtime export with quantize=True ---
# print("Loading tokenizer & model…")
# import gradio as gr
# from transformers import AutoTokenizer
# from optimum.onnxruntime import ORTModelForCausalLM
# MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# model = ORTModelForCausalLM.from_pretrained(MODEL_ID, export=True, quantize=True)
#########
# -------------------------------------------------
# Optional tool(s)
# -------------------------------------------------
# TOOLS = [{
#     "name": "get_weather",
#     "description": "Get the current weather in a given city",
#     "parameters": {
#         "type": "object",
#         "properties": {
#             "city": {"type": "string", "description": "City name"}
#         },
#         "required": ["city"]
#     }
# }]
# -------------------------------------------------
# Helpers
# -------------------------------------------------
# def build_messages(history, enable_thinking: bool):
#     """Convert Gradio history to the chat template."""
#     messages = []
#     for h in history:
#         messages.append({"role": h["role"], "content": h["content"]})
#     # Add system instruction for mode
#     system_flag = "/think" if enable_thinking else "/no_think"
#     messages.insert(0, {"role": "system", "content": system_flag})
#     return messages
# def chat_fn(history, enable_thinking, temperature, top_p, top_k, repetition_penalty, max_new_tokens):
#     """Generate a response and stream it back to the UI."""
#     messages = build_messages(history, enable_thinking)
#     text = tokenizer.apply_chat_template(
#         messages,
#         tokenize=False,
#         add_generation_prompt=True,
#         # xml_tools=TOOLS
#     )
#     inputs = tokenizer(text, return_tensors="pt")
#     gc.collect()
#     with torch.inference_mode():
#         generated = model.generate(
#             **inputs,
#             max_new_tokens=max_new_tokens,
#             do_sample=True,
#             temperature=temperature,
#             top_p=top_p,
#             top_k=top_k,
#             repetition_penalty=repetition_penalty,
#             pad_token_id=tokenizer.eos_token_id,
#             streamer=None  # we'll yield manually
#         )
#     gc.collect()
#     output_ids = generated[0][len(inputs.input_ids[0]):]
#     response = tokenizer.decode(output_ids, skip_special_tokens=True)
#     if isinstance(response, str):
#         response = response.replace('<think>', "# <think>").replace('</think>', "</think>")
#     elif isinstance(response, list):
#         response = [part.replace('<think>', "# <think>").replace('</think>', "</think>") for part in response]
#     else:
#         raise ValueError("Decoded response is neither a string nor a list")
#     # stream char-by-char
#     history.append({"role": "assistant", "content": ""})
#     for ch in response:
#         history[-1]["content"] += ch
#         yield history
# # -------------------------------------------------
# # Blocks UI
# # -------------------------------------------------
# with gr.Blocks(title="SmolLM3-3B Chat") as demo:
#     gr.Markdown("## 🤖 SmolLM3-3B Chatbot (Streaming)")
#     with gr.Row():
#         enable_think = gr.Checkbox(label="Enable Extended Thinking (/think)", value=True)
#         temperature = gr.Slider(0.0, 1.0, value=0.6, label="Temperature")
#         top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")
#         top_k = gr.Slider(1, 40, value=20, label="Top_k")
#         repetition_penalty = gr.Slider(1.0, 1.4, value=1.1, label="Repetition_Penalty")
#         max_new_tokens = gr.Slider(1000, 32768, value=32768, label="Max_New_Tokens")
#     chatbot = gr.Chatbot(type="messages")
#     msg = gr.Textbox(placeholder="Type your message here…", lines=1)
#     clear = gr.Button("Clear")
#
#     def user_fn(user_msg, history):
#         return "", history + [{"role": "user", "content": user_msg}]
#
#     msg.submit(
#         user_fn, [msg, chatbot], [msg, chatbot], queue=False
#     ).then(
#         chat_fn, [chatbot, enable_think, temperature, top_p, top_k, repetition_penalty, max_new_tokens], chatbot
#     )
#     clear.click(lambda: None, None, chatbot, queue=False)
# demo.queue().launch()
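# -------------------------------------------------
# The commented-out blocks above are earlier loading experiments kept for
# reference (plain transformers fp16, torchao float8, Unsloth 4-bit, and
# Optimum ONNX Runtime quantization). The active app below serves a 4-bit
# GGUF build of SmolLM3-3B through llama-cpp-python behind a Gradio chat UI.
# -------------------------------------------------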
import gc
from pathlib import Path

import gradio as gr
import pandas as pd
from docx import Document
from llama_cpp import Llama
from pypdf import PdfReader

MAX_TOKENS = 10_000
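# Fetch the 4-bit (Q4_K_M) GGUF build of SmolLM3-3B from the Hub and load it
# with llama-cpp-python's defaults (all layers on CPU); n_ctx caps the context
# window shared by the prompt and the reply.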
llm = Llama.from_pretrained(
    repo_id="unsloth/SmolLM3-3B-GGUF",
    filename="SmolLM3-3B-Q4_K_M.gguf",
    n_ctx=MAX_TOKENS,
)
gc.collect()
# ---------- helpers ----------
def read_file(p: Path) -> str:
    """Best-effort text extraction for PDF, Excel, Word, and plain-text files."""
    try:
        suffix = p.suffix.lower()
        if suffix == ".pdf":
            with p.open("rb") as f:
                reader = PdfReader(f)
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        elif suffix in (".xlsx", ".xls"):
            # Concatenate the text rendering of every sheet in the workbook
            sheets = pd.read_excel(p, sheet_name=None)
            return "\n".join(df.to_string() for df in sheets.values())
        elif suffix == ".docx":
            with p.open("rb") as f:
                doc = Document(f)
                return "\n".join(para.text for para in doc.paragraphs)
        else:
            return p.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return "[could not read file]"
def build_messages(history, enable_thinking: bool):
    """Convert Gradio chat history into llama.cpp messages, prepending the
    /think or /no_think system flag that toggles SmolLM3's extended thinking."""
    messages = []
    for h in history:
        messages.append({"role": h["role"], "content": h["content"]})
    system_flag = "/think" if enable_thinking else "/no_think"
    messages.insert(0, {"role": "system", "content": system_flag})
    return messages


def chat_fn(history, enable_thinking, temperature, top_p, top_k,
            repetition_penalty, max_new_tokens):
    messages = build_messages(history, enable_thinking)
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=int(max_new_tokens),  # Gradio sliders can return floats; llama.cpp wants integer counts
        temperature=temperature,
        top_p=top_p,
        top_k=int(top_k),
        repeat_penalty=repetition_penalty,
    )
    response_text = response['choices'][0]['message']['content']
    if isinstance(response_text, str):
        response = response_text.replace('<think>', "# <think>").replace('</think>', "</think>")
    elif isinstance(response_text, list):
        response = [t.replace('<think>', "# <think>").replace('</think>', "</think>") for t in response_text]
    else:
        raise ValueError("Malformed response from the model: neither a string nor a list")
    # Simulate streaming by growing the last assistant message character by character
    history.append({"role": "assistant", "content": ""})
    for ch in response:
        history[-1]["content"] += ch
        yield history
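# -------------------------------------------------
# Untested sketch (kept commented out, like the alternatives above):
# llama-cpp-python can also stream tokens as they are generated by passing
# stream=True, which yields OpenAI-style chunks whose choices[0]["delta"] may
# carry a "content" piece. A drop-in variant of chat_fn could look like this;
# the name chat_fn_stream is ours, not part of any library.
# -------------------------------------------------
# def chat_fn_stream(history, enable_thinking, temperature, top_p, top_k,
#                    repetition_penalty, max_new_tokens):
#     messages = build_messages(history, enable_thinking)
#     history.append({"role": "assistant", "content": ""})
#     for chunk in llm.create_chat_completion(
#         messages=messages,
#         max_tokens=int(max_new_tokens),
#         temperature=temperature,
#         top_p=top_p,
#         top_k=int(top_k),
#         repeat_penalty=repetition_penalty,
#         stream=True,
#     ):
#         piece = chunk["choices"][0]["delta"].get("content", "")
#         history[-1]["content"] += piece
#         yield history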
# ---------- UI ----------
with gr.Blocks(title="SmolLM3-3B Chat") as demo:
    gr.Markdown("## 🤖 SmolLM3-3B Chatbot (Streaming)")
    with gr.Row():
        enable_think = gr.Checkbox(label="Enable Extended Thinking (/think)", value=True)
        temperature = gr.Slider(0.0, 1.0, value=0.6, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")
        top_k = gr.Slider(1, 40, value=20, label="Top-k")
        repetition_penalty = gr.Slider(1.0, 1.4, value=1.1, label="Repetition Penalty")
        max_new_tokens = gr.Slider(1000, MAX_TOKENS, value=MAX_TOKENS, label="Max New Tokens")
    chatbot = gr.Chatbot(type="messages")
    with gr.Row():
        msg = gr.Textbox(placeholder="Type your message here…", lines=1, scale=8)
        send_btn = gr.Button("Send", scale=1)
    file_uploader = gr.File(label="Attach file(s)", file_count="multiple", file_types=None)
    clear = gr.Button("Clear")

    def user_fn(user_msg, history, files):
        if files:
            file_contents = "\n\n".join(read_file(Path(fp)) for fp in files)
            user_msg += f"\n\n# FILE CONTENT:\n\n{file_contents}"
        return "", history + [{"role": "user", "content": user_msg}], None  # clear file_uploader

    # Submit on button click or Enter key
    for trigger in (msg.submit, send_btn.click):
        trigger(
            user_fn, [msg, chatbot, file_uploader], [msg, chatbot, file_uploader], queue=False
        ).then(
            chat_fn,
            [chatbot, enable_think, temperature, top_p, top_k, repetition_penalty, max_new_tokens],
            chatbot,
        )
    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue().launch()