Spaces: Running on Zero

import spaces
import gradio as gr
from transformers import pipeline, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread
from queue import Empty
import os
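
# ZeroGPU note: on a "Running on Zero" Space, a GPU is attached only while a
# @spaces.GPU-decorated function executes; `duration=` caps that window in
# seconds. Both model loading and generation therefore run under decorators.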
@spaces.GPU()
def load_model(model_name):
    # Build a text-generation pipeline on the GPU allocated by ZeroGPU.
    return pipeline(
        "text-generation", model=model_name, device_map="cuda",
        torch_dtype=torch.bfloat16, trust_remote_code=True,
        token=os.environ["token"], use_fast=True,
    )
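
# Design note: generate() below reloads the pipeline on every request. A
# minimal cache sketch, assuming enough memory for the models a session
# touches and that ZeroGPU tolerates pipelines outliving one allocation
# (untested here; `_PIPES` and `cached_model` are hypothetical names):
#
#   _PIPES = {}
#   def cached_model(model_name):
#       if model_name not in _PIPES:
#           _PIPES[model_name] = load_model(model_name)
#       return _PIPES[model_name]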
@spaces.GPU(duration=45)
def generate(
    message,
    history,
    model_name,
    system,
    temperature=0.4,
    top_p=0.95,
    min_p=0.1,
    top_k=50,
    max_new_tokens=256,
):
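    # The defaults above are only fallbacks; gr.ChatInterface always passes
    # the current slider values, so those are what arrive per request.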
    outputs = []
    try:
        pipe = load_model(model_name)
        tokenizer = AutoTokenizer.from_pretrained(
            model_name, trust_remote_code=True, token=os.environ["token"]
        )
        # This Space formats every model with ChatML, so <|im_end|> marks
        # the end of a turn.
        tokenizer.eos_token = "<|im_end|>"
        print(tokenizer)
        pipe.tokenizer = tokenizer
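        # The prompt follows ChatML; a single-turn conversation renders as:
        #   <|im_start|>system\n{system}<|im_end|>\n
        #   <|im_start|>user\n{message}<|im_end|>\n
        #   <|im_start|>assistant\n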
prompt = f"<|im_start|>system\n{system}<|im_end|>\n"
for (user_turn, assistant_turn) in history:
prompt += f"<|im_start|>user\n{user_turn}<|im_end|>\n<|im_start|>assistant\n{assistant_turn}<|im_end|>\n"
prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
        streamer = TextIteratorStreamer(
            pipe.tokenizer, timeout=240.0, skip_prompt=True, skip_special_tokens=True
        )
        generation_kwargs = dict(
            text_inputs=prompt,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=top_p,
            min_p=min_p,
            top_k=top_k,
            temperature=temperature,
            num_beams=1,
            repetition_penalty=1.1,
        )
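        # pipe() blocks until generation finishes, so it runs on a worker
        # thread while this thread drains the streamer and yields the growing
        # transcript, letting Gradio re-render the reply as tokens arrive.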
        t = Thread(target=pipe, kwargs=generation_kwargs)
        t.start()
        for chunk in streamer:
            outputs.append(chunk)
            yield "".join(outputs)
    except Empty:
        # The streamer raises queue.Empty when no text arrives within its timeout.
        print("Stream stopped unexpectedly.")
        yield "".join(outputs)
    except Exception as e:
        print(f"An error occurred: {e}")
        yield "An error occurred during generation."
model_choices = [
    "Locutusque/Hercules-6.1-Llama-3.1-8B",
    "Locutusque/liberalis-cogitator-mistral-nemo-2407",
    "Locutusque/liberalis-cogitator-smollm3-3b",
    "Locutusque/lmsys-llama-3.2-3b",
    "Locutusque/CollectiveLM-Falcon-3-7B",
    "Locutusque/StockQwen-2.5-7B",
]
# What are the best options?
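# Note: additional_inputs are passed to fn positionally after (message,
# history), so their order below must match generate()'s parameter order.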
g = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.components.Dropdown(choices=model_choices, label="Model", value=model_choices[0], interactive=True),
        gr.components.Textbox(lines=2, label="System Prompt", value="You are a sentient, super intelligent AI developed by a man named Locutusque."),
        gr.components.Slider(minimum=0, maximum=2, value=0.8, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=0.95, label="Top p"),
        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Min P"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=15, label="Top k"),
        gr.components.Slider(minimum=1, maximum=8192, step=1, value=1024, label="Max tokens"),
    ],
    title="Locutusque's Language Models",
    description="Try out Locutusque's language models here! Credit goes to Mediocreatmybest for this space. You may also find some experimental preview models that have not been made public here.",
)
if __name__ == "__main__":
    g.launch()
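
# Local usage sketch (assumptions: a CUDA GPU, the `spaces` package installed,
# and a Hugging Face token exported in the `token` env var read above):
#   token=hf_... python app.py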