File size: 3,381 Bytes
6f3f07e
3f2900f
f055cf8
3f2900f
f055cf8
2d26215
7462772
cf082dc
f055cf8
 
eb1daf1
b87f04a
b6f59a7
6615fb0
b87f04a
6d09328
b87f04a
 
a184d8d
b87f04a
 
 
6615fb0
f055cf8
 
 
 
 
a273772
 
 
 
6615fb0
f055cf8
6615fb0
f055cf8
 
 
 
 
 
 
 
 
6615fb0
 
 
 
 
 
 
 
 
 
 
 
8bd462e
6615fb0
 
f055cf8
7462772
6df9214
f055cf8
6615fb0
b87f04a
6615fb0
f055cf8
 
bfe628d
b87f04a
a184d8d
634313c
f055cf8
3f2900f
80f2b5c
acc3ae4
3f2900f
6615fb0
f055cf8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import spaces
import gradio as gr
from transformers import pipeline, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread
import os

@spaces.GPU()
def load_model(model_name):
    """Build a text-generation pipeline for ``model_name`` on the GPU.

    Args:
        model_name: Model identifier handed to ``transformers.pipeline`` as
            ``model`` (the app passes Hugging Face Hub repository ids).

    Returns:
        The object returned by ``transformers.pipeline("text-generation", ...)``,
        requested with ``device_map="cuda"`` and ``torch.bfloat16`` weights.
    """
    return pipeline(
        "text-generation",
        model=model_name,
        device_map="cuda",
        torch_dtype=torch.bfloat16,
        # NOTE(review): trust_remote_code allows code shipped in the model
        # repository to run; keep the set of selectable models trusted.
        trust_remote_code=True,
        # .get() instead of [...]: an unset ``token`` variable becomes
        # ``token=None`` rather than a bare KeyError, so the library's default
        # authentication applies and public models can still be loaded.
        token=os.environ.get("token"),
        use_fast=True,
    )
def _build_chatml_prompt(system, history, message):
    """Render a conversation as a ChatML prompt ending in an open assistant turn."""
    parts = [f"<|im_start|>system\n{system}<|im_end|>\n"]
    for user_turn, assistant_turn in history:
        parts.append(
            f"<|im_start|>user\n{user_turn}<|im_end|>\n"
            f"<|im_start|>assistant\n{assistant_turn}<|im_end|>\n"
        )
    parts.append(f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n")
    return "".join(parts)


@spaces.GPU(duration=45)
def generate(
    message,
    history,
    model_name,
    system,
    temperature=0.4,
    top_p=0.95,
    min_p=0.1,
    top_k=50,
    max_new_tokens=256,
):
    """Stream a chat completion for ``message`` from the selected model.

    Args:
        message: The new user message.
        history: Iterable of ``(user_turn, assistant_turn)`` pairs for the
            earlier turns of the conversation.
        model_name: Model identifier used to load the pipeline and tokenizer.
        system: System prompt placed at the start of the ChatML prompt.
        temperature: Sampling temperature. Values <= 0 switch to greedy
            decoding (``do_sample=False``) instead of sampling.
        top_p: Nucleus-sampling threshold (sampling mode only).
        min_p: Min-p sampling threshold (sampling mode only).
        top_k: Top-k sampling cutoff (sampling mode only).
        max_new_tokens: Upper bound on the number of generated tokens.

    Yields:
        The text generated so far, growing with every streamed chunk. If
        anything fails, the final yielded value is the fixed string
        ``"An error occurred during generation."``.
    """
    # Bound before the ``try`` so every handler below can safely read it.
    outputs = []
    try:
        pipe = load_model(model_name)
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            token=os.environ.get("token"),
        )
        # The prompt uses ChatML markers, so the end-of-turn marker is set as EOS.
        tokenizer.eos_token = "<|im_end|>"
        print(tokenizer)
        pipe.tokenizer = tokenizer

        prompt = _build_chatml_prompt(system, history, message)

        streamer = TextIteratorStreamer(pipe.tokenizer, timeout=240.0, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = dict(
            text_inputs=prompt,
            streamer=streamer,
            max_new_tokens=int(max_new_tokens),
            num_beams=1,
            repetition_penalty=1.1,
        )

        # UI sliders may deliver ints (e.g. 0 or 2); normalise before use.
        temperature = float(temperature)
        if temperature > 0:
            generation_kwargs.update(
                do_sample=True,
                top_p=float(top_p),
                min_p=float(min_p),
                top_k=int(top_k),
                temperature=temperature,
            )
        else:
            # Sampling needs a strictly positive temperature; treat 0 as a
            # request for deterministic (greedy) decoding.
            generation_kwargs.update(do_sample=False)

        worker_errors = []

        def _run_pipeline():
            """Run the pipeline; on failure record the error and close the stream."""
            try:
                pipe(**generation_kwargs)
            except Exception as exc:
                worker_errors.append(exc)
                # Unblock the consumer now instead of letting it wait for the
                # streamer timeout.
                streamer.end()

        t = Thread(target=_run_pipeline)
        t.start()

        for chunk in streamer:
            outputs.append(chunk)
            yield "".join(outputs)

        t.join()
        if worker_errors:
            # Re-raise in this thread so the generic handler below reports it.
            raise worker_errors[0]
    except StopAsyncIteration:
        # Kept from the original for backward compatibility.
        print("Stream stopped unexpectedly.")
        yield "".join(outputs)
    except Exception as e:
        print(f"An error occurred: {e}")
        yield "An error occurred during generation."

# Models offered in the "Model" dropdown; the first entry is the default.
model_choices = [
    "Locutusque/Hercules-6.1-Llama-3.1-8B",
    "Locutusque/liberalis-cogitator-mistral-nemo-2407",
    "Locutusque/liberalis-cogitator-smollm3-3b",
    "Locutusque/lmsys-llama-3.2-3b",
    "Locutusque/CollectiveLM-Falcon-3-7B",
    "Locutusque/StockQwen-2.5-7B",
]

# What are the best options?
# The additional inputs are passed to ``generate`` positionally after
# (message, history): model_name, system, temperature, top_p, min_p, top_k,
# max_new_tokens.
g = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.components.Dropdown(
            choices=model_choices,
            label="Model",
            value=model_choices[0],
            interactive=True,
        ),
        gr.components.Textbox(
            lines=2,
            label="System Prompt",
            value="You are a sentient, super intelligent AI developed by a man named Locutusque.",
        ),
        gr.components.Slider(
            minimum=0,
            maximum=2,
            value=0.8,
            label="Temperature",
        ),
        gr.components.Slider(
            minimum=0,
            maximum=1,
            value=0.95,
            label="Top p",
        ),
        gr.components.Slider(
            minimum=0,
            maximum=1,
            value=0.1,
            label="Min P",
        ),
        gr.components.Slider(
            minimum=0,
            maximum=100,
            step=1,
            value=15,
            label="Top k",
        ),
        gr.components.Slider(
            minimum=1,
            maximum=8192,
            step=1,
            value=1024,
            label="Max tokens",
        ),
    ],
    title="Locutusque's Language Models",
    description="Try out Locutusque's language models here! Credit goes to Mediocreatmybest for this space. You may also find some experimental preview models that have not been made public here.",
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    g.launch()