import concurrent.futures

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load models and tokenizers
def load_model(name):
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForCausalLM.from_pretrained(name)
    # Define pad token explicitly (some causal LMs ship without one)
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id
    return tokenizer, model.to(device)

tokenizer1, model1 = load_model("Gensyn/Qwen2.5-0.5B-Instruct")
tokenizer2, model2 = load_model("facebook/opt-125m")
# Note: the 7B model needs substantial RAM/VRAM (tens of GB in full precision).
tokenizer3, model3 = load_model("Qwen/Qwen2.5-7B-Instruct")

# Generation function
def generate_response(model, tokenizer, prompt):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=100,  # cap generated tokens; max_length would count the prompt too
        pad_token_id=tokenizer.pad_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Multi-agent handler: query all three models concurrently
def multi_agent_chat(user_input):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(generate_response, model1, tokenizer1, user_input),
            executor.submit(generate_response, model2, tokenizer2, user_input),
            executor.submit(generate_response, model3, tokenizer3, user_input),
        ]
        results = [f.result() for f in futures]
    return results

# Gradio Interface
interface = gr.Interface(
    fn=multi_agent_chat,
    inputs=gr.Textbox(lines=2, placeholder="Ask something..."),
    outputs=[
        gr.Textbox(label="Agent 1 (Gensyn/Qwen2.5-0.5B-Instruct)"),
        gr.Textbox(label="Agent 2 (facebook/opt-125m)"),
        gr.Textbox(label="Agent 3 (Qwen/Qwen2.5-7B-Instruct)"),
    ],
    title="3-Agent AI Chatbot",
    description="Three GPT-style agents respond to your input in parallel!",
)

interface.launch()
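
# --- Optional improvement (reference sketch, not wired into the app above) ---
# The two Qwen2.5 checkpoints are instruction-tuned, so they tend to respond
# better when the user prompt is wrapped in their chat template. A minimal
# sketch, assuming the tokenizer defines a chat template (Qwen2.5 tokenizers
# do; facebook/opt-125m does not, hence the fallback to the raw prompt):
#
# def format_prompt(tokenizer, user_input):
#     if getattr(tokenizer, "chat_template", None):
#         messages = [{"role": "user", "content": user_input}]
#         return tokenizer.apply_chat_template(
#             messages, tokenize=False, add_generation_prompt=True
#         )
#     return user_input
#
# To use it, call `prompt = format_prompt(tokenizer, prompt)` at the top of
# generate_response, before tokenizing.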