import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Generate a completion for the given prompt with the full-precision model
def generate(prompt):
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=200)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

gr.Interface(fn=generate, inputs="text", outputs="text").launch()

# import gradio as gr
# from llama_cpp import Llama
#
# # llama_cpp loads a local .gguf file, so first download the quantized model
# # from the MegaTom/TinyLlama-1.1B-Chat-v1.0-Q4_K_M-GGUF repo on the Hugging Face Hub
# model_path = "path/to/tinyllama-1.1b-chat-q4_k_m.gguf"  # placeholder: use your actual path to the quantized model
#
# # Load the quantized model
# llm = Llama(model_path=model_path)
#
# # Function to generate text using the quantized model
# def generate(prompt):
#     # Generate the response
#     output = llm(prompt, max_tokens=50)
#     return output['choices'][0]['text']
#
# # Set up the Gradio interface
# gr.Interface(fn=generate, inputs="text", outputs="text").launch()
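
# A minimal sketch of skipping the manual download step: recent versions of
# llama-cpp-python (assumption: >= 0.2.x, with huggingface_hub installed)
# provide Llama.from_pretrained, which fetches a GGUF file from a Hub repo
# and loads it directly. The filename pattern below is an assumption about
# how the file in that repo is named; adjust it to the actual .gguf filename.
#
# from llama_cpp import Llama
#
# llm = Llama.from_pretrained(
#     repo_id="MegaTom/TinyLlama-1.1B-Chat-v1.0-Q4_K_M-GGUF",
#     filename="*q4_k_m.gguf",  # assumed glob; should match exactly one file in the repo
# )
#
# output = llm("What is quantization?", max_tokens=50)
# print(output['choices'][0]['text'])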