# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# from peft import PeftModel, PeftConfig

# # Load tokenizer
# tokenizer = AutoTokenizer.from_pretrained(".")

# # Load base model with quantization
# bnb_config = BitsAndBytesConfig(load_in_4bit=True)
# base_model = AutoModelForCausalLM.from_pretrained(
#     "unsloth/Meta-Llama-3.1-8B-bnb-4bit",  # same base you fine-tuned
#     quantization_config=bnb_config,
#     device_map="auto",
# )

# # Load LoRA adapters
# model = PeftModel.from_pretrained(base_model, ".")

# # Create Gradio interface
# def generate_response(prompt):
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)

# gr.Interface(
#     fn=generate_response,
#     inputs=gr.Textbox(label="Enter your instruction"),
#     outputs=gr.Textbox(label="Model response"),
#     title="LLaMA 3 - Fine-tuned Model",
# ).launch()


# Here I changed the model name.

# from transformers import AutoTokenizer, AutoModelForCausalLM
# from peft import PeftModel
# import torch
# import gradio as gr

# # Load base model from HF Hub
# base_model_name = "distilgpt2"
# tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# # Load base model (set torch_dtype if needed)
# model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float16)

# # Load LoRA adapters from local files in the Space
# adapter_path = "./"  # if adapter files are in the repo root; otherwise specify the folder name
# model = PeftModel.from_pretrained(model, adapter_path)
# model.eval()

# def predict(text):
#     inputs = tokenizer(text, return_tensors="pt").to("cpu")  # use "cuda" if a GPU is available
#     outputs = model.generate(**inputs, max_new_tokens=70)
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)

# iface = gr.Interface(fn=predict, inputs="text", outputs="text", title="LoRA Model Demo")
# iface.launch()


# Here is the new code, with the intent to optimize.

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
import gradio as gr

# Load the base model from the HF Hub
base_model_name = "unsloth/Llama-3.2-1B"  # use your own model path or model name
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Pick device and dtype: float16 on GPU, float32 on CPU
# (half-precision ops are not fully supported on CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

# Load the base model
model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=dtype).to(device)

# Load LoRA adapters from local files in the Space
adapter_path = "./"  # if adapter files are in the repo root; otherwise specify the folder name
model = PeftModel.from_pretrained(model, adapter_path)
model.eval()


def predict(text):
    inputs = tokenizer(text, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        pad_token_id=tokenizer.eos_token_id,  # Llama tokenizers define no pad token by default
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


iface = gr.Interface(fn=predict, inputs="text", outputs="text", title="LoRA Model Demo")
iface.launch()
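
# Optional optimization (not enabled here): merging the LoRA weights into the base model
# removes the adapter indirection at inference time. A minimal sketch, assuming the adapter
# loaded via PeftModel above supports merging:
#
# model = model.merge_and_unload()  # returns a plain transformers model with the LoRA weights folded in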