import os
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# Download the GGUF model file from the Hugging Face Hub
MODEL_ID = "large-traversaal/Alif-1.0-8B-Instruct"
MODEL_FILE = "model-Q8_0.gguf"
model_path_file = hf_hub_download(MODEL_ID, filename=MODEL_FILE)
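# Reproducibility sketch (assumption: "<commit-sha>" is a placeholder for a real
# snapshot hash from the model repo). Pinning a revision keeps redeploys stable
# even if the repo's main branch changes:
# model_path_file = hf_hub_download(MODEL_ID, filename=MODEL_FILE, revision="<commit-sha>")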
# Initialize the llama.cpp model
llama = Llama(
    model_path=model_path_file,
    n_gpu_layers=40,  # Number of layers to offload to GPU; adjust to fit VRAM
    n_threads=8,      # CPU threads; match physical cores
    n_batch=512,      # Prompt-processing batch size
    n_ctx=4096,       # Context window size in tokens
    verbose=True      # Enable llama.cpp debug logging
)
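# Minimal CPU-only fallback sketch (assumption: the Space may run without a
# GPU). llama-cpp-python runs entirely on CPU when n_gpu_layers=0. Defined but
# not called by default; swap it in for `llama` above if no GPU is available.
def make_cpu_llama():
    return Llama(
        model_path=model_path_file,
        n_gpu_layers=0,                 # no GPU offload
        n_threads=os.cpu_count() or 4,  # use all available cores
        n_ctx=4096,
    )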
# Stream a completion for one user turn
def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
    # chat_prompt = f"You are an Urdu Chatbot. Write an appropriate response for the given instruction: {message} Response:"
    chat_prompt = f"{system_prompt}\n ### Instruction: {message}\n ### Response:"
    # Note: stopping on "\n" cuts the reply at the first newline, limiting
    # output to a single paragraph.
    response = llama(
        chat_prompt,
        temperature=temperature, max_tokens=max_new_tokens,
        top_k=top_k, top_p=top_p, repeat_penalty=repetition_penalty,
        stop=["Q:", "\n"], echo=False, stream=True,
    )
    text = ""
    for chunk in response:
        content = chunk["choices"][0]["text"]
        if content:
            text += content
            yield text
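# Sketch: the `history` argument is ignored above. One way to fold prior turns
# into the prompt (assumption: Gradio passes history as (user, bot) tuples;
# newer versions may pass {"role": ..., "content": ...} dicts instead).
# Defined but not wired in:
def build_prompt_with_history(message, history, system_prompt):
    turns = "".join(f" ### Instruction: {u}\n ### Response: {b}\n" for u, b in history)
    return f"{system_prompt}\n{turns} ### Instruction: {message}\n ### Response:"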
# Alternative implementation via the chat-completion API (kept for reference,
# currently unused):
# def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
#     """Generates a streaming response from the Llama model."""
#     messages = [
#         {"role": "system", "content": "You are an Urdu Chatbot. Write an appropriate response for the given instruction."},
#     ]
#     # Add history and the current message
#     # for user, bot in history:
#     #     messages.append({"role": "user", "content": user})
#     #     messages.append({"role": "assistant", "content": bot})
#     messages.append({"role": "user", "content": message})
#     response = llama.create_chat_completion(
#         messages=messages,
#         stream=True,
#     )
#     partial_message = ""
#     for part in response:
#         content = part["choices"][0]["delta"].get("content", "")
#         partial_message += content
#         yield partial_message
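# create_chat_completion relies on a chat template. If the GGUF metadata lacks
# one, llama-cpp-python accepts an explicit format at load time (assumption:
# this 8B model follows the Llama 3 template; verify against the model card):
# llama = Llama(model_path=model_path_file, chat_format="llama-3", n_ctx=4096)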
# JavaScript to run on page load
on_load = """
async()=>{ alert("Welcome to the Traversaal Alif 1.0 Chatbot! This is an experimental AI model. Please use responsibly."); }
"""
# Placeholder HTML for the chat area (currently unused: the custom gr.Chatbot
# below is commented out)
placeholder = """
<center><h1>10 Questions</h1><br>Think of a person, place, or thing. I'll ask you 10 yes/no questions to try and guess it.
</center>
"""
# Build the custom chat UI with gr.Blocks
with gr.Blocks(js=on_load, theme=gr.themes.Default()) as demo:
    with gr.Column(scale=1, elem_id="center-content"):
        gr.Markdown(
            """
            <div style="text-align: center;">
                <h1>Alif 1.0 Urdu & English Chatbot 🚀</h1>
                <p>Alif 1.0 8B Instruct is an open-source model with highly advanced multilingual reasoning capabilities. It uses human-refined multilingual synthetic data paired with reasoning to enhance cultural nuance and reasoning capabilities in English and Urdu.</p>
            </div>
            """,
        )
    chat = gr.ChatInterface(
        generate_response,
        # chatbot=gr.Chatbot(placeholder=placeholder),
        # title="🚀 Alif-1.0 Chatbot",
        # description="Urdu AI Chatbot powered by Llama.cpp",
        examples=[
            ["شہر کراچی کے بارے میں بتاؤ"],   # "Tell me about the city of Karachi"
            ["قابل تجدید توانائی کیا ہے؟"],   # "What is renewable energy?"
            ["پاکستان کے بارے میں بتائیں"]    # "Tell me about Pakistan"
        ],
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox(value="You are an Urdu Chatbot. Write an appropriate response for the given instruction in Urdu.", label="System prompt", render=False),
            gr.Slider(0, 1, 0.8, label="Temperature", render=False),
            gr.Slider(128, 4096, 512, label="Max new tokens", render=False),
            gr.Slider(1, 80, 40, step=1, label="Top K sampling", render=False),
            gr.Slider(0, 2, 1.1, label="Repetition penalty", render=False),
            gr.Slider(0, 1, 0.95, label="Top P sampling", render=False),
        ],
    )
demo.queue(max_size=10).launch(share=True)
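# Deployment notes (hedged): on Hugging Face Spaces the public URL comes from
# the platform, so share=True only matters for local runs. Also, the single
# shared Llama instance is not safe for concurrent calls; Gradio's default
# concurrency limit of 1 per event serializes requests, or make it explicit:
# demo.queue(max_size=10, default_concurrency_limit=1).launch()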