import os
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# Download the GGUF model file from the Hugging Face Hub
MODEL_ID = "large-traversaal/Alif-1.0-8B-Instruct"
MODEL_FILE = "model-Q8_0.gguf"
model_path_file = hf_hub_download(MODEL_ID, filename=MODEL_FILE)
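# Reproducibility sketch (assumption: "<commit-sha>" is a placeholder for a real
# snapshot hash from the model repo). Pinning a revision keeps redeploys stable
# even if the repo's main branch changes:
# model_path_file = hf_hub_download(MODEL_ID, filename=MODEL_FILE, revision="<commit-sha>")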
# Initialize the llama.cpp model
llama = Llama(
    model_path=model_path_file,
    n_gpu_layers=40,  # Number of layers to offload to GPU; adjust to fit VRAM
    n_threads=8,      # CPU threads; match physical cores
    n_batch=512,      # Prompt-processing batch size
    n_ctx=4096,       # Context window size in tokens
    verbose=True      # Enable llama.cpp debug logging
)
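# Minimal CPU-only fallback sketch (assumption: the Space may run without a
# GPU). llama-cpp-python runs entirely on CPU when n_gpu_layers=0. Defined but
# not called by default; swap it in for `llama` above if no GPU is available.
def make_cpu_llama():
    return Llama(
        model_path=model_path_file,
        n_gpu_layers=0,                 # no GPU offload
        n_threads=os.cpu_count() or 4,  # use all available cores
        n_ctx=4096,
    )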
# Stream a completion for one user turn
def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
    # chat_prompt = f"You are an Urdu Chatbot. Write an appropriate response for the given instruction: {message} Response:"
    chat_prompt = f"{system_prompt}\n ### Instruction: {message}\n ### Response:"
    # Note: stopping on "\n" cuts the reply at the first newline, limiting
    # output to a single paragraph.
    response = llama(
        chat_prompt,
        temperature=temperature, max_tokens=max_new_tokens,
        top_k=top_k, top_p=top_p, repeat_penalty=repetition_penalty,
        stop=["Q:", "\n"], echo=False, stream=True,
    )
    text = ""
    for chunk in response:
        content = chunk["choices"][0]["text"]
        if content:
            text += content
            yield text
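# Sketch: the `history` argument is ignored above. One way to fold prior turns
# into the prompt (assumption: Gradio passes history as (user, bot) tuples;
# newer versions may pass {"role": ..., "content": ...} dicts instead).
# Defined but not wired in:
def build_prompt_with_history(message, history, system_prompt):
    turns = "".join(f" ### Instruction: {u}\n ### Response: {b}\n" for u, b in history)
    return f"{system_prompt}\n{turns} ### Instruction: {message}\n ### Response:"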
# Alternative implementation via the chat-completion API (kept for reference,
# currently unused):
# def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
#     """Generates a streaming response from the Llama model."""
#     messages = [
#         {"role": "system", "content": "You are an Urdu Chatbot. Write an appropriate response for the given instruction."},
#     ]
#     # Add history and the current message
#     # for user, bot in history:
#     #     messages.append({"role": "user", "content": user})
#     #     messages.append({"role": "assistant", "content": bot})
#     messages.append({"role": "user", "content": message})
#     response = llama.create_chat_completion(
#         messages=messages,
#         stream=True,
#     )
#     partial_message = ""
#     for part in response:
#         content = part["choices"][0]["delta"].get("content", "")
#         partial_message += content
#         yield partial_message
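# create_chat_completion relies on a chat template. If the GGUF metadata lacks
# one, llama-cpp-python accepts an explicit format at load time (assumption:
# this 8B model follows the Llama 3 template; verify against the model card):
# llama = Llama(model_path=model_path_file, chat_format="llama-3", n_ctx=4096)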
# JavaScript to run on page load
on_load = """
async()=>{ alert("Welcome to the Traversaal Alif 1.0 Chatbot! This is an experimental AI model. Please use responsibly."); }
"""
# Placeholder HTML for the chat area (currently unused: the custom gr.Chatbot
# below is commented out)
placeholder = """
<center><h1>10 Questions</h1><br>Think of a person, place, or thing. I'll ask you 10 yes/no questions to try and guess it.
</center>
"""
# Build the custom chat UI with gr.Blocks
with gr.Blocks(js=on_load, theme=gr.themes.Default()) as demo:
    with gr.Column(scale=1, elem_id="center-content"):
        gr.Markdown(
            """
            <div style="text-align: center;">
                <h1>Alif 1.0 Urdu & English Chatbot 🚀</h1>
                <p>Alif 1.0 8B Instruct is an open-source model with highly advanced multilingual reasoning capabilities. It uses human-refined multilingual synthetic data paired with reasoning to enhance cultural nuance and reasoning capabilities in English and Urdu.</p>
            </div>
            """,
        )
    chat = gr.ChatInterface(
        generate_response,
        # chatbot=gr.Chatbot(placeholder=placeholder),
        # title="🚀 Alif-1.0 Chatbot",
        # description="Urdu AI Chatbot powered by Llama.cpp",
        examples=[
            ["شہر کراچی کے بارے میں بتاؤ"],   # "Tell me about the city of Karachi"
            ["قابل تجدید توانائی کیا ہے؟"],   # "What is renewable energy?"
            ["پاکستان کے بارے میں بتائیں"]    # "Tell me about Pakistan"
        ],
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox(value="You are an Urdu Chatbot. Write an appropriate response for the given instruction in Urdu.", label="System prompt", render=False),
            gr.Slider(0, 1, 0.8, label="Temperature", render=False),
            gr.Slider(128, 4096, 512, label="Max new tokens", render=False),
            gr.Slider(1, 80, 40, step=1, label="Top K sampling", render=False),
            gr.Slider(0, 2, 1.1, label="Repetition penalty", render=False),
            gr.Slider(0, 1, 0.95, label="Top P sampling", render=False),
        ],
    )
demo.queue(max_size=10).launch(share=True)
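# Deployment notes (hedged): on Hugging Face Spaces the public URL comes from
# the platform, so share=True only matters for local runs. Also, the single
# shared Llama instance is not safe for concurrent calls; Gradio's default
# concurrency limit of 1 per event serializes requests, or make it explicit:
# demo.queue(max_size=10, default_concurrency_limit=1).launch()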