Spaces:

robiro
/

k8o1

Running

App Files Files Community

k8o1 / app.py

robiro

Create app.py

0b70ac0 verified 13 days ago

raw

history blame

5.03 kB

	import gradio as gr
	from llama_cpp import Llama
	from huggingface_hub import hf_hub_download
	import os

	# --- Configuration ---
	# Hugging Face repository that hosts the GGUF builds of the model; it is
	# passed as `repo_id` to hf_hub_download further down in this file.
	MODEL_NAME_OR_PATH = "unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF"
	# Name of the single GGUF file to use from that repository. The same string
	# serves as the download `filename` and as the local `model_path` handed to
	# Llama, so it must match an entry under the repo's "Files and versions" tab
	# exactly. A Q4_K_M quant is assumed to exist under this name -- verify it
	# on Hugging Face and replace the value if needed.
	MODEL_FILE = "DeepSeek-R1-0528-Qwen3-8B-Q4_K_M.gguf" # MAKE SURE THIS FILENAME IS CORRECT on HF

	# Download the model file if it doesn't exist.
	# The file is looked up (and stored) in the current working directory, so a
	# later start that finds it there skips the download entirely.
	if os.path.exists(MODEL_FILE):
	    print(f"Model file {MODEL_FILE} already exists.")
	else:
	    print(f"Downloading {MODEL_FILE} from {MODEL_NAME_OR_PATH}...")
	    try:
	        hf_hub_download(
	            repo_id=MODEL_NAME_OR_PATH,
	            filename=MODEL_FILE,
	            local_dir=".",  # Download to current directory
	            # Request a real file rather than a symlink into the HF cache.
	            # NOTE(review): recent huggingface_hub releases appear to treat
	            # this argument as deprecated -- confirm before removing it.
	            local_dir_use_symlinks=False,
	        )
	    except Exception as e:
	        print(f"Error downloading model: {e}")
	        print("Please ensure the MODEL_FILE name is correct and available in the repository.")
	        # The app cannot run without the model. Exit with a non-zero status
	        # so the failure is visible to whatever launched the script; the
	        # site-provided exit() would have reported success (status 0) and
	        # is not guaranteed to exist outside interactive use.
	        raise SystemExit(1)
	    else:
	        print("Download complete.")

	# --- Load the GGUF Model ---
	# n_gpu_layers controls GPU offloading: 0 keeps everything on the CPU, -1
	# offloads all possible layers. Only change it if your llama-cpp-python
	# build has GPU support.
	print("Loading model...")
	try:
	    llm = Llama(
	        model_path=MODEL_FILE,
	        n_ctx=2048,  # Context window size
	        n_threads=None,  # None lets llama.cpp auto-detect; or set a specific number
	        n_gpu_layers=0,  # CPU only; use -1 or a positive number to offload layers
	    )
	except Exception as e:
	    print(f"Error loading Llama model: {e}")
	    print("Ensure llama-cpp-python is installed correctly and the model file is valid.")
	    # Nothing below can work without a loaded model. Exit with a non-zero
	    # status instead of the site-provided exit(), which would report
	    # success (status 0) and is not meant for use in programs.
	    raise SystemExit(1)
	else:
	    print("Model loaded successfully.")

	# --- Chat Function ---
	def _history_to_messages(message, history):
	    """Return the chat history plus the new user message as role/content dicts."""
	    messages = []
	    for turn in history or []:
	        if isinstance(turn, dict):
	            # History already in role/content form: copy just those two keys.
	            role = turn.get("role")
	            content = turn.get("content")
	            if role and content is not None:
	                messages.append({"role": role, "content": content})
	        else:
	            # Pair form: (user_text, assistant_text). Skip a missing half
	            # instead of sending a message whose content is None.
	            human, ai = turn
	            if human is not None:
	                messages.append({"role": "user", "content": human})
	            if ai is not None:
	                messages.append({"role": "assistant", "content": ai})
	    messages.append({"role": "user", "content": message})
	    return messages


	def _build_chatml_prompt(messages):
	    """Render messages as a ChatML-style prompt ending in an open assistant turn."""
	    # NOTE(review): this assumes the model follows the <|im_start|>/<|im_end|>
	    # convention used by Qwen-family templates -- adjust if the GGUF differs.
	    parts = [
	        f"<|im_start|>{entry['role']}\n{entry['content']}<|im_end|>\n"
	        for entry in messages
	    ]
	    parts.append("<|im_start|>assistant\n")  # Start of assistant's turn
	    return "".join(parts)


	def predict(message, history):
	    """Generate the assistant's reply for one chat turn.

	    Args:
	        message: The newest user message.
	        history: Earlier turns, either as (user, assistant) pairs or as
	            dicts with "role" and "content" keys. May be empty or None.

	    Returns:
	        The reply text. If both the chat-completion call and the raw-prompt
	        fallback fail, a fixed apology string is returned instead of raising.
	    """
	    messages = _history_to_messages(message, history)

	    # Preferred path: let llama-cpp-python apply the model's own chat
	    # template. Sampling options such as temperature, top_p or max_tokens
	    # can be passed to this call as well.
	    try:
	        response = llm.create_chat_completion(messages=messages)
	        return response['choices'][0]['message']['content']
	    except Exception as e:
	        print(f"Error during model inference: {e}")

	    # Fallback: build the prompt by hand and run plain text completion.
	    prompt = _build_chatml_prompt(messages)
	    try:
	        output = llm(
	            prompt,
	            max_tokens=256,
	            stop=["<|im_end|>", "<|im_start|>user"],  # Stop generation at these tokens
	            echo=False,  # Don't echo the prompt
	        )
	        return output['choices'][0]['text'].strip()
	    except Exception as e_fallback:
	        print(f"Error during fallback model inference: {e_fallback}")
	        return "Sorry, I encountered an error during fallback."

	# --- Gradio Interface ---
	# Starter prompts shown in the UI; each example is wrapped in its own
	# single-item list.
	_starter_prompts = [
	    [text]
	    for text in (
	        "Hello, how are you?",
	        "What is the capital of France?",
	        "Write a short story about a friendly robot.",
	    )
	]

	# Chat display component, 600 units tall.
	_chat_panel = gr.Chatbot(height=600)

	# Wire the predict callback into a ready-made chat UI.
	iface = gr.ChatInterface(
	    fn=predict,
	    title="Unsloth DeepSeek-Qwen3-8B GGUF Chat",
	    description="Chat with the unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF model.",
	    examples=_starter_prompts,
	    chatbot=_chat_panel,
	)

	# --- Launch the App ---
	# Start the web server only when this file is run as a script, not when it
	# is imported.
	if __name__ == "__main__":
	    print("Launching Gradio interface...")
	    iface.launch()