import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
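
# Requires: pip install gradio llama-cpp-python huggingface_hub
# (llama-cpp-python builds llama.cpp from source on install unless a prebuilt wheel
# matches your platform, so a C/C++ toolchain may be needed.)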

# --- Configuration ---
MODEL_NAME_OR_PATH = "unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF"
# Select a specific GGUF file. Check the "Files and versions" tab of the Hugging Face
# repo for the exact filenames available; a Q4_K_M quant is a reasonable default for an
# 8B model. Replace the value below if the file you want is named differently.
MODEL_FILE = "DeepSeek-R1-0528-Qwen3-8B-Q4_K_M.gguf"  # Must match a filename in the repo exactly
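
# If you're unsure of the exact filename, you can list the GGUF files in the repo first.
# A minimal sketch using huggingface_hub's list_repo_files (uncomment to run):
# from huggingface_hub import list_repo_files
# print([f for f in list_repo_files(MODEL_NAME_OR_PATH) if f.endswith(".gguf")])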

# Download the model file if it doesn't exist
if not os.path.exists(MODEL_FILE):
    print(f"Downloading {MODEL_FILE} from {MODEL_NAME_OR_PATH}...")
    try:
        hf_hub_download(
            repo_id=MODEL_NAME_OR_PATH,
            filename=MODEL_FILE,
            local_dir=".", # Download to current directory
            local_dir_use_symlinks=False # Deprecated/ignored on recent huggingface_hub; ensures a real file (not a symlink) on older versions
        )
        print("Download complete.")
    except Exception as e:
        print(f"Error downloading model: {e}")
        print("Please ensure the MODEL_FILE name is correct and available in the repository.")
        raise SystemExit(1)  # Abort: the app cannot run without the model file
else:
    print(f"Model file {MODEL_FILE} already exists.")

# --- Load the GGUF Model ---
# Adjust n_gpu_layers if you have a GPU-enabled llama-cpp-python
# -1 means all possible layers to GPU, 0 means CPU only.
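# Note: GPU offload only takes effect if llama-cpp-python itself was built with GPU
# support, e.g. installed with CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python
# (older releases used -DLLAMA_CUBLAS=on). With a CPU-only build, n_gpu_layers has no effect.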
try:
    print("Loading model...")
    llm = Llama(
        model_path=MODEL_FILE,
        n_ctx=2048,        # Context window size
        n_threads=None,    # None for llama.cpp to auto-detect, or set a specific number
        n_gpu_layers=0     # Change to -1 or a positive number if you have GPU support
                           # and want to offload layers to GPU.
    )
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading Llama model: {e}")
    print("Ensure llama-cpp-python is installed correctly and the model file is valid.")
    raise SystemExit(1)  # Abort: the model could not be loaded
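
# Optional smoke test (uncomment to confirm generation works before launching the UI):
# test = llm.create_chat_completion(
#     messages=[{"role": "user", "content": "Say hello in one sentence."}],
#     max_tokens=32,
# )
# print(test["choices"][0]["message"]["content"])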

# --- Chat Function ---
def predict(message, history):
    history_llama_format = []
    for human, ai in history:
        history_llama_format.append({"role": "user", "content": human})
        history_llama_format.append({"role": "assistant", "content": ai})
    history_llama_format.append({"role": "user", "content": message})

    # Qwen-family models use a specific chat template. GGUF files usually embed the
    # template in their metadata, and llama-cpp-python's create_chat_completion() applies
    # it automatically; otherwise the prompt has to be constructed manually, e.g.:
    # prompt = f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

    # Prefer create_chat_completion(), which uses the embedded template when available.
    try:
        response = llm.create_chat_completion(
            messages=history_llama_format,
            # temperature=0.7, # Example: Adjust for creativity
            # top_p=0.9,       # Example: Nucleus sampling
            # max_tokens=256   # Max tokens to generate for the response
        )
        assistant_response = response['choices'][0]['message']['content']
    except Exception as e:
        print(f"Error during model inference: {e}")
        assistant_response = "Sorry, I encountered an error."
        # Fall back to raw completion with a hand-built ChatML-style prompt if
        # create_chat_completion() fails for this GGUF. This basic construction may need
        # adjustment to match the model's exact chat format.
        prompt = ""
        for entry in history_llama_format:
            if entry["role"] == "user":
                prompt += f"<|im_start|>user\n{entry['content']}<|im_end|>\n"
            elif entry["role"] == "assistant":
                prompt += f"<|im_start|>assistant\n{entry['content']}<|im_end|>\n"
        prompt += "<|im_start|>assistant\n" # Start of assistant's turn

        try:
            output = llm(
                prompt,
                max_tokens=256,
                stop=["<|im_end|>", "<|im_start|>user"], # Stop generation at these tokens
                echo=False # Don't echo the prompt
            )
            assistant_response = output['choices'][0]['text'].strip()
        except Exception as e_fallback:
            print(f"Error during fallback model inference: {e_fallback}")
            assistant_response = "Sorry, I encountered an error during fallback."


    return assistant_response
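
# Optional: a streaming variant of the chat function (a minimal sketch, not wired into
# the interface below). gr.ChatInterface also accepts a generator that yields
# progressively longer strings, and llama-cpp-python's create_chat_completion(stream=True)
# yields OpenAI-style chunks. Pass fn=predict_stream below to try it.
def predict_stream(message, history):
    messages = []
    for human, ai in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": ai})
    messages.append({"role": "user", "content": message})

    partial = ""
    for chunk in llm.create_chat_completion(messages=messages, stream=True):
        delta = chunk["choices"][0].get("delta", {})
        if "content" in delta:
            partial += delta["content"]
            yield partial  # Gradio replaces the displayed reply with each yielded value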

# --- Gradio Interface ---
iface = gr.ChatInterface(
    fn=predict,
    title="Unsloth DeepSeek-Qwen3-8B GGUF Chat",
    description="Chat with the unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF model.",
    examples=[
        ["Hello, how are you?"],
        ["What is the capital of France?"],
        ["Write a short story about a friendly robot."]
    ],
    chatbot=gr.Chatbot(height=600)
)

# --- Launch the App ---
if __name__ == "__main__":
    print("Launching Gradio interface...")
    iface.launch()
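    # Tip: iface.launch(share=True) creates a temporary public link, and
    # launch(server_name="0.0.0.0") exposes the app on your local network
    # (both are standard Gradio launch() options).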