Create app.py
app.py
ADDED
@@ -0,0 +1,106 @@
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os

# --- Model Configuration ---
# The Hugging Face model repository ID
MODEL_REPO_ID = "mradermacher/Sam-reason-v3-GGUF"
# The specific GGUF filename within that repository
MODEL_FILENAME = "Sam-reason-v3.Q4_K_M.gguf"
# Maximum context window for the model (how much text it can 'remember').
# Adjust this based on your needs and available memory.
N_CTX = 2048
# Maximum number of tokens the model will generate in a single response
MAX_TOKENS = 500
# Temperature for generation: higher values (e.g., 0.8-1.0) make output more random,
# lower values (e.g., 0.2-0.5) make it more focused.
TEMPERATURE = 0.7
# Top-p sampling: controls diversity. Lower values focus on more probable tokens.
TOP_P = 0.9
# Stop sequences: the model will stop generating when it encounters any of these strings.
# This prevents it from generating further turns or excessive boilerplate.
STOP_SEQUENCES = ["USER:", "\n\n"]

# --- Model Loading ---
print(f"Downloading model: {MODEL_FILENAME} from {MODEL_REPO_ID}...")
try:
    # Download the GGUF model file from Hugging Face Hub
    model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)
    print(f"Model downloaded to: {model_path}")
except Exception as e:
    print(f"Error downloading model: {e}")
    # Exit or handle the error appropriately if the model can't be downloaded
    exit(1)

print("Initializing Llama model (this may take a moment)...")
try:
    # Initialize the Llama model.
    # n_gpu_layers=0 ensures the model runs entirely on the CPU,
    # which is necessary for the free tier on Hugging Face Spaces.
    llm = Llama(
        model_path=model_path,
        n_gpu_layers=0,   # Force CPU usage
        n_ctx=N_CTX,      # Set context window size
        verbose=False,    # Suppress llama_cpp verbose output
    )
    print("Llama model initialized successfully.")
except Exception as e:
    print(f"Error initializing Llama model: {e}")
    exit(1)

# --- Inference Function ---
def generate_word_by_word(prompt_text: str):
    """
    Generates text from the LLM word by word (or token by token) and yields the output.
    This provides a streaming experience in the Gradio UI and for API calls.
    """
    # Define the prompt template. This model does not specify a strict chat format,
    # so a simple instruction-following format is used.
    formatted_prompt = f"USER: {prompt_text}\nASSISTANT:"

    print(f"Starting generation for prompt: '{prompt_text[:50]}...'")
    output_tokens = []
    try:
        # Use the create_completion method with stream=True for token-by-token generation
        for chunk in llm.create_completion(
            formatted_prompt,
            max_tokens=MAX_TOKENS,
            stop=STOP_SEQUENCES,
            stream=True,
            temperature=TEMPERATURE,
            top_p=TOP_P,
        ):
            token = chunk["choices"][0]["text"]
            output_tokens.append(token)
            # Yield the accumulated text to update the UI/API response in real time
            yield "".join(output_tokens)
    except Exception as e:
        print(f"Error during text generation: {e}")
        yield f"An error occurred during generation: {e}"

# --- Gradio Interface ---
# Create the Gradio Interface for the web UI and API endpoint
iface = gr.Interface(
    fn=generate_word_by_word,
    inputs=gr.Textbox(
        lines=5,
        label="Enter your prompt here:",
        placeholder="e.g., Explain the concept of quantum entanglement in simple terms.",
    ),
    outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
    title="SmilyAI: Sam-reason-v3-GGUF Word-by-Word Inference (CPU)",
    description=(
        "Enter a prompt and get a word-by-word response from the "
        "Sam-reason-v3-GGUF model, running on Hugging Face Spaces' free CPU tier. "
        "The response will stream as it's generated."
    ),
    live=True,               # Enable live streaming updates in the UI
    api_name="predict",      # Expose this function as a REST API endpoint
    theme=gr.themes.Soft(),  # A modern, soft theme for better aesthetics
)

# Launch the Gradio application
if __name__ == "__main__":
    print("Launching Gradio app...")
    iface.launch(server_name="0.0.0.0", server_port=7860)  # Standard host/port for HF Spaces