boning123 committed on
Commit f38ab88 · verified · 1 Parent(s): 73532f5

Create app.py

Files changed (1)
  1. app.py +106 -0
app.py ADDED
@@ -0,0 +1,106 @@
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import sys

# --- Model Configuration ---
# The Hugging Face model repository ID
MODEL_REPO_ID = "mradermacher/Sam-reason-v3-GGUF"
# The specific GGUF filename within that repository
MODEL_FILENAME = "Sam-reason-v3.Q4_K_M.gguf"
# Maximum context window for the model (how much text it can 'remember').
# Adjust this based on your needs and available memory.
N_CTX = 2048
# Maximum number of tokens the model will generate in a single response
MAX_TOKENS = 500
# Temperature for generation: higher values (e.g., 0.8-1.0) make output more random;
# lower values (e.g., 0.2-0.5) make it more focused.
TEMPERATURE = 0.7
# Top-p sampling: controls diversity. Lower values focus on more probable tokens.
TOP_P = 0.9
# Stop sequences: the model stops generating when it encounters any of these strings.
# This prevents it from generating further conversation turns or excessive boilerplate.
STOP_SEQUENCES = ["USER:", "\n\n"]
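# Note: because "\n\n" is a stop sequence, generation halts at the first blank
# line, which keeps answers to a single paragraph; remove it from the list if
# multi-paragraph responses are wanted.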

# --- Model Loading ---
print(f"Downloading model: {MODEL_FILENAME} from {MODEL_REPO_ID}...")
try:
    # Download the GGUF model file from Hugging Face Hub
    model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)
    print(f"Model downloaded to: {model_path}")
except Exception as e:
    print(f"Error downloading model: {e}")
    # Exit if the model can't be downloaded; nothing below can run without it
    sys.exit(1)

print("Initializing Llama model (this may take a moment)...")
try:
    # Initialize the Llama model.
    # n_gpu_layers=0 ensures the model runs entirely on the CPU,
    # which is necessary for the free tier on Hugging Face Spaces.
    llm = Llama(
        model_path=model_path,
        n_gpu_layers=0,  # Force CPU usage
        n_ctx=N_CTX,     # Set context window size
        verbose=False,   # Suppress llama_cpp verbose output
    )
    print("Llama model initialized successfully.")
except Exception as e:
    print(f"Error initializing Llama model: {e}")
    sys.exit(1)
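
# Optional tuning idea (an assumption, not part of the original setup):
# llama-cpp-python's Llama() also accepts an n_threads argument; on multi-core
# CPUs, setting it to the number of physical cores can speed up inference.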

# --- Inference Function ---
def generate_word_by_word(prompt_text: str):
    """
    Generates text from the LLM token by token and yields the accumulated output.
    This provides a streaming experience in the Gradio UI and for API calls.
    """
    # Define the prompt template. This model does not specify a strict chat format,
    # so a simple instruction-following format is used.
    formatted_prompt = f"USER: {prompt_text}\nASSISTANT:"

    print(f"Starting generation for prompt: '{prompt_text[:50]}...'")
    output_tokens = []
    try:
        # Use the create_completion method with stream=True for token-by-token generation
        for chunk in llm.create_completion(
            formatted_prompt,
            max_tokens=MAX_TOKENS,
            stop=STOP_SEQUENCES,
            stream=True,
            temperature=TEMPERATURE,
            top_p=TOP_P,
        ):
            token = chunk["choices"][0]["text"]
            output_tokens.append(token)
            # Yield the accumulated text to update the UI/API response in real time
            yield "".join(output_tokens)
    except Exception as e:
        print(f"Error during text generation: {e}")
        yield f"An error occurred during generation: {e}"

# --- Gradio Interface ---
# Create the Gradio Interface for the web UI and the API endpoint. Because
# generate_word_by_word is a generator, Gradio streams its output to the UI
# automatically (no live=True needed; that would re-run generation on every
# keystroke).
iface = gr.Interface(
    fn=generate_word_by_word,
    inputs=gr.Textbox(
        lines=5,
        label="Enter your prompt here:",
        placeholder="e.g., Explain the concept of quantum entanglement in simple terms.",
    ),
    outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
    title="SmilyAI: Sam-reason-v3-GGUF Word-by-Word Inference (CPU)",
    description=(
        "Enter a prompt and get a word-by-word response from the "
        "Sam-reason-v3-GGUF model, running on Hugging Face Spaces' free CPU tier. "
        "The response will stream as it's generated."
    ),
    api_name="predict",      # Expose this function as the /predict API endpoint
    theme=gr.themes.Soft(),  # A modern, soft theme for better aesthetics
)

# Launch the Gradio application
if __name__ == "__main__":
    print("Launching Gradio app...")
    # 0.0.0.0:7860 is the standard host and port for Hugging Face Spaces
    iface.launch(server_name="0.0.0.0", server_port=7860)
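
Usage note: with api_name="predict", the Space exposes a streaming endpoint that
can be called from Python via gradio_client. A minimal sketch, assuming the app
is deployed as a public Space (the Space ID below is a placeholder, not the real
one):

from gradio_client import Client

# Placeholder Space ID; substitute the actual "username/space-name".
client = Client("boning123/sam-reason-v3-demo")

# submit() returns a Job; iterating it yields each partial string that
# generate_word_by_word() streams, ending with the full response.
job = client.submit(
    "Explain the concept of quantum entanglement in simple terms.",
    api_name="/predict",
)
for partial in job:
    print(partial)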