Create app.py
app.py
ADDED
@@ -0,0 +1,106 @@
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os

# --- Model Configuration ---
# The Hugging Face model repository ID
MODEL_REPO_ID = "mradermacher/Sam-reason-v3-GGUF"
# The specific GGUF filename within that repository
MODEL_FILENAME = "Sam-reason-v3.Q4_K_M.gguf"
# Maximum context window for the model (how much text it can 'remember').
# Adjust this based on your needs and available memory.
N_CTX = 2048
# Maximum number of tokens the model will generate in a single response
MAX_TOKENS = 500
# Temperature for generation: higher values (e.g., 0.8-1.0) make output more random,
# lower values (e.g., 0.2-0.5) make it more focused.
TEMPERATURE = 0.7
# Top-p sampling: controls diversity. Lower values focus on more probable tokens.
TOP_P = 0.9
# Stop sequences: the model will stop generating when it encounters any of these strings.
# This prevents it from generating further turns or excessive boilerplate.
STOP_SEQUENCES = ["USER:", "\n\n"]

# --- Model Loading ---
print(f"Downloading model: {MODEL_FILENAME} from {MODEL_REPO_ID}...")
try:
    # Download the GGUF model file from Hugging Face Hub
    model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)
    print(f"Model downloaded to: {model_path}")
except Exception as e:
    print(f"Error downloading model: {e}")
    # Exit or handle the error appropriately if the model can't be downloaded
    exit(1)

print("Initializing Llama model (this may take a moment)...")
try:
    # Initialize the Llama model.
    # n_gpu_layers=0 ensures the model runs entirely on the CPU,
    # which is necessary for the free tier on Hugging Face Spaces.
    llm = Llama(
        model_path=model_path,
        n_gpu_layers=0,   # Force CPU usage
        n_ctx=N_CTX,      # Set context window size
        verbose=False,    # Suppress llama_cpp verbose output
    )
    print("Llama model initialized successfully.")
except Exception as e:
    print(f"Error initializing Llama model: {e}")
    exit(1)

# --- Inference Function ---
def generate_word_by_word(prompt_text: str):
    """
    Generates text from the LLM word by word (or token by token) and yields the output.
    This provides a streaming experience in the Gradio UI and for API calls.
    """
    # Define the prompt template. This model does not specify a strict chat format,
    # so a simple instruction-following format is used.
    formatted_prompt = f"USER: {prompt_text}\nASSISTANT:"

    print(f"Starting generation for prompt: '{prompt_text[:50]}...'")
    output_tokens = []
    try:
        # Use the create_completion method with stream=True for token-by-token generation
        for chunk in llm.create_completion(
            formatted_prompt,
            max_tokens=MAX_TOKENS,
            stop=STOP_SEQUENCES,
            stream=True,
            temperature=TEMPERATURE,
            top_p=TOP_P,
        ):
            token = chunk["choices"][0]["text"]
            output_tokens.append(token)
            # Yield the accumulated text to update the UI/API response in real time
            yield "".join(output_tokens)
    except Exception as e:
        print(f"Error during text generation: {e}")
        yield f"An error occurred during generation: {e}"

# --- Gradio Interface ---
# Create the Gradio Interface for the web UI and API endpoint
iface = gr.Interface(
    fn=generate_word_by_word,
    inputs=gr.Textbox(
        lines=5,
        label="Enter your prompt here:",
        placeholder="e.g., Explain the concept of quantum entanglement in simple terms.",
    ),
    outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
    title="SmilyAI: Sam-reason-v3-GGUF Word-by-Word Inference (CPU)",
    description=(
        "Enter a prompt and get a word-by-word response from the "
        "Sam-reason-v3-GGUF model, running on Hugging Face Spaces' free CPU tier. "
        "The response will stream as it's generated."
    ),
    live=True,               # Enable live streaming updates in the UI
    api_name="predict",      # Expose this function as a REST API endpoint
    theme=gr.themes.Soft(),  # A modern, soft theme for better aesthetics
)

# Launch the Gradio application
if __name__ == "__main__":
    print("Launching Gradio app...")
    iface.launch(server_name="0.0.0.0", server_port=7860)  # Standard host/port for HF Spaces