import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os

# --- Model Configuration ---
# The Hugging Face model repository ID
MODEL_REPO_ID = "mradermacher/Sam-reason-v3-GGUF"
# The specific GGUF filename within that repository
MODEL_FILENAME = "Sam-reason-v3.Q4_K_M.gguf"
# Maximum context window for the model (how much text it can 'remember')
# Adjust this based on your needs and available memory.
N_CTX = 2048
# Maximum number of tokens the model will generate in a single response
MAX_TOKENS = 500
# Temperature for generation: higher values (e.g., 0.8-1.0) make output more random,
# lower values (e.g., 0.2-0.5) make it more focused.
TEMPERATURE = 0.7
# Top-p sampling: controls diversity. Lower values focus on more probable tokens.
TOP_P = 0.9
# Stop sequences: the model will stop generating when it encounters any of these strings.
# This prevents it from generating further turns or excessive boilerplate.
STOP_SEQUENCES = ["USER:", "\n\n"]

# --- Model Loading ---
print(f"Downloading model: {MODEL_FILENAME} from {MODEL_REPO_ID}...")
try:
    # Download the GGUF model file from Hugging Face Hub
    model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)
    print(f"Model downloaded to: {model_path}")
except Exception as e:
    print(f"Error downloading model: {e}")
    # Exit or handle the error appropriately if the model can't be downloaded
    exit(1)

print("Initializing Llama model (this may take a moment)...")
try:
    # Initialize the Llama model.
    # n_gpu_layers=0 ensures the model runs entirely on the CPU,
    # which is necessary for the free tier on Hugging Face Spaces.
    llm = Llama(
        model_path=model_path,
        n_gpu_layers=0,  # Force CPU usage
        n_ctx=N_CTX,     # Set context window size
        verbose=False,   # Suppress llama_cpp verbose output
    )
    print("Llama model initialized successfully.")
except Exception as e:
    print(f"Error initializing Llama model: {e}")
    exit(1)


# --- Inference Function ---
def generate_word_by_word(prompt_text: str):
    """
    Generates text from the LLM word by word (or token by token) and yields the output.
    This provides a streaming experience in the Gradio UI and for API calls.
    """
    # Define the prompt template. This model does not specify a strict chat format,
    # so a simple instruction-following format is used.
    formatted_prompt = f"USER: {prompt_text}\nASSISTANT:"

    print(f"Starting generation for prompt: '{prompt_text[:50]}...'")
    output_tokens = []
    try:
        # Use the create_completion method with stream=True for token-by-token generation
        for chunk in llm.create_completion(
            formatted_prompt,
            max_tokens=MAX_TOKENS,
            stop=STOP_SEQUENCES,
            stream=True,
            temperature=TEMPERATURE,
            top_p=TOP_P,
        ):
            token = chunk["choices"][0]["text"]
            output_tokens.append(token)
            # Yield the accumulated text to update the UI/API response in real time
            yield "".join(output_tokens)
    except Exception as e:
        print(f"Error during text generation: {e}")
        yield f"An error occurred during generation: {e}"


# --- Gradio Interface ---
# Create the Gradio Interface for the web UI and API endpoint
iface = gr.Interface(
    fn=generate_word_by_word,
    inputs=gr.Textbox(
        lines=5,
        label="Enter your prompt here:",
        placeholder="e.g., Explain the concept of quantum entanglement in simple terms.",
    ),
    outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
    title="SmilyAI: Sam-reason-v3-GGUF Word-by-Word Inference (CPU)",
    description=(
        "Enter a prompt and get a word-by-word response from the "
        "Sam-reason-v3-GGUF model, running on Hugging Face Spaces' free CPU tier. "
        "The response will stream as it's generated."
    ),
    live=True,               # Enable live streaming updates in the UI
    api_name="predict",      # Expose this function as a REST API endpoint
    theme=gr.themes.Soft(),  # A modern, soft theme for better aesthetics
)

# Launch the Gradio application
if __name__ == "__main__":
    print("Launching Gradio app...")
    iface.launch(server_name="0.0.0.0", server_port=7860)  # Standard host/port for HF Spaces