Update app.py
app.py
CHANGED
@@ -1,106 +1,168 @@
Previous version (lines prefixed with `-` were removed in this commit; truncated or hidden lines are noted as such):

```diff
 import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
-import
 
-# --- Model Configuration ---
-# The Hugging Face model repository ID
 MODEL_REPO_ID = "mradermacher/Sam-reason-v3-GGUF"
-# The specific GGUF filename within that repository
 MODEL_FILENAME = "Sam-reason-v3.Q4_K_M.gguf"
-# Maximum context window for the model (how much text it can 'remember')
-# Adjust this based on your needs and available memory.
 N_CTX = 2048
-# Maximum number of tokens the model will generate in a single response
 MAX_TOKENS = 500
-# Temperature for generation: higher values (e.g., 0.8-1.0) make output more random,
-# lower values (e.g., 0.2-0.5) make it more focused.
 TEMPERATURE = 0.7
-# Top-p sampling: controls diversity. Lower values focus on more probable tokens.
 TOP_P = 0.9
-
-# This prevents it from generating further turns or excessive boilerplate.
-STOP_SEQUENCES = ["USER:", "\n\n"]
 
-# ---
 print(f"Downloading model: {MODEL_FILENAME} from {MODEL_REPO_ID}...")
 try:
-    # Download the GGUF model file from Hugging Face Hub
     model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)
     print(f"Model downloaded to: {model_path}")
 except Exception as e:
     print(f"Error downloading model: {e}")
-    # Exit or handle the error appropriately if the model can't be downloaded
     exit(1)
 
 print("Initializing Llama model (this may take a moment)...")
 try:
-    # Initialize the Llama model
-    # n_gpu_layers=0 ensures the model runs entirely on the CPU,
-    # which is necessary for the free tier on Hugging Face Spaces.
     llm = Llama(
         model_path=model_path,
         n_gpu_layers=0,  # Force CPU usage
-        n_ctx=N_CTX,
-        verbose=False
     )
     print("Llama model initialized successfully.")
 except Exception as e:
     print(f"Error initializing Llama model: {e}")
     exit(1)
 
-# --- Inference Function ---
-def
     """
-    Generates text
-    This provides a streaming experience in the Gradio UI and for API calls.
     """
-    # Define the prompt template. This model does not specify a strict chat format,
-    # so a simple instruction-following format is used.
     formatted_prompt = f"USER: {prompt_text}\nASSISTANT:"
 
-    (old lines 62-92 were removed; their content is not visible in this view)
-        description=(
-            "Enter a prompt and get a word-by-word response from the "
-            "Sam-reason-v3-GGUF model, running on Hugging Face Spaces' free CPU tier. "
-            "The response will stream as it's generated."
-        ),
-        live=True,  # Enable live streaming updates in the UI
-        api_name="predict",  # Expose this function as a REST API endpoint
-        theme=gr.themes.Soft(),  # A modern, soft theme for better aesthetics
-    )
 
 # Launch the Gradio application
 if __name__ == "__main__":
     print("Launching Gradio app...")
-    (old line 106 was removed; its content is not visible in this view)
```
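The block removed at old lines 62-92 is hidden in this view. Judging only from the surviving fragment (`description=...`, `live=True`, `api_name="predict"`, `theme=gr.themes.Soft()`), the old UI was most likely a single `gr.Interface` wrapping the streaming generator. The sketch below is a hypothetical reconstruction, not the actual removed code; the function name `generate_word_by_word`, the input/output components, and the title are assumptions.

```python
# Hypothetical reconstruction -- NOT the actual removed code, which this diff view hides.
# Only the kwargs shown in the fragment above are taken from the source.
iface = gr.Interface(
    fn=generate_word_by_word,  # assumed name of the old streaming generator
    inputs=gr.Textbox(lines=5, label="Enter your prompt here:"),  # assumed
    outputs=gr.Textbox(label="Generated Text"),                   # assumed
    description=(
        "Enter a prompt and get a word-by-word response from the "
        "Sam-reason-v3-GGUF model, running on Hugging Face Spaces' free CPU tier. "
        "The response will stream as it's generated."
    ),
    live=True,               # Enable live streaming updates in the UI
    api_name="predict",      # Expose this function as a REST API endpoint
    theme=gr.themes.Soft(),  # A modern, soft theme for better aesthetics
)
```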
Updated version (lines prefixed with `+` were added in this commit):

```diff
 import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
+from transformers import pipeline
+import re  # For sentence splitting
 
+# --- Model Configuration (same as before) ---
 MODEL_REPO_ID = "mradermacher/Sam-reason-v3-GGUF"
 MODEL_FILENAME = "Sam-reason-v3.Q4_K_M.gguf"
 N_CTX = 2048
 MAX_TOKENS = 500
 TEMPERATURE = 0.7
 TOP_P = 0.9
+STOP_SEQUENCES = ["USER:", "\n\n"]  # The model stops generating when it emits one of these
 
+# --- Safety Configuration ---
+# Initialize the toxicity classifier pipeline.
+# This model scores whether text is 'toxic' with a confidence value.
+print("Loading safety model (unitary/toxic-bert)...")
+try:
+    safety_classifier = pipeline(
+        "text-classification",
+        model="unitary/toxic-bert",
+        framework="pt"  # Use the PyTorch backend
+    )
+    print("Safety model loaded successfully.")
+except Exception as e:
+    print(f"Error loading safety model: {e}")
+    # Consider handling this more gracefully, e.g., running without the safety filter if the model fails to load.
+    exit(1)
+
+# Threshold for flagging content as unsafe (0.0 to 1.0).
+# A higher threshold requires more confidence before flagging, so benign content is less likely to be removed.
+TOXICITY_THRESHOLD = 0.9
+
+def is_text_safe(text: str) -> tuple[bool, str | None]:
+    """
+    Checks whether the given text contains unsafe content using the safety classifier.
+    Returns (True, None) if safe, or (False, detected_label) if unsafe.
+    """
+    if not text.strip():
+        return True, None  # Empty strings are safe
+
+    try:
+        # Classify the text. The pipeline returns a list like [{'label': 'toxic', 'score': X.XX}];
+        # for unitary/toxic-bert, 'toxic' is the positive label.
+        results = safety_classifier(text)
+
+        if results and results[0]['label'] == 'toxic' and results[0]['score'] > TOXICITY_THRESHOLD:
+            print(f"Detected unsafe content: '{text.strip()}' (Score: {results[0]['score']:.4f})")
+            return False, results[0]['label']
+
+        return True, None
+
+    except Exception as e:
+        print(f"Error during safety check: {e}")
+        # If the safety check itself fails, treat the text as unsafe by default.
+        # For a more robust solution, you might re-raise or yield an error message instead.
+        return False, "safety_check_failed"
+
```
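A quick way to exercise the helper above (illustrative only: the test sentence is made up and the commented results are placeholders, not real model output):

```python
# Illustrative sanity check of the safety helper; the sentence and the commented
# values are placeholders, not real model output.
print(is_text_safe("The weather is lovely today."))
# -> (True, None) for benign text

print(safety_classifier("The weather is lovely today.")[0])
# -> a dict like {'label': 'toxic', 'score': ...}; the text is only treated as
#    unsafe when the 'toxic' score exceeds TOXICITY_THRESHOLD (0.9 here)
```

The updated file continues with the model loading and the safety-aware inference function.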
```diff
+# --- Main Model Loading (same as before) ---
 print(f"Downloading model: {MODEL_FILENAME} from {MODEL_REPO_ID}...")
 try:
     model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)
     print(f"Model downloaded to: {model_path}")
 except Exception as e:
     print(f"Error downloading model: {e}")
     exit(1)
 
 print("Initializing Llama model (this may take a moment)...")
 try:
     llm = Llama(
         model_path=model_path,
         n_gpu_layers=0,  # Force CPU usage
+        n_ctx=N_CTX,
+        verbose=False
     )
     print("Llama model initialized successfully.")
 except Exception as e:
     print(f"Error initializing Llama model: {e}")
     exit(1)
 
+# --- Inference Function with Safety ---
+def generate_word_by_word_with_safety(prompt_text: str):
     """
+    Streams tokens from the LLM, buffering them into sentences and checking each
+    sentence for safety before yielding it.
     """
     formatted_prompt = f"USER: {prompt_text}\nASSISTANT:"
+    current_sentence_buffer = ""
+    full_output_so_far = ""
+
+    # Stream tokens from the main LLM
+    token_stream = llm.create_completion(
+        formatted_prompt,
+        max_tokens=MAX_TOKENS,
+        stop=STOP_SEQUENCES,
+        stream=True,
+        temperature=TEMPERATURE,
+        top_p=TOP_P,
+    )
+
+    for chunk in token_stream:
+        token = chunk["choices"][0]["text"]
+        current_sentence_buffer += token
+        full_output_so_far += token  # Keep the full output in case a comprehensive check is needed
+
+        # Simple sentence detection (look for common sentence endings),
+        # with a maximum buffer length as a fallback.
+        if re.search(r'[.!?]\s*$', current_sentence_buffer) or len(current_sentence_buffer) > 100:
+            is_safe, detected_label = is_text_safe(current_sentence_buffer)
+            if not is_safe:
+                print(f"Safety check failed for sentence: '{current_sentence_buffer.strip()}' (Detected: {detected_label})")
+                yield "[Content removed due to safety guidelines]"  # Replace unsafe content
+                current_sentence_buffer = ""  # Clear buffer for the next tokens
+                # Optionally stop further generation once unsafe content is found
+                # by uncommenting the `return` below.
+                # return
+            else:
+                yield current_sentence_buffer  # Yield the safe sentence
+                current_sentence_buffer = ""  # Clear buffer for the next sentence
+
+    # After the loop, check and yield any remaining text in the buffer
+    if current_sentence_buffer.strip():
+        is_safe, detected_label = is_text_safe(current_sentence_buffer)
+        if not is_safe:
+            print(f"Safety check failed for remaining text: '{current_sentence_buffer.strip()}' (Detected: {detected_label})")
+            yield "[Content removed due to safety guidelines]"
+        else:
+            yield current_sentence_buffer
 
+
```
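Each yield from this generator is a single vetted sentence (or the removal marker), not the accumulated response; Gradio treats every yielded value as the new value of the output Textbox, so the UI shows the most recent sentence. Joining the yields recovers the full text. A minimal sketch, run outside the app; the prompt is just an example:

```python
# Illustrative only: call the generator directly (e.g. from a REPL) and join the
# yielded pieces into one response string.
parts = list(generate_word_by_word_with_safety("Explain photosynthesis in two sentences."))
print("".join(parts))
```

The remainder of the updated file wires the generator into a Gradio Blocks UI.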
```diff
+# --- Gradio Blocks Interface ---
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # SmilyAI: Sam-reason-v3-GGUF Word-by-Word Inference (CPU with Safety Filter)
+        Enter a prompt and get a word-by-word response from the Sam-reason-v3-GGUF model.
+        **Please note:** All generated sentences are checked for safety using an AI filter.
+        Potentially unsafe content will be replaced with `[Content removed due to safety guidelines]`.
+        Running on Hugging Face Spaces' free CPU tier.
+        """
+    )
+
+    with gr.Row():
+        user_prompt = gr.Textbox(
+            lines=5,
+            label="Enter your prompt here:",
+            placeholder="e.g., Explain the concept of quantum entanglement in simple terms.",
+            scale=4
+        )
+        generated_text = gr.Textbox(label="Generated Text", show_copy_button=True, scale=6)
+
+    send_button = gr.Button("Send", variant="primary")
+
+    # Connect the button click to the inference function with safety check
+    send_button.click(
+        fn=generate_word_by_word_with_safety,  # Use the new safety-enabled function
+        inputs=user_prompt,
+        outputs=generated_text,
+        api_name="predict",
+    )
 
 # Launch the Gradio application
 if __name__ == "__main__":
     print("Launching Gradio app...")
+    demo.launch(server_name="0.0.0.0", server_port=7860)
```
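Because the click handler is exposed with `api_name="predict"`, the endpoint can also be called programmatically. A minimal sketch using `gradio_client`; the URL is a placeholder (substitute the actual Space URL, or `http://localhost:7860` when running the app locally):

```python
# Minimal sketch, assuming the app above is reachable at the placeholder URL.
from gradio_client import Client

client = Client("http://localhost:7860/")  # placeholder; substitute the real Space URL
result = client.predict(
    "Explain the concept of quantum entanglement in simple terms.",
    api_name="/predict",
)
print(result)  # for a streaming endpoint, predict() returns the final yielded value
```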