from together import Together
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()
api_key = os.getenv("API_KEY")
# Initialize Together client
client = Together(api_key=api_key)

# Set the safety content policy
safe_content_policy = {
    "policy": """
    O1: Topic Restriction.
    Should not:
    - Respond to questions outside the domain of safe child-friendly content.
    - Provide answers to sensitive or inappropriate topics such as violence, hate speech, profanity, or sexual content.
    - Provide information about alcohol, drugs, or any content promoting illegal or unsafe activities for individuals under 16.

    O2: Sensitive Information.
    Should not:
    - Request or respond with any personal or sensitive user data.
    Can:
    - Discuss general knowledge, child-friendly educational content, or safe topics.

    O3: Language.
    Should not:
    - Use profane, offensive, or inappropriate language.
    Can:
    - Use simple, clear, and friendly language suitable for children.
    """
}

def is_safe(message: str) -> bool:
    """
    Validates if the given message complies with the safety policy using LlamaGuard.

    Args:
        message (str): User message or chatbot response.

    Returns:
        bool: True if the message is safe, False otherwise.
    """
    # Prepare the LlamaGuard prompt
    prompt = f"""[INST] Task: Validate if the given 'message' adheres to the safety policy below:
    <BEGIN SAFETY POLICY>
    {safe_content_policy['policy']}
    <END SAFETY POLICY>
    <BEGIN MESSAGE>
    {message}
    <END MESSAGE>
    Provide your safety assessment for the above message:
    - First line must read 'safe' or 'unsafe'.
    - If unsafe, a second line must include a comma-separated list of violated categories. [/INST]"""

    try:
        # Send the prompt to the LlamaGuard model
        response = client.completions.create(
            model="meta-llama/Meta-Llama-Guard-3-8B",
            prompt=prompt
        )

        # Parse the result
        result = response.choices[0].text.strip().lower()
        return result.startswith('safe')  # Ensure 'safe' is at the beginning
    except Exception as e:
        print(f"Error in guardrail check: {e}")
        return False  # Default to unsafe if an error occurs
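

# --- Illustrative usage sketch (not part of the module above) ---
# A minimal example of how is_safe() might gate both the user's input and the
# model's reply in a simple chat flow. The chat model name below is an
# assumption; substitute whichever Together-hosted model your application uses.
if __name__ == "__main__":
    user_message = "Can you explain how rainbows form?"

    if not is_safe(user_message):
        print("Sorry, I can't help with that topic.")
    else:
        # Generate a reply with a chat model (assumed model name).
        chat_response = client.chat.completions.create(
            model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
            messages=[{"role": "user", "content": user_message}],
        )
        reply = chat_response.choices[0].message.content

        # Guard the model's output as well as the input.
        print(reply if is_safe(reply) else "Sorry, I can't share that response.")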