# app.py

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# --- 1. Model and Tokenizer Configuration ---
# The model is identified by its Hugging Face Hub repo ID; the Space downloads it
# automatically the first time the app starts.
model_name = "likhonsheikh/sheikh-coder-v1-3b"
print("Starting script...")

# --- 2. Load the Model ---
# We'll wrap this in a try-except block to provide clear error messages if something goes wrong on the Space.
try:
    # Use torch_dtype="auto" to let transformers choose the best precision (like bfloat16 on new GPUs)
    # This can significantly speed up inference and reduce memory usage.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        torch_dtype="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Move model to GPU if available on the Space's hardware
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    
    model_loaded = True
    print(f"Model '{model_name}' loaded successfully on device: {device}")

except Exception as e:
    model_loaded = False
    error_message = str(e)
    print(f"FATAL: Failed to load model. Error: {error_message}")

# --- 3. Define the Prediction Function ---
def generate_code(prompt):
    """
    This function takes a text prompt and returns the model's completion.
    """
    if not model_loaded:
        # If the model failed to load, show an error in the UI.
        raise gr.Error(f"Model failed to load: {error_message}")
        
    try:
        # Tokenize the input prompt and move it to the same device as the model.
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        
        # Generate the output from the model
        outputs = model.generate(
            **inputs,
            max_new_tokens=256, # Limit the number of new tokens to generate
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id # Set pad token to avoid warnings
        )
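
        # If more varied completions were wanted, greedy decoding could be swapped for
        # sampling. A minimal sketch (the temperature/top_p values are illustrative,
        # not tuned for this model):
        #
        # outputs = model.generate(
        #     **inputs,
        #     max_new_tokens=256,
        #     do_sample=True,
        #     temperature=0.7,
        #     top_p=0.95,
        #     pad_token_id=tokenizer.eos_token_id,
        # )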
        
        # Decode the generated tokens into a string
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
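        # Note: the decoded string includes the original prompt, because generate()
        # returns the full token sequence. If only the new completion were wanted, a
        # minimal sketch (reusing the `inputs`/`outputs` above) would be:
        #   new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        #   generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)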
        return generated_text
        
    except Exception as e:
        print(f"Error during generation: {str(e)}")
        raise gr.Error(f"An error occurred during code generation: {str(e)}")

# --- 4. Create the Gradio Interface ---
demo = gr.Interface(
    fn=generate_code,
    inputs=gr.Textbox(
        lines=5, 
        label="Enter your code snippet or question:", 
        placeholder="def fibonacci(n):"
    ),
    outputs=gr.Textbox(label="AI Sheikh's Response:", lines=10),
    title="AI Sheikh Coder (3B Model)",
    description="A Gradio app for the sheikh-coder-v1-3b model. Provide a starting piece of code or a question, and the AI will complete it. Model loading can take a minute on boot.",
    examples=[
        ["def factorial(n):"],
        ["import pandas as pd\n# create a dataframe with 3 columns: 'name', 'age', 'city'"],
        ["# A python function to check if a number is prime"]
    ]
)

# --- 5. Launch the App (for Hugging Face Spaces) ---
# The demo.launch() command is all that's needed to start the web server.
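# On shared Space hardware, calling demo.queue() before launch (Gradio's built-in
# request queue) would keep long generations from hitting request timeouts; it is
# omitted here to keep the script minimal.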
if __name__ == "__main__":
    demo.launch()
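
# To try the app outside of Spaces, installing gradio, transformers, and torch and then
# running `python app.py` should be enough; the model weights are downloaded from the
# Hub on first run.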