# GPT-4.1 mini: Research Preview (Short-Term Availability)

import gradio as gr
import sys
import json
from huggingface_hub import InferenceClient

# Hugging Face Hub model id that the inference client below is bound to.
MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
# Single client created at import time; predict() streams completions through it.
client = InferenceClient(model=MODEL)
# NOTE(review): neither of the next two settings is referenced in the visible
# part of this file — presumably used by the UI/queue setup further down; confirm.
NUM_THREADS = 2
DISABLED = False

def exception_handler(exception_type, exception, traceback):
    """Print an exception as a single ``TypeName: message`` line.

    The three parameters follow the ``sys.excepthook`` calling convention;
    the traceback argument is accepted but not used.
    """
    summary = f"{exception_type.__name__}: {exception}"
    print(summary)
# Route uncaught exceptions through exception_handler so only a one-line
# "Type: message" summary is printed.
sys.excepthook = exception_handler
# A limit of 0 suppresses traceback entries when the interpreter prints an
# exception, leaving just the exception type and value.
sys.tracebacklimit = 0

def _pair_turns(history):
    """Group a flat [user, assistant, user, ...] list into (user, assistant) tuples.

    A trailing user message that has no reply yet is left out.
    """
    return list(zip(history[::2], history[1::2]))


def predict(inputs, top_p, temperature, chat_counter, chatbot, history, request: gr.Request):
    """Stream one assistant reply for ``inputs`` and keep the chat state consistent.

    Args:
        inputs: Text of the new user message.
        top_p: Forwarded as ``top_p`` to ``client.text_generation``.
        temperature: Forwarded as ``temperature`` to ``client.text_generation``.
        chat_counter: Call counter; incremented once per invocation.
        chatbot: Not read here; the displayed pairs are rebuilt from ``history``.
        history: Flat list alternating user message / assistant reply.
            Mutated in place.
        request: Not used inside this function.

    Yields:
        6-tuples ``(chat_pairs, history, chat_counter, status, update, update)``.
        While tokens are streaming the two trailing updates carry
        ``interactive=False``. One final tuple is always yielded with
        ``interactive=True`` so the controls are usable again, whether the
        request succeeded, failed, or produced no tokens. ``status`` is
        ``"200 OK"`` or ``"Error: <message>"``.
    """
    # NOTE(review): the <|role|>...<|end|> markers below do not look like the
    # Llama 3 Instruct chat template that MODEL suggests — confirm the prompt
    # format against the model card before relying on output quality.
    segments = ["<|system|>You are a helpful assistant.<|end|>\n"]
    for i, msg in enumerate(history):
        role = "user" if i % 2 == 0 else "assistant"
        segments.append(f"<|{role}|>{msg}<|end|>\n")
    segments.append(f"<|user|>{inputs}<|end|>\n<|assistant|>")
    prompt = "".join(segments)

    chat_counter += 1
    history.append(inputs)
    partial_words = ""
    token_counter = 0
    status = "200 OK"

    try:
        stream = client.text_generation(
            prompt,
            max_new_tokens=200,
            temperature=temperature,
            top_p=top_p,
            stream=True,
        )
        for token in stream:
            partial_words += token
            if token_counter == 0:
                # First token opens the assistant slot for this turn.
                history.append(partial_words)
            else:
                history[-1] = partial_words
            token_counter += 1
            yield _pair_turns(history), history, chat_counter, status, gr.update(interactive=False), gr.update(interactive=False)
    except Exception as e:
        # UI boundary: report the failure in the status field instead of crashing
        # the event handler. Any partial reply already streamed is kept.
        print(f'error found: {e}')
        status = f"Error: {e}"

    if token_counter == 0:
        # No reply text arrived, so the message just appended has no partner.
        # Drop it: roles are derived from index parity, and a dangling user
        # entry would mislabel and mispair every later turn.
        history.pop()

    print(json.dumps({"chat_counter": chat_counter, "partial_words": partial_words, "token_counter": token_counter}))

    # Final state: show the conversation as it stands (never an empty chat)
    # and hand control back to the user.
    yield _pair_turns(history), history, chat_counter, status, gr.update(interactive=True), gr.update(interactive=True)

def reset_textbox():
    """Return two component updates: clear-and-lock the first, lock the second.

    The first update sets ``value`` to an empty string and ``interactive`` to
    False; the second only sets ``interactive`` to False.
    """
    cleared_and_locked = gr.update(value='', interactive=False)
    locked = gr.update(interactive=False)
    return cleared_and_locked, locked

# Centered <h1> heading markup; where it is rendered is not visible in this part of the file.
# NOTE(review): the heading names "GPT-4.1 mini" while MODEL in this file is a
# Llama 3 checkpoint — confirm which one is intended.
title = """<h1 align="center">GPT-4.1 mini: Research Preview (Short-Term Availability)</h1>"""
description = """Language models can be conditioned to act like dialogue agents through a conversational prompt that typically takes the form: