import gradio as gr import sys import json from huggingface_hub import InferenceClient MODEL = "meta-llama/Meta-Llama-3-8B-Instruct" client = InferenceClient(model=MODEL) NUM_THREADS = 2 DISABLED = False def exception_handler(exception_type, exception, traceback): print("%s: %s" % (exception_type.__name__, exception)) sys.excepthook = exception_handler sys.tracebacklimit = 0 def predict(inputs, top_p, temperature, chat_counter, chatbot, history, request: gr.Request): prompt = "<|system|>You are a helpful assistant.<|end|>\n" for i, msg in enumerate(history): role = "user" if i % 2 == 0 else "assistant" prompt += f"<|{role}|>{msg}<|end|>\n" prompt += f"<|user|>{inputs}<|end|>\n<|assistant|>" chat_counter += 1 history.append(inputs) partial_words = "" token_counter = 0 try: for token in client.text_generation(prompt, max_new_tokens=200, temperature=temperature, top_p=top_p, stream=True): partial_words += token if token_counter == 0: history.append(partial_words) else: history[-1] = partial_words token_counter += 1 yield [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)], history, chat_counter, "200 OK", gr.update(interactive=False), gr.update(interactive=False) except Exception as e: print(f'error found: {e}') yield [], history, chat_counter, f"Error: {e}", gr.update(interactive=True), gr.update(interactive=True) print(json.dumps({"chat_counter": chat_counter, "partial_words": partial_words, "token_counter": token_counter})) def reset_textbox(): return gr.update(value='', interactive=False), gr.update(interactive=False) title = """