import os

import gradio as gr
from openai import OpenAI

title = None  # "ServiceNow-AI Chat"
description = None

modelConfig = {
    "MODEL_NAME": os.environ.get("MODEL_NAME"),
    "MODE_DISPLAY_NAME": os.environ.get("MODE_DISPLAY_NAME"),
    "MODEL_HF_URL": os.environ.get("MODEL_HF_URL"),
    "VLLM_API_URL": os.environ.get("VLLM_API_URL"),
    "AUTH_TOKEN": os.environ.get("AUTH_TOKEN"),
}

# Initialize the OpenAI client against the vLLM server's OpenAI-compatible endpoint
client = OpenAI(
    api_key=modelConfig.get("AUTH_TOKEN"),
    base_url=modelConfig.get("VLLM_API_URL"),
)


def chat_fn(message, history):
    # With type="messages", Gradio passes history as a list of
    # {"role": ..., "content": ...} dicts, which is already the format
    # the OpenAI API expects; just copy it and append the new user turn.
    formatted = [{"role": m["role"], "content": m["content"]} for m in history]
    formatted.append({"role": "user", "content": message})

    # Create the streaming response
    stream = client.chat.completions.create(
        model=modelConfig.get("MODEL_NAME"),
        messages=formatted,
        temperature=0.8,
        stream=True,
    )

    output = ""
    for chunk in stream:
        # Some chunks (e.g. usage-only chunks) can arrive with no choices
        if not chunk.choices:
            continue
        # The delta's content field can be None on the final chunk
        content = chunk.choices[0].delta.content or ""
        output += content
        # Yield the accumulated output, stripping a trailing "<|end|>" token if present
        yield {"role": "assistant", "content": output.removesuffix("<|end|>")}


# Add the model display name and Hugging Face URL to the description
# description = f"### Model: [{modelConfig.get('MODE_DISPLAY_NAME')}]({modelConfig.get('MODEL_HF_URL')})"

print(f"Running model {modelConfig.get('MODE_DISPLAY_NAME')} ({modelConfig.get('MODEL_NAME')})")

gr.ChatInterface(
    chat_fn,
    title=title,
    description=description,
    theme=gr.themes.Default(primary_hue="green"),
    type="messages",
).launch()
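
# Usage sketch: one way to run this app against a local vLLM server. All values
# below are placeholders for illustration, not real endpoints, model ids, or
# secrets, and the app.py filename is an assumption about how this file is saved.
#
#   export MODEL_NAME="my-org/my-model"             # placeholder: the model id vLLM is serving
#   export MODE_DISPLAY_NAME="My Model"             # placeholder: label used in logs/description
#   export MODEL_HF_URL="https://huggingface.co/my-org/my-model"  # placeholder
#   export VLLM_API_URL="http://localhost:8000/v1"  # vLLM's default OpenAI-compatible base URL
#   export AUTH_TOKEN="token-abc123"                # placeholder: should match vLLM's --api-key
#   python app.py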