"""This simple script shows how to interact with an OpenAI-compatible server from a client.""" import argparse import modal from openai import OpenAI class Colors: """ANSI color codes""" GREEN = "\033[0;32m" RED = "\033[0;31m" BLUE = "\033[0;34m" GRAY = "\033[0;90m" BOLD = "\033[1m" END = "\033[0m" def get_completion(client, model_id, messages, args): completion_args = { "model": model_id, "messages": messages, "frequency_penalty": args.frequency_penalty, "max_tokens": args.max_tokens, "n": args.n, "presence_penalty": args.presence_penalty, "seed": args.seed, "stop": args.stop, "stream": args.stream, "temperature": args.temperature, "top_p": args.top_p, } completion_args = {k: v for k, v in completion_args.items() if v is not None} try: response = client.chat.completions.create(**completion_args) return response except Exception as e: print(Colors.RED, f"Error during API call: {e}", Colors.END, sep="") return None def main(): parser = argparse.ArgumentParser(description="OpenAI Client CLI") parser.add_argument( "--model", type=str, default=None, help="The model to use for completion, defaults to the first available model", ) parser.add_argument( "--workspace", type=str, default=None, help="The workspace where the LLM server app is hosted, defaults to your current Modal workspace", ) parser.add_argument( "--environment", type=str, default=None, help="The environment in your Modal workspace where the LLM server app is hosted, defaults to your current environment", ) parser.add_argument( "--app-name", type=str, default="example-vllm-openai-compatible", help="A Modal App serving an OpenAI-compatible API", ) parser.add_argument( "--function-name", type=str, default="serve", help="A Modal Function serving an OpenAI-compatible API. Append `-dev` to use a `modal serve`d Function.", ) parser.add_argument( "--api-key", type=str, default="super-secret-key", help="The API key to use for authentication, set in your api.py", ) # Completion parameters parser.add_argument("--max-tokens", type=int, default=None) parser.add_argument("--temperature", type=float, default=0.7) parser.add_argument("--top-p", type=float, default=0.9) parser.add_argument("--top-k", type=int, default=0) parser.add_argument("--frequency-penalty", type=float, default=0) parser.add_argument("--presence-penalty", type=float, default=0) parser.add_argument( "--n", type=int, default=1, help="Number of completions to generate. 
Streaming and chat mode only support n=1.", ) parser.add_argument("--stop", type=str, default=None) parser.add_argument("--seed", type=int, default=None) # Prompting parser.add_argument( "--prompt", type=str, default="Compose a limerick about baboons and racoons.", help="The user prompt for the chat completion", ) parser.add_argument( "--system-prompt", type=str, default="You are a poetic assistant, skilled in writing satirical doggerel with creative flair.", help="The system prompt for the chat completion", ) # UI options parser.add_argument( "--no-stream", dest="stream", action="store_false", help="Disable streaming of response chunks", ) parser.add_argument( "--chat", action="store_true", help="Enable interactive chat mode" ) args = parser.parse_args() client = OpenAI(api_key=args.api_key) workspace = args.workspace or modal.config._profile environment = args.environment or modal.config.config["environment"] prefix = workspace + (f"-{environment}" if environment else "") client.base_url = ( f"https://{prefix}--{args.app_name}-{args.function_name}.modal.run/v1" ) if args.model: model_id = args.model print( Colors.BOLD, f"🧠: Using model {model_id}. This may trigger a model load on first call!", Colors.END, sep="", ) else: print( Colors.BOLD, f"šŸ”Ž: Looking up available models on server at {client.base_url}. This may trigger a model load!", Colors.END, sep="", ) model = client.models.list().data[0] model_id = model.id print( Colors.BOLD, f"🧠: Using {model_id}", Colors.END, sep="", ) messages = [ { "role": "system", "content": args.system_prompt, } ] print(Colors.BOLD + "🧠: Using system prompt: " + args.system_prompt + Colors.END) if args.chat: print( Colors.GREEN + Colors.BOLD + "\nEntering chat mode. Type 'bye' to end the conversation." + Colors.END ) while True: user_input = input("\nYou: ") if user_input.lower() in ["bye"]: break MAX_HISTORY = 10 if len(messages) > MAX_HISTORY: messages = messages[:1] + messages[-MAX_HISTORY + 1 :] messages.append({"role": "user", "content": user_input}) response = get_completion(client, model_id, messages, args) if response: if args.stream: # only stream assuming n=1 print(Colors.BLUE + "\nšŸ¤–: ", end="") assistant_message = "" for chunk in response: if chunk.choices[0].delta.content: content = chunk.choices[0].delta.content print(content, end="") assistant_message += content print(Colors.END) else: assistant_message = response.choices[0].message.content print( Colors.BLUE + "\nšŸ¤–:" + assistant_message + Colors.END, sep="", ) messages.append({"role": "assistant", "content": assistant_message}) else: messages.append({"role": "user", "content": args.prompt}) print(Colors.GREEN + f"\nYou: {args.prompt}" + Colors.END) response = get_completion(client, model_id, messages, args) if response: if args.stream: print(Colors.BLUE + "\nšŸ¤–:", end="") for chunk in response: if chunk.choices[0].delta.content: print(chunk.choices[0].delta.content, end="") print(Colors.END) else: # only case where multiple completions are returned for i, response in enumerate(response.choices): print( Colors.BLUE + f"\nšŸ¤– Choice {i + 1}:{response.message.content}" + Colors.END, sep="", ) if __name__ == "__main__": main()
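
# Example invocations, as a sketch: the filename `client.py` is an assumption,
# and the server must already be deployed under the app/function names above
# (or overridden with --app-name/--function-name).
#
#   python client.py                      # one-shot completion with the default prompt
#   python client.py --chat               # interactive chat mode; type 'bye' to exit
#   python client.py --no-stream --n 3    # multiple completions (non-chat, non-streaming only)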