model_list:
  - model_name: azure_o3-pro
    litellm_params:
      model: azure/azure_o3-pro
      api_key: os.environ/AZURE_RESPONSES_OPENAI_API_KEY
      api_base: os.environ/AZURE_RESPONSES_OPENAI_BASE_URL
      api_version: "preview"
      drop_params: true
      additional_drop_params: ["temperature", "top_p", "presence_penalty", "frequency_penalty", "logprobs", "top_logprobs", "logit_bias", "max_tokens"]
      reasoning:
        effort: high
        summary: detailed
      text:
        verbosity: high
    model_info:
      mode: responses
      background: true
      max_tokens: 100000
      max_input_tokens: 200000
      max_output_tokens: 100000
      input_cost_per_token: 0.00002
      output_cost_per_token: 0.00008
      input_cost_per_token_batches: 0.00001
      output_cost_per_token_batches: 0.00004
      supports_function_calling: true
      supports_parallel_function_calling: false
      supports_vision: true
      supports_pdf_input: true
      supports_prompt_caching: true
      supports_response_schema: true
      supports_reasoning: true
      supports_tool_choice: true
      supported_endpoints:
        - /v1/responses
        - /v1/batch
      supported_modalities:
        - text
        - image
      supported_output_modalities:
        - text
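  # A hedged usage sketch (comment only, not config): once the proxy is up,
  # this deployment is reachable through the OpenAI-compatible Responses
  # endpoint. The http://localhost:4000 address and the key value are
  # assumptions; substitute your own proxy URL and virtual/master key.
  #
  #   curl http://localhost:4000/v1/responses \
  #     -H "Authorization: Bearer $LITELLM_KEY" \
  #     -H "Content-Type: application/json" \
  #     -d '{"model": "azure_o3-pro", "input": "Summarize this config."}'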
  - model_name: azure_gpt-5
    litellm_params:
      model: azure/azure_gpt-5
      api_base: os.environ/AZURE_RESPONSES_OPENAI_BASE_URL
      api_version: "preview"
      api_key: os.environ/AZURE_RESPONSES_OPENAI_API_KEY
      merge_reasoning_content_in_choices: true
      drop_params: true
      additional_drop_params: ["temperature", "top_p", "presence_penalty", "frequency_penalty", "logprobs", "top_logprobs", "logit_bias", "max_tokens"]
      reasoning:
        effort: high
        summary: detailed
    model_info:
      supports_reasoning: true
      max_input_tokens: 128000
      max_output_tokens: 64000
      supports_tool_choice: true
      supports_vision: true
      supports_response_schema: true
      supports_prompt_caching: true
      background: true
      mode: responses
      input_cost_per_token: 0.00000125
      output_cost_per_token: 0.000010
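  # A hedged usage sketch (comment only): with merge_reasoning_content_in_choices
  # enabled, a plain Chat Completions call should come back with the reasoning
  # summary merged into the assistant message content. Address and key are
  # assumptions, as above.
  #
  #   curl http://localhost:4000/v1/chat/completions \
  #     -H "Authorization: Bearer $LITELLM_KEY" \
  #     -H "Content-Type: application/json" \
  #     -d '{"model": "azure_gpt-5", "messages": [{"role": "user", "content": "Outline a 3-step test plan."}]}'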
# --------------Other Settings--------------------
litellm_settings:
  # Networking settings
  request_timeout: 4000 # (int) LLM request timeout in seconds; raises a Timeout error if a call takes longer. Sets litellm.request_timeout
  num_retries: 3
  # fallbacks: [{ "gemini-1.5-pro": ["gemini-1.5-flash"] }]
  allowed_fails: 3 # cooldown a model if it fails more than this many calls in a minute
  cooldown_time: 30 # how long (seconds) to cooldown a model once fails/min > allowed_fails
  drop_params: true
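  # Worked example of the settings above (assumed semantics): a failing request
  # is retried up to num_retries (3) times; if a single deployment racks up more
  # than allowed_fails (3) failures within a minute, it is taken out of routing
  # for cooldown_time (30) seconds before being tried again.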
general_settings:
  master_key: os.environ/MASTER_KEY # [OPTIONAL] Only use this if you require all calls to contain this key (e.g. Authorization: Bearer sk-1234)
# router_settings:
#   fallbacks:
#     [
#       { "or/gemini-2.0-flash-exp": ["gg1/gemini-2.0-flash-exp"] },
#       { "gpt-3.5-turbo": ["gemini-1.5-flash"] },
#     ]
#   model_group_alias: { "gpt-4": "gemini-1.5-pro" }
#   routing_strategy: simple-shuffle