File size: 16,029 Bytes
9d5b280 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 |
import copy
import json
import os
from functools import lru_cache
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Type, cast
from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.api_models import JsonChatStr
from lm_eval.utils import eval_logger, simple_parse_args_string
class LogLikelihoodResult(NamedTuple):
log_likelihood: float
is_greedy: bool
def _verify_credentials(creds: Any) -> None:
"""
Verifies that all required keys are present in the credentials dictionary.
Args:
creds (Any): A dictionary containing the credentials.
Raises:
ValueError: If any of the necessary credentials are missing, with guidance on which environment variables need to be set.
"""
required_keys = ["apikey", "url", "project_id"]
env_var_mapping = {
"apikey": "WATSONX_API_KEY",
"url": "WATSONX_URL",
"project_id": "WATSONX_PROJECT_ID",
}
missing_keys = [key for key in required_keys if key not in creds or not creds[key]]
if missing_keys:
missing_env_vars = [env_var_mapping[key] for key in missing_keys]
raise ValueError(
f"Missing required credentials: {', '.join(missing_keys)}. Please set the following environment variables: {', '.join(missing_env_vars)}"
)
@lru_cache(maxsize=None)
def get_watsonx_credentials() -> Dict[str, str]:
"""
Retrieves Watsonx API credentials from environmental variables.
Returns:
Dict[str, str]: A dictionary containing the credentials necessary for authentication, including
keys such as `apikey`, `url`, and `project_id`.
Raises:
AssertionError: If the credentials format is invalid or any of the necessary credentials are missing.
"""
credentials = {
"apikey": os.getenv("WATSONX_API_KEY", None),
"url": os.getenv("WATSONX_URL", None),
"project_id": os.getenv("WATSONX_PROJECT_ID", None),
}
_verify_credentials(credentials)
return credentials
@register_model("watsonx_llm")
class WatsonxLLM(LM):
"""
Implementation of LM model interface for evaluating Watsonx model with the lm_eval framework.
See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/model_guide.md for reference.
"""
@classmethod
def create_from_arg_string(
cls: Type["WatsonxLLM"],
arg_string: str,
additional_config: Optional[Dict] = None,
) -> "WatsonxLLM":
"""
Allow the user to specify model parameters (TextGenerationParameters) in CLI arguments.
"""
try:
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
except ImportError:
raise ImportError(
"Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
)
args = simple_parse_args_string(arg_string)
args.update(additional_config)
model_id = args.pop("model_id", None)
if model_id is None:
raise ValueError("'model_id' is required, please pass it in 'model_args'")
if not args.get("do_sample", None):
args["temperature"] = None
args["top_p"] = None
args["top_k"] = None
args["seed"] = None
generate_params = {
GenParams.DECODING_METHOD: (
"greedy" if not args.get("do_sample", None) else "sample"
),
GenParams.LENGTH_PENALTY: args.get("length_penalty", None),
GenParams.TEMPERATURE: args.get("temperature", None),
GenParams.TOP_P: args.get("top_p", None),
GenParams.TOP_K: args.get("top_k", None),
GenParams.RANDOM_SEED: args.get("seed", None),
GenParams.REPETITION_PENALTY: args.get("repetition_penalty", None),
GenParams.MIN_NEW_TOKENS: args.get("min_new_tokens", None),
GenParams.MAX_NEW_TOKENS: args.get("max_new_tokens", 256),
GenParams.STOP_SEQUENCES: args.get("stop_sequences", None),
GenParams.TIME_LIMIT: args.get("time_limit", None),
GenParams.TRUNCATE_INPUT_TOKENS: args.get("truncate_input_tokens", None),
GenParams.RETURN_OPTIONS: {
"generated_tokens": True,
"input_tokens": True,
"token_logprobs": True,
"token_ranks": True,
},
}
generate_params = {k: v for k, v in generate_params.items() if v is not None}
return cls(
watsonx_credentials=get_watsonx_credentials(),
model_id=model_id,
generate_params=generate_params,
)
def __init__(
self,
watsonx_credentials: Dict,
model_id,
generate_params: Optional[Dict[Any, Any]] = None,
) -> None:
try:
from ibm_watsonx_ai import APIClient
from ibm_watsonx_ai.foundation_models import ModelInference
except ImportError:
raise ImportError(
"Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
)
super().__init__()
client = APIClient(watsonx_credentials)
project_id = watsonx_credentials.get("project_id", None)
deployment_id = watsonx_credentials.get("deployment_id", None)
client.set.default_project(project_id)
self.generate_params = generate_params
self.model = ModelInference(
model_id=model_id,
deployment_id=deployment_id,
api_client=client,
project_id=project_id,
)
self._model_id = model_id
@staticmethod
def _has_stop_token(response_tokens: List[str], context_tokens: List[str]) -> bool:
"""
Determines whether a stop token has been generated in the `response_tokens` compared to the `context_tokens`.
If the tokens do not match as expected, the function raises a RuntimeError, indicating a possible
misalignment between the tokens generated by the tokenizer and the model.
Args:
response_tokens (List[str]): The List of tokens generated as a response by the model.
context_tokens (List[str]): The List of tokens representing the input context.
Returns:
bool: True if the `response_tokens` likely contain a stop token that terminates the sequence,
otherwise raises an exception.
Raises:
RuntimeError: If there is an unexpected mismatch between the `response_tokens` and the `context_tokens`.
"""
context_length = len(context_tokens)
if response_tokens[: context_length - 1] == context_tokens[:-1]:
return (
response_tokens[-1] != context_tokens[-1]
) # only last token differs, probably stop sequence (</s>)
raise RuntimeError(
f"There is an unexpected difference between tokenizer and model tokens:\n"
f"context_tokens={context_tokens}\n"
f"response_tokens={response_tokens[:context_length]}"
)
def _check_model_logprobs_support(self):
"""
Verifies if the model supports returning log probabilities for input tokens.
This function sends a prompt to the model and checks whether the model's response
includes log probabilities for the input tokens. If log probabilities are not present,
it raises a `RuntimeError`, indicating that the model is not supported.
Raises:
RuntimeError: If the model does not return log probabilities for input tokens.
"""
tokens = self.model.generate_text(
prompt=["The best ice cream flavor is:"],
params=self.generate_params,
raw_response=True,
)[0]["results"][0]
if all(token.get("logprob", None) is None for token in tokens["input_tokens"]):
raise RuntimeError(
f"Model {self._model_id} is not supported: does not return logprobs for input tokens"
)
def _get_log_likelihood(
self,
input_tokens: List[Dict[str, float]],
context_tokens: List[Dict[str, float]],
) -> LogLikelihoodResult:
"""
Calculates the log likelihood of the generated tokens compared to the context tokens.
Args:
input_tokens (List[Dict[str, float]]): A List of token dictionaries, each containing
token information like `text` and `logprob`.
context_tokens (List[Dict[str, float]]): A List of token dictionaries representing
the input context.
Returns:
LogLikelihoodResult: An object containing the calculated log likelihood and a boolean
flag indicating if the tokens were generated greedily.
"""
response_tokens = [token["text"] for token in input_tokens]
context_length = len(context_tokens)
if self._has_stop_token(response_tokens, context_tokens):
context_length -= 1
return LogLikelihoodResult(
log_likelihood=sum(
token.get("logprob", 0) for token in input_tokens[context_length:]
),
is_greedy=all(
token["rank"] == 1 for token in input_tokens[context_length:]
),
)
def generate_until(self, requests: List[Instance]) -> List[str]:
"""
Generates text responses for a List of requests, with progress tracking and caching.
Args:
requests (List[Instance]): A List of instances, each containing a text input to be processed.
Returns:
List[str]: A List of generated responses.
"""
requests = [request.args for request in requests]
results = []
for request in tqdm(
requests,
desc="Running generate_until function ...",
):
context, continuation = request
try:
if isinstance(context, JsonChatStr):
context = json.loads(context.prompt)
response = self.model.chat(context, self.generate_params)
response = response["choices"][0]["message"]["content"]
else:
response = self.model.generate_text(context, self.generate_params)
except Exception as exp:
eval_logger.error("Error while generating text.")
raise exp
results.append(response)
self.cache_hook.add_partial(
"generate_until", (context, continuation), response
)
return results
def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
"""
Args:
requests: Each request contains Instance.args : Tuple[str, str] containing:
1. an input string to the LM and
2. a target string on which the loglikelihood of the LM producing this target,
conditioned on the input, will be returned.
Returns:
Tuple (loglikelihood, is_greedy) for each request according to the input order:
loglikelihood: probability of generating the target string conditioned on the input
is_greedy: True if and only if the target string would be generated by greedy sampling from the LM
"""
try:
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
except ImportError:
raise ImportError(
"Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
)
self._check_model_logprobs_support()
generate_params = copy.copy(self.generate_params)
generate_params[GenParams.MAX_NEW_TOKENS] = 1
requests = [request.args for request in requests]
results: List[LogLikelihoodResult] = []
# Note: We're not using batching due to (current) indeterminism of loglikelihood values when sending batch of requests
for request in tqdm(
requests,
desc="Running loglikelihood function ...",
):
context, continuation = request
try:
tokenized_context = self.model.tokenize(
prompt=context, return_tokens=True
)["result"]["tokens"]
except Exception as exp:
eval_logger.error("Error while model tokenize.")
raise exp
input_prompt = context + continuation
try:
response = self.model.generate_text(
prompt=input_prompt, params=generate_params, raw_response=True
)
except Exception as exp:
eval_logger.error("Error while model generate text.")
raise exp
log_likelihood_response = self._get_log_likelihood(
response["results"][0]["input_tokens"], tokenized_context
)
results.append(log_likelihood_response)
self.cache_hook.add_partial(
"loglikelihood",
(context, continuation),
(
log_likelihood_response.log_likelihood,
log_likelihood_response.is_greedy,
),
)
return cast(List[Tuple[float, bool]], results)
def loglikelihood_rolling(self, requests) -> List[Tuple[float, bool]]:
"""
Used to evaluate perplexity on a data distribution.
Args:
requests: Each request contains Instance.args : Tuple[str] containing an input string to the model whose
entire loglikelihood, conditioned on purely the EOT token, will be calculated.
Returns:
Tuple (loglikelihood,) for each request according to the input order:
loglikelihood: solely the probability of producing each piece of text given no starting input.
"""
try:
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
except ImportError:
raise ImportError(
"Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
)
self._check_model_logprobs_support()
generate_params = copy.deepcopy(self.generate_params)
generate_params[GenParams.MAX_NEW_TOKENS] = 1
requests = [request.args for request in requests]
results: List[LogLikelihoodResult] = []
# Note: We're not using batching due to (current) indeterminism of loglikelihood values when sending batch of requests
for request in tqdm(
requests,
desc="Running loglikelihood_rolling function ...",
):
context, continuation = request
try:
response = self.model.generate_text(
prompt=context, params=generate_params, raw_response=True
)
except Exception as exp:
eval_logger.error("Error while model generate text.")
raise exp
log_likelihood_response = self._get_log_likelihood(
response["results"][0]["input_tokens"], []
)
results.append(log_likelihood_response)
self.cache_hook.add_partial(
"loglikelihood_rolling",
(context, continuation),
log_likelihood_response.log_likelihood,
)
return cast(List[Tuple[float, bool]], results)
@property
def tokenizer_name(self) -> str:
return ""
def apply_chat_template(
self, chat_history: List[Dict[str, str]]
) -> List[Dict[str, str]]:
# A hack similar from api_model to allow encoding for cache
return JsonChatStr(json.dumps(chat_history))
|