LVNet / src /run_gpt.py

jongwoopark7978

chore: add project files

54216bc 5 months ago

5.74 kB

	import requests
	from time import sleep
	from rp import (
	encode_image_to_base64,
	is_image,
	lazy_par_map,
	fansi_print,
	format_current_date,
	)

	import rp


	def _get_gpt_request_json(image, text, max_tokens, model, temperature, dataset="egoschema"):

	options = {
	"temperature": temperature,
	"max_tokens": max_tokens,
	}
	options = {key: value for key, value in options.items() if value is not None}

	sysprompt = ""
	if dataset == "egoschema":
	sysprompt = 'You are a language model designed to analyze video content based on provided captions. These captions summarize key information from multiple frames. In the captions, \'C\' stands for the cameraman. \nYour task is to use these captions to answer questions about the video. For each question, you will choose the most accurate answer from five options, relying only on the given captions without any external knowledge. Provide your response following this specified JSON format.\n{"selection": write your selection number here, "reasons": state the reasons for your selection less than 30 words.}. This is one example output format. {"selection": 3, "reasons": The primary objective and focus within the video content is on cleaning dishes}'
	else:
	sysprompt = 'You are a language model designed to analyze video content based on provided captions. These captions summarize key information from multiple frames.\nYour task is to use these captions to answer questions about the video. For each question, you will choose the most accurate answer from five options, relying only on the given captions without any external knowledge. Provide your response following this specified JSON format.\n{"selection": write your selection number here, "reasons": state the reasons for your selection less than 30 words.}. This is one example output format. {"selection": 3, "reasons": The primary objective and focus within the video content is on cleaning dishes}'

	if image == None:
	context = {"role": "user",
	"content": [{"type": "text","text": text,},],
	}

	else:
	base64_image = encode_image_to_base64(image)
	context = {"role": "user",
	"content": [{"type": "text", "text": text,},
	{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},},
	],
	}

	return options \| {
	"model": model,
	"messages": [
	{"role": "system", "content": sysprompt},
	context,
	],
	}

	def _run_gpt(image, text, max_tokens, model, temperature, api_key):
	"""Processes a single text"""

	headers = {
	"Content-Type": "application/json",
	"Authorization": f"Bearer {api_key}",
	}

	request_json = _get_gpt_request_json(image, text, max_tokens, model, temperature)

	response = requests.post(
	"https://api.openai.com/v1/chat/completions",
	headers=headers,
	json=request_json,
	)

	return response.json()["choices"][0]["message"]["content"]

	def run_gpt(
	images=None,
	texts="",
	api_keys=None,
	max_tokens=2000,
	model="gpt-4-vision-preview",
	temperature=None,
	num_threads=10,
	backoff_time=1 * 60,
	silent=False,
	dataset="egoschema",
	verbose=True,
	):
	"""
	Asks GPT a question about an text, returning a string.
	If given multiple texts, will process them in parallel lazily (retuning a generator instead)

	Args:
	text (str, optional): The question we ask GPT
	max_tokens (int, optional): Maximum tokens in the response
	api_key (str, list[str], optional): If specified, overwrites the default openai api_key
	Can be a string or a list of strings
	backoff_time (float): number of seconds to wait if there's an error
	num_threads (int): number of threads if you'd like to run in parallel. 1 thread means it runs sequentially
	silent (bool): if True, won't report errors
	temperature (float, optional)
	model (str)

	Returns:
	(str or generator): GPT3.5, GPT4's response (or a lazy generator of responses if given a list of texts)
	"""
	assert api_keys is not None, "Please provide api_keys for GPT calls"
	if isinstance(api_keys, str):
	api_keys = [api_keys]
	if isinstance(texts, str):
	texts = [texts]
	if images is None: images = [None]
	elif isinstance(images, str) or is_image(images): images = [images]

	assert len(images)==len(texts) or len(images)==1 or len(texts)==1

	if len(texts)==1: texts = texts * len(images)
	if len(images)==1: images = list(images) * len(texts)

	assert len(images) == len(texts)

	api_key_index = 0

	def run(args):
	image, text = args

	while True:
	nonlocal api_key_index
	api_key_index += 1
	api_key_index %= len(api_keys)

	api_key = api_keys[api_key_index]

	try:
	output = _run_gpt(image, text, max_tokens, model, temperature, api_key)
	if verbose: fansi_print("output: " + str(output), "yellow", "bold")
	return output
	except Exception as e:
	if not silent:
	# rp.print_stack_trace()
	# rp.print_verbose_stack_trace()
	fansi_print(
	"Error (" + format_current_date() + "): " + repr(e),
	"red",
	"bold",
	)
	sleep(backoff_time)

	return lazy_par_map(
	run,
	zip(images, texts),
	num_threads=num_threads,
	)