# tikslop/server/llm_utils.py
"""
LLM-related utilities, templates, and text generation functions.
"""
import asyncio
import logging
from typing import Optional, Dict, Any
from huggingface_hub import InferenceClient
from .api_config import HF_TOKEN, TEXT_MODEL
logger = logging.getLogger(__name__)
# LLM prompt templates
SEARCH_VIDEO_PROMPT_TEMPLATE = """# Instruction
Your response MUST be a YAML object containing a title and a description, consistent with what we can find on a video sharing platform.
Format your YAML response with only those fields: "title" (a short string) and "description" (string caption of the scene). Do not add any other field.
In the description field, describe in a very concise way the visuals of the first shot (first scene), e.g. "<STYLE>, medium close-up shot, high angle view. In the foreground a <OPTIONAL AGE> <OPTIONAL GENDER> <CHARACTERS> <ACTIONS>. In the background <DESCRIBE LOCATION, BACKGROUND CHARACTERS, OBJECTS ETC>. The scene is lit by <LIGHTING> <WEATHER>". This is just an example! You MUST replace the <TAGS>!
Don't forget to replace <STYLE> etc. with the actual content!
For the style, be creative; for instance you can use anything like "documentary footage", "japanese animation", "movie scene", "tv series", "tv show", "security footage" etc.
If the user asks for something specific as a style (e.g. "movie screencap", "movie scene", "documentary footage", "animation"), use it.
Keep it minimalist but still descriptive, don't use bullet points, use simple words, and focus on the essentials: style (cinematic, documentary footage, 3D rendering..), camera modes and angles, characters, age, gender, action, location, lighting, country, costume, time, weather, textures, color palette etc. Write about 80 words, and use between 2 and 3 sentences.
The most important part is to describe the actions and movements in the scene, so don't forget that!
Don't describe sound, so never say things like "atmospheric music playing in the background".
Instead, describe the visual elements we can see in the background, and be precise (if there are cars, objects, people, bricks, birds, clouds, trees, leaves or grass, say so).
Make the result unique and different from previous search results. ONLY RETURN YAML WITH ENGLISH CONTENT, NOT CHINESE - DO NOT ADD ANY OTHER COMMENT!
# Context
This is attempt {current_attempt}.
# Input
Describe the first scene/shot for: "{query}".
# Output
```yaml
title: \""""
GENERATE_CAPTION_PROMPT_TEMPLATE = """Generate a detailed story for a video named: "{title}"
Visual description of the video: {description}.
Instructions: Write the story summary, including the plot, the action, and what should happen.
Make it around 200-300 words long.
A video can be anything: a tutorial, a webcam feed, a trailer, a movie, a live stream, etc."""
SIMULATE_VIDEO_FIRST_PROMPT_TEMPLATE = """You are tasked with evolving the narrative for a video titled: "{original_title}"
Original description:
{original_description}
{chat_section}
Instructions:
1. Imagine the next logical scene or development that would follow the current description.
2. Consider the video context and recent events.
3. Create a natural progression from previous clips.
4. Take user suggestions (chat messages) into account in the scene.
5. IMPORTANT: if viewers have shared messages, give their input priority to guide your story, and incorporate relevant suggestions or reactions into your narrative evolution.
6. Keep visual consistency with previous clips (in most cases you should repeat the same exact description of the location, characters etc. and only change a few elements. If this is a webcam scenario, don't touch the camera orientation or focus).
7. Return ONLY the caption text, no additional formatting or explanation.
8. Write in English, about 200 words.
9. Keep the visual style consistent, and the content as well (repeat the style, characters, locations, appearance etc. from the previous description, when it makes sense).
10. Your caption must describe the visual elements of the scene in detail, including: camera angle and focus, people's appearance, age, look, costumes, clothes, the location's visual characteristics and geometry, lighting, action, objects, weather and textures.
11. Please write in the same style as the original description, keeping things brief.
Remember to follow what users said in the chat history!
Now, you must write down the new scene description (don't write a long story! write a concise description!):"""
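
# Hypothetical sketch of how the {chat_section} placeholder used by the simulation
# templates could be assembled from recent viewer messages; the real helper lives
# elsewhere in the server code and may differ.
def _example_chat_section(messages: list) -> str:
    """Render chat messages as a text block suitable for {chat_section} (example helper)."""
    if not messages:
        return ""
    lines = [f"{m.get('username', 'anonymous')}: {m.get('data', '')}" for m in messages]
    return "Recent chat messages from viewers:\n" + "\n".join(lines)
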
SIMULATE_VIDEO_CONTINUE_PROMPT_TEMPLATE = """You are tasked with continuing to evolve the narrative for a video titled: "{original_title}"
Original description:
{original_description}
Condensed history of scenes so far:
{condensed_history}
Current description (most recent scene):
{current_description}
{chat_section}
Instructions:
1. Imagine the next logical scene or development that would follow the current description.
2. Consider the video context and recent events.
3. Create a natural progression from previous clips.
4. Take user suggestions (chat messages) into account in the scene.
5. IMPORTANT: if viewers have shared messages, give their input priority to guide your story, and incorporate relevant suggestions or reactions into your narrative evolution.
6. Keep visual consistency with previous clips (in most cases you should repeat the same exact description of the location, characters etc. and only change a few elements. If this is a webcam scenario, don't touch the camera orientation or focus).
7. Return ONLY the caption text, no additional formatting or explanation.
8. Write in English, about 200 words.
9. Keep the visual style consistent, and the content as well (repeat the style, characters, locations, appearance etc. from the previous description, when it makes sense).
10. Your caption must describe the visual elements of the scene in detail, including: camera angle and focus, people's appearance, age, look, costumes, clothes, the location's visual characteristics and geometry, lighting, action, objects, weather and textures.
11. Please write in the same style as the original description, keeping things brief.
Remember to follow what users said in the chat history!
Now, you must write down the new scene description (don't write a long story! write a concise description!):"""
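
# Hypothetical sketch of how the {condensed_history} placeholder above could be
# produced from previously generated captions; the real summarization step lives
# elsewhere in the server code and may be smarter than this.
def _example_condensed_history(previous_captions: list, max_chars: int = 1000) -> str:
    """Join previous scene captions, keeping only the most recent ones (example helper)."""
    condensed = ""
    for caption in reversed(previous_captions):
        entry = f"- {caption}\n"
        if len(condensed) + len(entry) > max_chars:
            break
        condensed = entry + condensed
    return condensed.strip()
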
GENERATE_CLIP_PROMPT_TEMPLATE = """# Context and task
Please write the caption for a new clip.
# Instructions
1. Consider the video context and recent events.
2. Create a natural progression from previous clips.
3. Take user suggestions (chat messages) into account in the scene.
4. Don't generate hateful, political, violent or sexual content.
5. Keep visual consistency with previous clips (in most cases you should repeat the same exact description of the location, characters etc. and only change a few elements. If this is a webcam scenario, don't touch the camera orientation or focus).
6. Return ONLY the caption text, no additional formatting or explanation.
7. Write in English, about 200 words.
8. Keep the visual style consistent, and the content as well (repeat the style, characters, locations, appearance etc. across scenes, when it makes sense).
9. Your caption must describe the visual elements of the scene in detail, including: camera angle and focus, people's appearance, age, look, costumes, clothes, the location's visual characteristics and geometry, lighting, action, objects, weather and textures.
# Examples
Here is a demo scenario, with fake data:
{{"time": "2024-11-29T13:36:15Z", "event": "new_stream_clip", "caption": "webcam view of a beautiful park, squirrels are playing in the lush grass, blablabla etc... (rest omitted for brevity)"}}
{{"time": "2024-11-29T13:36:20Z", "event": "new_chat_message", "username": "MonkeyLover89", "data": "hi"}}
{{"time": "2024-11-29T13:36:25Z", "event": "new_chat_message", "username": "MonkeyLover89", "data": "more squirrels plz"}}
{{"time": "2024-11-29T13:36:26Z", "event": "new_stream_clip", "caption": "webcam view of a beautiful park, a lot of squirrels are playing in the lush grass, blablabla etc... (rest omitted for brevity)"}}
# Real scenario and data
We are inside a video titled "{title}"
The video is described by: "{description}".
Here is a summary of the {event_count} most recent events:
{events_json}
# Your response
Your caption:"""
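
# Illustrative sketch only: how {events_json} and {event_count} above are assumed to
# be produced from recent event dicts shaped like the demo scenario (the real event
# log lives elsewhere in the server code).
def _example_events_json(events: list) -> tuple:
    """Serialize events to one JSON object per line and return (events_json, event_count)."""
    import json  # local import to keep this example helper self-contained
    lines = [json.dumps(event) for event in events]
    return "\n".join(lines), len(lines)
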
def get_inference_client(llm_config: Optional[dict] = None) -> InferenceClient:
    """
    Get an InferenceClient configured with the provided LLM settings.

    Priority order for API keys:
    1. Provider-specific API key (if provided)
    2. User's HF token (if provided)
    3. Server's HF token (only for built-in provider)
    4. Raise exception if no valid key is available
    """
    if not llm_config:
        if HF_TOKEN:
            return InferenceClient(
                model=TEXT_MODEL,
                token=HF_TOKEN
            )
        else:
            raise ValueError("Built-in provider is not available. Server HF_TOKEN is not configured.")

    provider = llm_config.get('provider', '').lower()
    #logger.info(f"provider = {provider}")

    # If no provider or model specified, use default
    if not provider or provider == 'built-in':
        if HF_TOKEN:
            return InferenceClient(
                model=TEXT_MODEL,
                token=HF_TOKEN
            )
        else:
            raise ValueError("Built-in provider is not available. Server HF_TOKEN is not configured.")

    model = llm_config.get('model', '')
    user_provider_api_key = llm_config.get('api_key', '')  # Provider-specific API key
    user_hf_token = llm_config.get('hf_token', '')  # User's HF token

    try:
        # Case 1: Use a provider with a provider-specific API key if available
        # This mode is currently hidden in the Flutter UI (we don't ask for provider-specific keys yet)
        # but it is implemented here so that we don't forget it later
        if user_provider_api_key:
            return InferenceClient(
                provider=provider,
                model=model,
                api_key=user_provider_api_key
            )
        # Case 2: Use a provider with the user's HF token if available
        elif user_hf_token:
            return InferenceClient(
                provider=provider,
                model=model,
                token=user_hf_token
            )
        else:
            raise ValueError(f"No API key provided for provider '{provider}'. Please provide either a valid {provider} API key or your Hugging Face API key.")
    except ValueError:
        # Re-raise ValueError for missing API keys
        raise
    except Exception as e:
        logger.error(f"Error creating InferenceClient for provider '{provider}' and model '{model}': {e}")
        # Re-raise all other exceptions
        raise
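
# Illustrative only: the llm_config shapes the function above is written to accept.
# The provider, model and key values below are placeholders, not recommendations.
def _example_llm_configs() -> list:
    """Return example llm_config values (including None) in the shapes get_inference_client() accepts."""
    return [
        None,                      # no config: built-in provider, requires the server HF_TOKEN
        {"provider": "built-in"},  # explicit built-in provider
        {"provider": "novita", "model": "meta-llama/Llama-3.1-8B-Instruct", "hf_token": "hf_xxx"},  # user's HF token routed through a provider
        {"provider": "novita", "model": "meta-llama/Llama-3.1-8B-Instruct", "api_key": "sk-xxx"},   # provider-specific key (not yet exposed in the UI)
    ]
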
async def generate_text(prompt: str, llm_config: Optional[dict] = None,
                        max_new_tokens: int = 200, temperature: float = 0.7,
                        model_override: Optional[str] = None) -> str:
    """
    Helper method to generate text using the appropriate client and configuration.

    Tries chat_completion first (modern standard), falls back to text_generation.

    Args:
        prompt: The prompt to generate text from
        llm_config: Optional LLM configuration dict
        max_new_tokens: Maximum number of new tokens to generate
        temperature: Temperature for generation
        model_override: Optional model to use instead of the one in llm_config

    Returns:
        Generated text string
    """
    # Add game master prompt if provided
    if llm_config and llm_config.get('game_master_prompt'):
        game_master_prompt = llm_config['game_master_prompt'].strip()
        if game_master_prompt:
            prompt = f"Important contextual rules: {game_master_prompt}\n\n{prompt}"

    # Get the appropriate client
    client = get_inference_client(llm_config)

    # Determine the model to use
    if model_override:
        model_to_use = model_override
    elif llm_config:
        model_to_use = llm_config.get('model', TEXT_MODEL)
    else:
        model_to_use = TEXT_MODEL

    # Try chat_completion first (modern standard, more widely supported)
    try:
        messages = [{"role": "user", "content": prompt}]

        if llm_config and llm_config.get('provider') != 'huggingface':
            # For third-party providers
            completion = await asyncio.get_event_loop().run_in_executor(
                None,
                lambda: client.chat.completions.create(
                    messages=messages,
                    max_tokens=max_new_tokens,
                    temperature=temperature
                )
            )
        else:
            # For HuggingFace models, specify the model
            completion = await asyncio.get_event_loop().run_in_executor(
                None,
                lambda: client.chat.completions.create(
                    model=model_to_use,
                    messages=messages,
                    max_tokens=max_new_tokens,
                    temperature=temperature
                )
            )

        # Extract the generated text from the chat completion response
        return completion.choices[0].message.content

    except Exception as e:
        error_message = str(e).lower()

        # Check if the error is related to task compatibility or API not supported
        if ("not supported for task" in error_message or
                "conversational" in error_message or
                "chat" in error_message):
            logger.info(f"chat_completion not supported, falling back to text_generation: {e}")

            # Fall back to text_generation API
            try:
                if llm_config and llm_config.get('provider') != 'huggingface':
                    # For third-party providers
                    response = await asyncio.get_event_loop().run_in_executor(
                        None,
                        lambda: client.text_generation(
                            prompt,
                            max_new_tokens=max_new_tokens,
                            temperature=temperature
                        )
                    )
                else:
                    # For HuggingFace models, specify the model
                    response = await asyncio.get_event_loop().run_in_executor(
                        None,
                        lambda: client.text_generation(
                            prompt,
                            model=model_to_use,
                            max_new_tokens=max_new_tokens,
                            temperature=temperature
                        )
                    )
                return response
            except Exception as text_error:
                logger.error(f"Both chat_completion and text_generation failed: {text_error}")
                raise text_error
        else:
            # Re-raise the original error if it's not a task compatibility issue
            logger.error(f"chat_completion failed with non-compatibility error: {e}")
            raise e
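
# Minimal manual smoke test (illustrative only): formats one of the templates above
# and runs it through generate_text(). Assumes the server HF_TOKEN is configured;
# otherwise get_inference_client() raises ValueError.
if __name__ == "__main__":
    async def _demo() -> None:
        prompt = GENERATE_CAPTION_PROMPT_TEMPLATE.format(
            title="Squirrel park webcam",
            description="webcam view of a beautiful park, squirrels are playing in the lush grass",
        )
        print(await generate_text(prompt, max_new_tokens=250, temperature=0.7))

    asyncio.run(_demo())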