# tikslop/server/llm_utils.py
"""
LLM-related utilities, templates, and text generation functions.
"""
import asyncio
import logging
from typing import Optional, Dict, Any
from huggingface_hub import InferenceClient
from .api_config import HF_TOKEN, TEXT_MODEL
logger = logging.getLogger(__name__)
# LLM prompt templates
SEARCH_VIDEO_PROMPT_TEMPLATE = """# Instruction
Your response MUST be a YAML object containing a title and a description, consistent with what we can find on a video sharing platform.
Format your YAML response with only those fields: "title" (a short string) and "description" (string caption of the scene). Do not add any other field.
In the description field, describe in a very concise way the visuals of the first shot (first scene), e.g. "<STYLE>, medium close-up shot, high angle view. In the foreground a <OPTIONAL AGE> <OPTIONAL GENDER> <CHARACTERS> <ACTIONS>. In the background <DESCRIBE LOCATION, BACKGROUND CHARACTERS, OBJECTS ETC>. The scene is lit by <LIGHTING> <WEATHER>". This is just an example! You MUST replace the <TAGS>!
Don't forget to replace <STYLE> etc. with the actual content!
For the style, be creative; for instance you can use anything like "documentary footage", "japanese animation", "movie scene", "tv series", "tv show", "security footage" etc.
If the user asks for something specific as a style (e.g. "movie screencap", "movie scene", "documentary footage", "animation"), use it.
Keep it minimalist but still descriptive, don't use bullet points, use simple words, and focus on the essentials: style (cinematic, documentary footage, 3D rendering..), camera modes and angles, characters, age, gender, action, location, lighting, country, costume, time, weather, textures, color palette etc. Write about 80 words, and use between 2 and 3 sentences.
The most important part is to describe the actions and movements in the scene, so don't forget that!
Don't describe sound, so never say things like "atmospheric music playing in the background".
Instead, describe the visual elements we can see in the background, and be precise (if there are cars, objects, people, bricks, birds, clouds, trees, leaves or grass, say so).
Make the result unique and different from previous search results. ONLY RETURN YAML WITH ENGLISH CONTENT, NOT CHINESE - DO NOT ADD ANY OTHER COMMENT!
# Context
This is attempt {current_attempt}.
# Input
Describe the first scene/shot for: "{query}".
# Output
```yaml
title: \""""
GENERATE_CAPTION_PROMPT_TEMPLATE = """Generate a detailed story for a video named: "{title}"
Visual description of the video: {description}.
Instructions: Write the story summary, including the plot, the action, and what should happen.
Make it around 200-300 words long.
A video can be anything: a tutorial, a webcam feed, a trailer, a movie, a live stream, etc."""
SIMULATE_VIDEO_FIRST_PROMPT_TEMPLATE = """You are tasked with evolving the narrative for a video titled: "{original_title}"
Original description:
{original_description}
{chat_section}
Instructions:
1. Imagine the next logical scene or development that would follow the current description.
2. Consider the video context and recent events.
3. Create a natural progression from previous clips.
4. Take user suggestions (chat messages) into account in the scene.
5. IMPORTANT: if viewers have shared messages, give their input priority to guide your story, and incorporate relevant suggestions or reactions into your narrative evolution.
6. Keep visual consistency with previous clips (in most cases you should repeat the same exact description of the location, characters etc. and only change a few elements. If this is a webcam scenario, don't touch the camera orientation or focus).
7. Return ONLY the caption text, no additional formatting or explanation.
8. Write in English, about 200 words.
9. Keep the visual style consistent, and the content as well (repeat the style, characters, locations, appearance etc. from the previous description, when it makes sense).
10. Your caption must describe the visual elements of the scene in detail, including: camera angle and focus, people's appearance, age, look, costumes, clothes, the location's visual characteristics and geometry, lighting, action, objects, weather and textures.
11. Please write in the same style as the original description, keeping things brief.
Remember to follow what users said in the chat history!
Now, you must write down the new scene description (don't write a long story! write a concise description!):"""
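
# Hypothetical sketch of how the {chat_section} placeholder used by the simulation
# templates could be assembled from recent viewer messages; the real helper lives
# elsewhere in the server code and may differ.
def _example_chat_section(messages: list) -> str:
    """Render chat messages as a text block suitable for {chat_section} (example helper)."""
    if not messages:
        return ""
    lines = [f"{m.get('username', 'anonymous')}: {m.get('data', '')}" for m in messages]
    return "Recent chat messages from viewers:\n" + "\n".join(lines)
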
SIMULATE_VIDEO_CONTINUE_PROMPT_TEMPLATE = """You are tasked with continuing to evolve the narrative for a video titled: "{original_title}"
Original description:
{original_description}
Condensed history of scenes so far:
{condensed_history}
Current description (most recent scene):
{current_description}
{chat_section}
Instructions:
1. Imagine the next logical scene or development that would follow the current description.
2. Consider the video context and recent events.
3. Create a natural progression from previous clips.
4. Take user suggestions (chat messages) into account in the scene.
5. IMPORTANT: if viewers have shared messages, give their input priority to guide your story, and incorporate relevant suggestions or reactions into your narrative evolution.
6. Keep visual consistency with previous clips (in most cases you should repeat the same exact description of the location, characters etc. and only change a few elements. If this is a webcam scenario, don't touch the camera orientation or focus).
7. Return ONLY the caption text, no additional formatting or explanation.
8. Write in English, about 200 words.
9. Keep the visual style consistent, and the content as well (repeat the style, characters, locations, appearance etc. from the previous description, when it makes sense).
10. Your caption must describe the visual elements of the scene in detail, including: camera angle and focus, people's appearance, age, look, costumes, clothes, the location's visual characteristics and geometry, lighting, action, objects, weather and textures.
11. Please write in the same style as the original description, keeping things brief.
Remember to follow what users said in the chat history!
Now, you must write down the new scene description (don't write a long story! write a concise description!):"""
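
# Hypothetical sketch of how the {condensed_history} placeholder above could be
# produced from previously generated captions; the real summarization step lives
# elsewhere in the server code and may be smarter than this.
def _example_condensed_history(previous_captions: list, max_chars: int = 1000) -> str:
    """Join previous scene captions, keeping only the most recent ones (example helper)."""
    condensed = ""
    for caption in reversed(previous_captions):
        entry = f"- {caption}\n"
        if len(condensed) + len(entry) > max_chars:
            break
        condensed = entry + condensed
    return condensed.strip()
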
GENERATE_CLIP_PROMPT_TEMPLATE = """# Context and task
Please write the caption for a new clip.
# Instructions
1. Consider the video context and recent events.
2. Create a natural progression from previous clips.
3. Take user suggestions (chat messages) into account in the scene.
4. Don't generate hateful, political, violent or sexual content.
5. Keep visual consistency with previous clips (in most cases you should repeat the same exact description of the location, characters etc. and only change a few elements. If this is a webcam scenario, don't touch the camera orientation or focus).
6. Return ONLY the caption text, no additional formatting or explanation.
7. Write in English, about 200 words.
8. Keep the visual style consistent, and the content as well (repeat the style, characters, locations, appearance etc. across scenes, when it makes sense).
9. Your caption must describe the visual elements of the scene in detail, including: camera angle and focus, people's appearance, age, look, costumes, clothes, the location's visual characteristics and geometry, lighting, action, objects, weather and textures.
# Examples
Here is a demo scenario, with fake data:
{{"time": "2024-11-29T13:36:15Z", "event": "new_stream_clip", "caption": "webcam view of a beautiful park, squirrels are playing in the lush grass, blablabla etc... (rest omitted for brevity)"}}
{{"time": "2024-11-29T13:36:20Z", "event": "new_chat_message", "username": "MonkeyLover89", "data": "hi"}}
{{"time": "2024-11-29T13:36:25Z", "event": "new_chat_message", "username": "MonkeyLover89", "data": "more squirrels plz"}}
{{"time": "2024-11-29T13:36:26Z", "event": "new_stream_clip", "caption": "webcam view of a beautiful park, a lot of squirrels are playing in the lush grass, blablabla etc... (rest omitted for brevity)"}}
# Real scenario and data
We are inside a video titled "{title}"
The video is described by: "{description}".
Here is a summary of the {event_count} most recent events:
{events_json}
# Your response
Your caption:"""
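
# Illustrative sketch only: how {events_json} and {event_count} above are assumed to
# be produced from recent event dicts shaped like the demo scenario (the real event
# log lives elsewhere in the server code).
def _example_events_json(events: list) -> tuple:
    """Serialize events to one JSON object per line and return (events_json, event_count)."""
    import json  # local import to keep this example helper self-contained
    lines = [json.dumps(event) for event in events]
    return "\n".join(lines), len(lines)
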
def get_inference_client(llm_config: Optional[dict] = None) -> InferenceClient:
    """
    Get an InferenceClient configured with the provided LLM settings.

    Priority order for API keys:
    1. Provider-specific API key (if provided)
    2. User's HF token (if provided)
    3. Server's HF token (only for built-in provider)
    4. Raise exception if no valid key is available
    """
    if not llm_config:
        if HF_TOKEN:
            return InferenceClient(
                model=TEXT_MODEL,
                token=HF_TOKEN
            )
        else:
            raise ValueError("Built-in provider is not available. Server HF_TOKEN is not configured.")

    provider = llm_config.get('provider', '').lower()
    #logger.info(f"provider = {provider}")

    # If no provider or model specified, use default
    if not provider or provider == 'built-in':
        if HF_TOKEN:
            return InferenceClient(
                model=TEXT_MODEL,
                token=HF_TOKEN
            )
        else:
            raise ValueError("Built-in provider is not available. Server HF_TOKEN is not configured.")

    model = llm_config.get('model', '')
    user_provider_api_key = llm_config.get('api_key', '')  # Provider-specific API key
    user_hf_token = llm_config.get('hf_token', '')  # User's HF token

    try:
        # Case 1: Use a provider with a provider-specific API key if available
        # This mode is currently hidden in the Flutter UI (we don't ask for provider-specific keys yet)
        # but it is implemented here so that we don't forget it later
        if user_provider_api_key:
            return InferenceClient(
                provider=provider,
                model=model,
                api_key=user_provider_api_key
            )
        # Case 2: Use a provider with the user's HF token if available
        elif user_hf_token:
            return InferenceClient(
                provider=provider,
                model=model,
                token=user_hf_token
            )
        else:
            raise ValueError(f"No API key provided for provider '{provider}'. Please provide either a valid {provider} API key or your Hugging Face API key.")
    except ValueError:
        # Re-raise ValueError for missing API keys
        raise
    except Exception as e:
        logger.error(f"Error creating InferenceClient for provider '{provider}' and model '{model}': {e}")
        # Re-raise all other exceptions
        raise
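
# Illustrative only: the llm_config shapes the function above is written to accept.
# The provider, model and key values below are placeholders, not recommendations.
def _example_llm_configs() -> list:
    """Return example llm_config values (including None) in the shapes get_inference_client() accepts."""
    return [
        None,                      # no config: built-in provider, requires the server HF_TOKEN
        {"provider": "built-in"},  # explicit built-in provider
        {"provider": "novita", "model": "meta-llama/Llama-3.1-8B-Instruct", "hf_token": "hf_xxx"},  # user's HF token routed through a provider
        {"provider": "novita", "model": "meta-llama/Llama-3.1-8B-Instruct", "api_key": "sk-xxx"},   # provider-specific key (not yet exposed in the UI)
    ]
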
async def generate_text(prompt: str, llm_config: Optional[dict] = None,
                        max_new_tokens: int = 200, temperature: float = 0.7,
                        model_override: Optional[str] = None) -> str:
    """
    Helper method to generate text using the appropriate client and configuration.

    Tries chat_completion first (modern standard), falls back to text_generation.

    Args:
        prompt: The prompt to generate text from
        llm_config: Optional LLM configuration dict
        max_new_tokens: Maximum number of new tokens to generate
        temperature: Temperature for generation
        model_override: Optional model to use instead of the one in llm_config

    Returns:
        Generated text string
    """
    # Add game master prompt if provided
    if llm_config and llm_config.get('game_master_prompt'):
        game_master_prompt = llm_config['game_master_prompt'].strip()
        if game_master_prompt:
            prompt = f"Important contextual rules: {game_master_prompt}\n\n{prompt}"

    # Get the appropriate client
    client = get_inference_client(llm_config)

    # Determine the model to use
    if model_override:
        model_to_use = model_override
    elif llm_config:
        model_to_use = llm_config.get('model', TEXT_MODEL)
    else:
        model_to_use = TEXT_MODEL

    # Try chat_completion first (modern standard, more widely supported)
    try:
        messages = [{"role": "user", "content": prompt}]

        if llm_config and llm_config.get('provider') != 'huggingface':
            # For third-party providers
            completion = await asyncio.get_event_loop().run_in_executor(
                None,
                lambda: client.chat.completions.create(
                    messages=messages,
                    max_tokens=max_new_tokens,
                    temperature=temperature
                )
            )
        else:
            # For HuggingFace models, specify the model
            completion = await asyncio.get_event_loop().run_in_executor(
                None,
                lambda: client.chat.completions.create(
                    model=model_to_use,
                    messages=messages,
                    max_tokens=max_new_tokens,
                    temperature=temperature
                )
            )

        # Extract the generated text from the chat completion response
        return completion.choices[0].message.content

    except Exception as e:
        error_message = str(e).lower()

        # Check if the error is related to task compatibility or API not supported
        if ("not supported for task" in error_message or
                "conversational" in error_message or
                "chat" in error_message):
            logger.info(f"chat_completion not supported, falling back to text_generation: {e}")

            # Fall back to text_generation API
            try:
                if llm_config and llm_config.get('provider') != 'huggingface':
                    # For third-party providers
                    response = await asyncio.get_event_loop().run_in_executor(
                        None,
                        lambda: client.text_generation(
                            prompt,
                            max_new_tokens=max_new_tokens,
                            temperature=temperature
                        )
                    )
                else:
                    # For HuggingFace models, specify the model
                    response = await asyncio.get_event_loop().run_in_executor(
                        None,
                        lambda: client.text_generation(
                            prompt,
                            model=model_to_use,
                            max_new_tokens=max_new_tokens,
                            temperature=temperature
                        )
                    )
                return response
            except Exception as text_error:
                logger.error(f"Both chat_completion and text_generation failed: {text_error}")
                raise text_error
        else:
            # Re-raise the original error if it's not a task compatibility issue
            logger.error(f"chat_completion failed with non-compatibility error: {e}")
            raise e
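
# Minimal manual smoke test (illustrative only): formats one of the templates above
# and runs it through generate_text(). Assumes the server HF_TOKEN is configured;
# otherwise get_inference_client() raises ValueError.
if __name__ == "__main__":
    async def _demo() -> None:
        prompt = GENERATE_CAPTION_PROMPT_TEMPLATE.format(
            title="Squirrel park webcam",
            description="webcam view of a beautiful park, squirrels are playing in the lush grass",
        )
        print(await generate_text(prompt, max_new_tokens=250, temperature=0.7))

    asyncio.run(_demo())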