# Tools/scene_extractor_tool.py
import torch

from smolagents import tool

# llm_utils is assumed to expose a loaded Hugging Face tokenizer (with a chat
# template) and a generate_completion() wrapper around the model's generate().
from llm_utils import tokenizer, generate_completion


@tool
def extract_scene(context: str) -> str:
""" | |
Identify the key visual scene in one vivid paragraph (≤77 tokens). | |
""" | |
prompt = f""" | |
You are a visual scene extractor. Given the text below, | |
produce one vivid paragraph (max 77 tokens) describing the key visual moment. Return only that paragraph. | |
Text: | |
\"\"\" | |
{context} | |
\"\"\" | |
Visual description:""" | |
    # Build the chat-formatted prompt tensors; device placement is assumed to
    # be handled inside generate_completion.
    inputs = tokenizer.apply_chat_template(
        [{"role": "system", "content": "Extract a single visual scene."},
         {"role": "user", "content": prompt}],
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )
    # Greedy decoding; no gradients are needed for inference.
    with torch.no_grad():
        outputs = generate_completion(
            inputs,
            max_new_tokens=100,
            temperature=0.0,
            do_sample=False,
        )

    # Keep only the newly generated tokens, dropping the prompt portion.
    plen = inputs["input_ids"].shape[-1]
    gen_ids = outputs[0][plen:]
    raw = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

    # Enforce the 77-token limit, approximated here by whitespace word count.
    words = raw.split()
    if len(words) > 77:
        raw = " ".join(words[:77])
    return raw
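

# --- Example usage -----------------------------------------------------------
# A minimal sketch of calling the tool directly (smolagents Tool instances are
# callable). The sample passage is illustrative only, and running this assumes
# llm_utils has already loaded its model and tokenizer.
if __name__ == "__main__":
    sample = (
        "The storm broke just as the fishing boats reached the harbour, "
        "their lanterns swinging against a black wall of rain."
    )
    print(extract_scene(sample))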