# Tools/scene_extractor_tool.py
from typing import Dict

from smolagents import tool
from llm_utils import tokenizer, generate_completion
import torch

# Upper bound on the length of the returned description, counted in tokens of
# the `tokenizer` imported from llm_utils.
# NOTE(review): 77 presumably mirrors a downstream text-encoder context limit;
# if that encoder uses a different tokenizer its counts will differ — confirm.
_MAX_SCENE_TOKENS = 77


@tool
def extract_scene(context: str) -> str:
    """
    Identify the key visual scene in one vivid paragraph (≤77 tokens).

    Args:
        context: The source text from which the key visual moment is extracted.

    Returns:
        A single paragraph describing the scene, truncated to at most 77
        generated tokens.
    """
    prompt = f"""
You are a visual scene extractor. Given the text below, produce one vivid paragraph (max 77 tokens) describing the key visual moment. Return only that paragraph.

Text:
\"\"\"
{context}
\"\"\"

Visual description:"""

    messages = [
        {"role": "system", "content": "Extract a single visual scene."},
        {"role": "user", "content": prompt},
    ]
    # NOTE(review): relies on the llm_utils tokenizer exposing `.device`;
    # stock Hugging Face tokenizers do not — confirm against llm_utils.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(tokenizer.device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = generate_completion(
            inputs,
            max_new_tokens=100,
            temperature=0.0,
            do_sample=False,
        )

    # The output sequence begins with the prompt tokens; keep only the newly
    # generated continuation, and cap it at the token limit *before* decoding
    # so the limit is measured in tokens rather than whitespace-split words.
    prompt_len = inputs["input_ids"].shape[-1]
    gen_ids = outputs[0][prompt_len:prompt_len + _MAX_SCENE_TOKENS]

    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()