import os
import re
from typing import Generator

from PIL import Image
from smolagents import ToolCallingAgent, OpenAIServerModel, ActionStep

import tools
from configs import settings
from prompt import image_to_text_prompt, video_to_text_prompt
from rag import VideoRAG
class VideoChatbot:
    """Chat front end that connects a ``VideoRAG`` instance to a tool-calling agent.

    The agent is built with the video download tool plus the tools created
    from the ``VideoRAG`` instance. Every ``chat`` call runs the agent with
    ``reset=False``; call ``clear`` to wipe the agent and RAG state.
    """

    # Attachment suffixes recognised by ``chat``; compared case-insensitively.
    _IMAGE_SUFFIXES = ('.jpg', '.jpeg', '.png')
    _VIDEO_SUFFIXES = ('.mp4',)

    def __init__(
        self,
        model: str = 'gemini-2.0-flash',
        api_base: str = None,
        api_key: str = None
    ):
        """Build the RAG backend and the agent that uses it.

        Args:
            model: Model identifier passed to ``OpenAIServerModel`` as ``model_id``.
            api_base: Forwarded unchanged to ``OpenAIServerModel``.
            api_key: Forwarded unchanged to ``OpenAIServerModel``.
        """
        self.video_rag = VideoRAG(
            video_frame_rate=settings.VIDEO_EXTRACTION_FRAME_RATE,
            audio_segment_length=settings.AUDIO_SEGMENT_LENGTH,
        )
        self.agent = ToolCallingAgent(
            tools=[
                tools.download_video,
                *tools.create_video_rag_tools(self.video_rag),
            ],
            model=OpenAIServerModel(
                model_id=model,
                api_base=api_base,
                api_key=api_key,
            ),
            # Lets the chatbot turn image paths found in tool output into
            # actual image objects on the step (see ``_step_callback``).
            step_callbacks=[self._step_callback],
        )

    def chat(self, message: str, attachments: list[str] = None) -> Generator:
        """Chats with the bot, including handling attachments (images and videos).

        This is a generator: nothing is opened or sent until it is iterated.

        Args:
            message: The text message to send to the bot.
            attachments: A list of file paths for images or videos to include
                in the chat. Files are classified by extension, ignoring
                case; paths with any other extension are skipped.

        Returns:
            A generator yielding step objects representing the bot's responses
            and actions.
        """
        images = []
        for filepath in attachments or []:
            # Compare on a lowered copy so '.JPG' / '.MP4' are recognised,
            # while the original path is what gets opened and referenced.
            lowered = filepath.lower()
            if lowered.endswith(self._IMAGE_SUFFIXES):
                images.append(Image.open(filepath))
                message = image_to_text_prompt(filepath) + message
            elif lowered.endswith(self._VIDEO_SUFFIXES):
                message = video_to_text_prompt(filepath) + message

        yield from self.agent.run(
            message,
            stream=True,
            reset=False,
            images=images,
        )

    def clear(self):
        """Clears the chatbot message history and context."""
        self.agent.state.clear()
        self.agent.memory.reset()
        self.agent.monitor.reset()
        self.video_rag.clear()

    def _step_callback(self, step: ActionStep, agent: ToolCallingAgent):
        """Move image files referenced in a step's observations onto the step.

        Every newline-terminated line of ``step.observations`` that is exactly
        the path of an existing file is opened with PIL. On success the image
        is appended to ``step.observations_images`` and that line is removed
        from the observations text. All other lines, and any trailing text
        without a final newline, are left untouched.

        Args:
            step: The step whose observations are inspected and updated in place.
            agent: Unused; present because the callback is invoked with it.
        """
        if not step.observations:
            return

        # Guard against an unset image list so that loaded images are not
        # lost to an AttributeError.
        if step.observations_images is None:
            step.observations_images = []

        # Everything before the last '\n' is a complete line; the remainder
        # has no terminating newline and is never treated as a path.
        *lines, remainder = step.observations.split('\n')
        kept = []
        for line in lines:
            image = None
            # Only lines naming a real file are candidates; plain text lines
            # are skipped quietly instead of producing a load error each.
            if line and os.path.isfile(line):
                try:
                    image = Image.open(line)
                except Exception as e:
                    print(f'Error loading image {line}: {e}')
            if image is None:
                kept.append(f'{line}\n')
            else:
                step.observations_images.append(image)

        # Rebuild from whole lines so removing a path can never clip part of
        # a different line that happens to end with the same text.
        step.observations = ''.join(kept) + remainder