Kai Jennissen committed
Commit 8102d4b · unverified · 1 Parent(s): 1c0a810

added tools

Files changed (5):
  1. agent.py +120 -20
  2. app.py +20 -5
  3. requirements.in +3 -0
  4. requirements.txt +6 -0
  5. tools.py +672 -0
agent.py CHANGED
@@ -3,10 +3,21 @@ from smolagents import (
     CodeAgent,
     DuckDuckGoSearchTool,
     VisitWebpageTool,
-    InferenceClientModel,
+    # InferenceClientModel,
+    OpenAIServerModel,
+    WikipediaSearchTool,
 )
 from dotenv import load_dotenv
 from tracing import setup_tracing
+from tools import (
+    read_image,
+    transcribe_audio,
+    run_video,
+    read_code,
+    fetch_task_files,
+)
+
+# from tools import go_back, close_popups, search_item_ctrl_f, save_screenshot
 
 load_dotenv()
 
@@ -22,6 +33,59 @@ If you are asked for a string, don't use articles, neither abbreviations (e.g. f
 If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
 Your answer should only start with "FINAL ANSWER: ", then follows with the answer. """
 
+helium_instructions = """
+You can use helium to access websites. Don't bother about the helium driver, it's already managed.
+We've already run "from helium import *".
+Then you can go to pages!
+Code:
+```py
+go_to('github.com/trending')
+```<end_code>
+
+You can directly click clickable elements by inputting the text that appears on them.
+Code:
+```py
+click("Top products")
+```<end_code>
+
+If it's a link:
+Code:
+```py
+click(Link("Top products"))
+```<end_code>
+
+If you try to interact with an element and it's not found, you'll get a LookupError.
+In general, stop your action after each button click to see what happens on your screenshot.
+Never try to log in to a page.
+
+To scroll up or down, use scroll_down or scroll_up with the number of pixels to scroll as an argument.
+Code:
+```py
+scroll_down(num_pixels=1200)  # This will scroll one viewport down
+```<end_code>
+
+When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
+Just use your built-in tool `close_popups` to close them:
+Code:
+```py
+close_popups()
+```<end_code>
+
+You can use .exists() to check for the existence of an element. For example:
+Code:
+```py
+if Text('Accept cookies?').exists():
+    click('I accept')
+```<end_code>
+"""
+
+add_sys_prompt = """\n\nIf a file_url is available or a url is given in the question statement, then request and use the content to answer the question. \
+If a code file, such as a .py file, is given, do not attempt to execute it but rather open it as a text file and analyze the content. \
+When a tabular file, such as csv, tsv, or xlsx, is given, read it using pandas.
+
+Make sure you provide the answer in accordance with the instructions provided in the question. Do not return the result of a tool as a final_answer.
+Do not add any additional information, explanation, unnecessary words or symbols. The answer is likely as simple as one word."""
+
 
 def initialize_tracing(enabled=True, provider="langfuse"):
     """
@@ -45,39 +109,75 @@ def get_agent():
 
     # SmolagentsInstrumentor will automatically trace agent operations
 
-    llm_qwen = InferenceClientModel(
-        model_id="Qwen/Qwen2.5-Coder-32B-Instruct", provider="together"
-    )
-    llm_deepseek = InferenceClientModel(
-        "deepseek-ai/DeepSeek-R1",
-        provider="together",
-        max_tokens=8096,
-        # "Qwen/Qwen3-235B-A22B-FP8",
-        # provider="together",
-        # max_tokens=8096,
-    )
+    # llm_qwen = InferenceClientModel(
+    #     model_id="Qwen/Qwen2.5-Coder-32B-Instruct", provider="together"
+    # )
+    # llm_deepseek = InferenceClientModel(
+    #     "deepseek-ai/DeepSeek-R1",
+    #     provider="together",
+    #     max_tokens=8096,
+    #     # "Qwen/Qwen3-235B-A22B-FP8",
+    #     # provider="together",
+    #     # max_tokens=8096,
+    # )
 
     # Create web agent
     web_agent = ToolCallingAgent(
-        tools=[DuckDuckGoSearchTool(), VisitWebpageTool()],
-        model=llm_qwen,
+        tools=[
+            DuckDuckGoSearchTool(),
+            VisitWebpageTool(),
+            WikipediaSearchTool(),
+        ],
+        model=OpenAIServerModel(model_id="gpt-4.1", temperature=0.1),
         max_steps=3,
         name="Web_Agent",
         description="A web agent that can search the web and visit webpages.",
        verbosity_level=1,
     )
+    mm_agent = CodeAgent(
+        tools=[
+            read_image,
+            transcribe_audio,
+            read_code,
+            run_video,
+        ],
+        model=OpenAIServerModel(model_id="gpt-4.1", temperature=0.1),
+        max_steps=3,
+        name="Multimedia_Agent",
+        description="An agent that can answer questions about all types of images, videos and speech. Needs to be provided with a valid url or an image.",
+        verbosity_level=1,
+    )
 
+    # Initialize the model
+    # vlm = InferenceClientModel(model_id="Qwen/Qwen2.5-Vision-32B", provider="together")
+
+    # # Create the agent
+    # vision_agent = CodeAgent(
+    #     tools=[go_back, close_popups, search_item_ctrl_f],
+    #     model=vlm,
+    #     additional_authorized_imports=["helium", "selenium"],
+    #     step_callbacks=[save_screenshot],
+    #     max_steps=10,
+    #     planning_interval=10,
+    #     verbosity_level=1,
+    #     name="Vision_Agent",
+    #     description="A vision agent that can interact with webpages and take screenshots.",
+    # )
+    # vision_agent.prompt_templates["system_prompt"] += helium_instructions
+
+    # Import helium for the agent
     # Create manager agent
     manager_agent = CodeAgent(
-        tools=[],
-        managed_agents=[web_agent],
-        model=llm_deepseek,
+        tools=[fetch_task_files],
+        managed_agents=[web_agent, mm_agent],
+        model=OpenAIServerModel(model_id="gpt-4.1", temperature=0.1),
         max_steps=5,
         planning_interval=10,
         additional_authorized_imports=["pandas", "numpy"],
         verbosity_level=1,
-        description=MANAGER_PROMPT,
     )
+
+    manager_agent.prompt_templates["system_prompt"] += add_sys_prompt
     return manager_agent
 
 
@@ -88,11 +188,11 @@ if __name__ == "__main__":
 
     # Get agent with tracing already configured
     agent = get_agent()
-
+    agent.visualize()
     # Run agent - SmolagentsInstrumentor will automatically trace the execution
     print("Running agent with tracing enabled...")
     result = agent.run(
-        "What is the latest news about AI? Please search the web and summarize the results."
+        "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia."
    )
     print(f"Result: {result}")
     print(
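For orientation, here is a minimal smoke-test sketch (not part of the commit) of how the rewired hierarchy is driven. It assumes OPENAI_API_KEY is available in the environment, since all three agents are now backed by OpenAIServerModel; the question is illustrative:

```py
# Hypothetical smoke test for the new agent hierarchy (names from agent.py).
# Assumes OPENAI_API_KEY is exported; load_dotenv() in agent.py can supply it.
from agent import get_agent

manager = get_agent()
# The manager delegates web lookups to Web_Agent and image/audio/video
# questions to Multimedia_Agent; fetch_task_files is its only direct tool.
print(manager.run("What is the capital of France? Answer with one word."))
```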
app.py CHANGED
@@ -24,10 +24,24 @@ class BasicAgent:
         self.agent = get_agent()
         print("BasicAgent initialized.")
 
-    def __call__(self, question: str) -> str:
+    def __call__(self, question: str, task_id: str = None) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
-        answer = self.agent.run(question)
-        print(f"Agent returning fixed answer: {answer}")
+
+        # If task_id is provided, we'll include context about possible files
+        if task_id:
+            # Add context about files to the question
+            context = f"""Task ID: {task_id}
+
+If you need files for this task, you can use the fetch_task_files tool with the task_id.
+Example: fetch_task_files(task_id="{task_id}")
+
+Question: {question}"""
+
+            answer = self.agent.run(context)
+        else:
+            answer = self.agent.run(question)
+
+        print(f"Agent returning answer: {answer}")
         return answer
 
 
@@ -93,14 +107,15 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     results_log = []
     answers_payload = []
     print(f"Running agent on {len(questions_data)} questions...")
-    for item in questions_data[:1]:
+    for item in questions_data[3:4]:
         task_id = item.get("task_id")
         question_text = item.get("question")
         if not task_id or question_text is None:
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
-            submitted_answer = agent(question_text)
+            # Pass both question text and task_id to the agent
+            submitted_answer = agent(question_text, task_id)
             answers_payload.append(
                 {"task_id": task_id, "submitted_answer": submitted_answer}
             )
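A hedged sketch of the updated call path, reusing one of the sample task IDs exercised in tools.py's test block:

```py
# Illustrative only: BasicAgent now threads the task_id through to the agent,
# which can call fetch_task_files(task_id=...) to pull any attachment itself.
from app import BasicAgent

agent = BasicAgent()
answer = agent(
    "What is shown in the attached file?",
    task_id="cca530fc-4052-43b2-b130-b30968d8aa44",  # sample ID from tools.py
)
print(answer)
```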
requirements.in CHANGED
@@ -1,5 +1,8 @@
+av
 duckduckgo_search>=7.0.0,<8.0.0
 gradio[oauth]
+pytube
 requests
 smolagents[gradio,litellm,openai,telemetry,toolkit,torch,transformers,vision]
 wikipedia-api
+yt-dlp
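The three new pins back the media tooling introduced in tools.py below. A quick, illustrative check that they resolve after installation (version attributes as exposed by each package):

```py
# Illustrative sanity check for the new media dependencies:
# av decodes video/audio in tools.py; pytube and yt-dlp fetch YouTube streams.
import av
import pytube
import yt_dlp

print(av.__version__, pytube.__version__, yt_dlp.version.__version__)
```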
requirements.txt CHANGED
@@ -47,6 +47,8 @@ authlib==1.5.2
     # via
     #   arize-phoenix
     #   gradio
+av==14.3.0
+    # via -r requirements.in
 beautifulsoup4==4.13.4
     # via markdownify
 cachetools==5.5.2
@@ -353,6 +355,8 @@ python-multipart==0.0.20
     # via
     #   arize-phoenix
     #   gradio
+pytube==15.0.0
+    # via -r requirements.in
 pytz==2025.2
     # via pandas
 pyyaml==6.0.2
@@ -526,5 +530,7 @@ wsproto==1.2.0
     # via trio-websocket
 yarl==1.20.0
     # via aiohttp
+yt-dlp==2025.4.30
+    # via -r requirements.in
 zipp==3.21.0
     # via importlib-metadata
tools.py ADDED
@@ -0,0 +1,672 @@
+import requests
+import io
+import base64
+import openai
+from openai import OpenAI
+from smolagents import tool
+import os
+import pandas as pd
+import functools
+from typing import List, Optional, Dict, Any
+import sys
+
+import av
+from yt_dlp import YoutubeDL
+
+from PIL import Image
+import wikipediaapi
+import tempfile
+
+model_id = "gpt-4.1"
+
+
+@tool
+def read_image(query: str, img_url: str) -> str:
+    """
+    Use a visual question answering (VQA) model to generate a response to a query based on an image.
+
+    Args:
+        query (str): A natural language question about the image.
+        img_url (str): The URL of the image to analyze.
+
+    Returns:
+        str: A response generated by the VQA model based on the provided image and question.
+    """
+    client = OpenAI()
+    response = client.responses.create(
+        model=model_id,
+        input=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "input_text", "text": query},
+                    {
+                        "type": "input_image",
+                        "image_url": img_url,
+                    },
+                ],
+            }
+        ],
+    )
+    return response.output_text
+
+
+@tool
+def read_code(file_url: str) -> str:
+    """
+    Read the contents of a code file, such as a .py file, instead of executing it. Use this tool to analyze a code snippet.
+
+    Args:
+        file_url (str): The URL of the code file to retrieve.
+
+    Returns:
+        str: The content of the file as a string.
+    """
+    response = requests.get(file_url)
+    response.raise_for_status()
+    return response.text
+
+
+@tool
+def transcribe_audio(file_url: str, file_name: str) -> str:
+    """
+    Download and transcribe an audio file using a transcription model.
+
+    Args:
+        file_url (str): Direct URL to the audio file (e.g., .mp3, .wav).
+        file_name (str): Filename including extension, used to determine format.
+
+    Returns:
+        str: The transcribed text from the audio file.
+    """
+    # Download audio content
+    response = requests.get(file_url)
+    response.raise_for_status()
+
+    # Extract extension (fallback to mp3 if missing)
+    extension = file_name.split(".")[-1].lower() or "mp3"
+
+    # Wrap bytes in a file-like object with a valid name
+    audio_file = io.BytesIO(response.content)
+    audio_file.name = f"audio.{extension}"
+
+    # Create OpenAI client and transcribe
+    client = OpenAI()
+    transcription = client.audio.transcriptions.create(
+        model="gpt-4o-transcribe", file=audio_file
+    )
+
+    return transcription.text
+
+
+### Set of functions for YouTube video processing
+def _pytube_buffer(url: str) -> Optional[io.BytesIO]:
+    try:
+        from pytube import YouTube
+
+        yt = YouTube(url)
+        stream = (
+            yt.streams.filter(progressive=True, file_extension="mp4")
+            .order_by("resolution")
+            .desc()
+            .first()
+        )
+        if stream is None:  # no progressive stream
+            raise RuntimeError("No MP4 with audio found")
+        buf = io.BytesIO()
+        stream.stream_to_buffer(buf)  # PyTube's built-in helper
+        buf.seek(0)
+        return buf
+    except Exception as e:
+        print(f"[youtube_to_buffer] PyTube failed → {e}", file=sys.stderr)
+        return None  # trigger fallback
+
+
+def _ytdlp_buffer(url: str) -> io.BytesIO:
+    """
+    Return a BytesIO containing some MP4 video stream for `url`.
+    Works whether YouTube serves a progressive file or separate A/V.
+    """
+    ydl_opts = {
+        "quiet": True,
+        "skip_download": True,
+        "format": "bestvideo[ext=mp4]/best[ext=mp4]/best",
+    }
+    with YoutubeDL(ydl_opts) as ydl:
+        info = ydl.extract_info(url, download=False)
+        if "entries" in info:  # playlists
+            info = info["entries"][0]
+
+    if "url" in info:
+        video_urls = [info["url"]]
+
+    elif "requested_formats" in info:
+        video_urls = [
+            fmt["url"]
+            for fmt in info["requested_formats"]
+            if fmt.get("vcodec") != "none"  # keep only video
+        ]
+        if not video_urls:
+            raise RuntimeError("yt-dlp returned audio-only formats")
+
+    else:
+        raise RuntimeError("yt-dlp could not extract a stream URL")
+
+    buf = io.BytesIO()
+    for direct_url in video_urls:
+        with requests.get(direct_url, stream=True) as r:
+            r.raise_for_status()
+            for chunk in r.iter_content(chunk_size=1 << 16):
+                buf.write(chunk)
+
+    buf.seek(0)
+    return buf
+
+
+@functools.lru_cache(maxsize=8)  # tiny cache so repeat calls are fast
+def youtube_to_buffer(url: str) -> io.BytesIO:
+    """
+    Return a BytesIO containing a single progressive MP4
+    (H.264 + AAC) – the safest thing PyAV can open everywhere.
+    """
+    ydl_opts = {
+        "quiet": True,
+        "skip_download": True,
+        # progressive (has both audio+video) • mp4 • h264
+        "format": (
+            "best[ext=mp4][vcodec^=avc1][acodec!=none]"
+            "/best[ext=mp4][acodec!=none]"  # fallback: any prog-MP4
+        ),
+    }
+
+    with YoutubeDL(ydl_opts) as ydl:
+        info = ydl.extract_info(url, download=False)
+        if "entries" in info:  # playlists → first entry
+            info = info["entries"][0]
+
+    direct_url = info.get("url")
+    if not direct_url:
+        raise RuntimeError("yt-dlp could not find a progressive MP4 track")
+
+    # Stream it straight into RAM
+    buf = io.BytesIO()
+    with requests.get(direct_url, stream=True) as r:
+        r.raise_for_status()
+        for chunk in r.iter_content(chunk_size=1 << 17):  # 128 kB
+            buf.write(chunk)
+
+    buf.seek(0)
+    return buf
+
+
+def sample_frames(video_bytes: io.BytesIO, n_frames: int = 6) -> List[Image.Image]:
+    """Decode `n_frames` uniformly spaced RGB frames as PIL images."""
+    container = av.open(video_bytes, metadata_errors="ignore")
+    video = container.streams.video[0]
+    total = video.frames or 0
+
+    # If PyAV couldn't count frames, fall back to a fixed step (every 30th frame)
+    step = max(1, total // n_frames) if total else 30
+
+    frames: list[Image.Image] = []
+    for i, frame in enumerate(container.decode(video=0)):
+        if i % step == 0:
+            frames.append(frame.to_image())
+        if len(frames) >= n_frames:
+            break
+    container.close()
+    return frames
+
+
+def pil_to_data_url(img: Image.Image, quality: int = 80) -> str:
+    buf = io.BytesIO()
+    img.save(buf, format="JPEG", quality=quality, optimize=True)
+    b64 = base64.b64encode(buf.getvalue()).decode()
+    return f"data:image/jpeg;base64,{b64}"
+
+
+def save_audio_stream_to_temp_wav_file(video_bytes: io.BytesIO) -> Optional[str]:
+    """
+    Extract the audio stream from video_bytes, save it as a temporary WAV file,
+    and return the path to the file.
+    Returns None if no audio stream is found or an error occurs.
+    """
+    try:
+        video_bytes.seek(0)  # Ensure buffer is at the beginning
+        input_container = av.open(video_bytes, metadata_errors="ignore")
+
+        if not input_container.streams.audio:
+            print("No audio streams found in the video.", file=sys.stderr)
+            return None
+        input_audio_stream = input_container.streams.audio[0]
+
+        # Create a temporary file with .wav suffix.
+        # delete=False because we need to pass the path to another process (Whisper)
+        # and we will manually delete it later.
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+            temp_audio_file_path = tmp_file.name
+
+        output_container = av.open(temp_audio_file_path, mode="w", format="wav")
+
+        # For WAV, a common codec is pcm_s16le (16-bit signed PCM).
+        # Use the input stream's sample rate.
+        # Determine channel layout (e.g., 'stereo', 'mono')
+        channel_layout = "stereo"  # Default
+        if (
+            hasattr(input_audio_stream.codec_context, "layout")
+            and input_audio_stream.codec_context.layout
+        ):
+            channel_layout = input_audio_stream.codec_context.layout.name
+        elif (
+            hasattr(input_audio_stream.codec_context, "channels")
+            and input_audio_stream.codec_context.channels == 1
+        ):
+            channel_layout = "mono"
+
+        output_audio_stream = output_container.add_stream(
+            "pcm_s16le",
+            rate=input_audio_stream.codec_context.sample_rate,
+            layout=channel_layout,
+        )
+
+        for frame in input_container.decode(input_audio_stream):
+            # PyAV decodes audio into AudioFrame objects.
+            # These frames need to be encoded by the output stream's codec.
+            for packet in output_audio_stream.encode(frame):
+                output_container.mux(packet)
+
+        # Flush any remaining frames from the encoder
+        for packet in output_audio_stream.encode():
+            output_container.mux(packet)
+
+        output_container.close()
+        input_container.close()
+        return temp_audio_file_path
+
+    except Exception as e:
+        print(f"Error extracting audio to temp WAV file: {e}", file=sys.stderr)
+        # Clean up if temp file path was assigned and file exists
+        if "temp_audio_file_path" in locals() and os.path.exists(temp_audio_file_path):
+            os.remove(temp_audio_file_path)
+        return None
+
+
+@tool
+def run_video(query: str, url: str) -> str:
+    """
+    Get a YouTube video from url and return an answer to a natural-language query using the video.
+
+    Args:
+        query (str): A natural-language question whose answer is expected to be found in the audio or visual content of the video.
+        url (str): Fully qualified URL of the YouTube video to analyze.
+
+    Returns:
+        str: A response generated by the VQA model based on the provided video and question.
+    """
+    n_frames = 4
+    buff = youtube_to_buffer(url)
+    if buff is None:
+        return "Error: Could not download or buffer the video."
+
+    # 1. Sample visual frames
+    frames = sample_frames(buff, n_frames=n_frames)
+    buff.seek(0)  # Reset buffer pointer for audio extraction
+
+    # 2. Extract and transcribe audio
+    transcript = "[Audio could not be processed]"
+    audio_file_path = None
+    try:
+        audio_file_path = save_audio_stream_to_temp_wav_file(buff)
+        if audio_file_path:
+            with open(audio_file_path, "rb") as audio_data:
+                # Make sure you have the OpenAI client initialized, e.g., client = openai.OpenAI()
+                transcription_response = openai.audio.transcriptions.create(
+                    model="gpt-4o-transcribe", file=audio_data
+                )
+                transcript = transcription_response.text
+        else:
+            transcript = "[No audio stream found or error during extraction]"
+            print(
+                "No audio file path returned, skipping transcription.", file=sys.stderr
+            )
+    except Exception as e:
+        print(f"Error during audio transcription: {e}", file=sys.stderr)
+        transcript = f"[Error during audio transcription: {e}]"
+    finally:
+        if audio_file_path and os.path.exists(audio_file_path):
+            os.remove(audio_file_path)  # Clean up the temporary audio file
+
+    # 3. Prepare content for the AI model (text query, transcript, and images)
+    prompt_text = f"Original Query: {query}\n\nVideo Transcript:\n{transcript}\n\nKey Visual Frames (analyze these along with the transcript to answer the query):"
+
+    content = [{"type": "text", "text": prompt_text}]
+
+    for img in frames:
+        content.append(
+            {
+                "type": "image_url",
+                "image_url": {"url": pil_to_data_url(img)},
+            }
+        )
+
+    # 4. Send to AI model
+    try:
+        resp = openai.chat.completions.create(
+            model=model_id,
+            messages=[{"role": "user", "content": content}],
+            temperature=0.1,
+        )
+        result = resp.choices[0].message.content.strip()
+    except Exception as e:
+        print(f"Error calling OpenAI API: {e}", file=sys.stderr)
+        result = f"[Error processing with AI model: {e}]"
+
+    return result
+
+
+## Read video only, ignore audio
+# @tool
+# def run_video(query: str, url: str) -> str:
+#     """
+#     Get a YouTube video from url and return an answer to a natural-language query using the video.
+
+#     Args:
+#         query (str): A natural-language question whose answer is expected to be found in the visual content of the video.
+#         url (str): Fully qualified URL of the YouTube video to analyze.
+
+#     Returns:
+#         str: A response generated by the VQA model based on the provided video and question.
+#     """
+#     buff = youtube_to_buffer(url)
+#     n_frames = 8
+#     frames = sample_frames(buff, n_frames=n_frames)
+
+#     content = [{"type": "text", "text": query}] + [
+#         {
+#             "type": "image_url",
+#             "image_url": {"url": pil_to_data_url(img)},
+#         }
+#         for img in frames
+#     ]
+
+#     resp = openai.chat.completions.create(
+#         model="gpt-4.1-mini",
+#         messages=[{"role": "user", "content": content}],
+#         temperature=0.1,
+#     )
+#     return resp.choices[0].message.content.strip()
+
+
+# Helper functions for processing different file types
+def process_image(response, filename, content_type):
+    """Process image files - convert to base64 data URL for vision models"""
+    img_data = base64.b64encode(response.content).decode("utf-8")
+    data_url = f"data:{content_type};base64,{img_data}"
+
+    return {
+        "file_type": "image",
+        "filename": filename,
+        "content_type": content_type,
+        "data_url": data_url,
+    }
+
+
+def process_audio(response, filename, content_type):
+    """Process audio files - either return data URL or save to temp file for processing"""
+    audio_data = base64.b64encode(response.content).decode("utf-8")
+    data_url = f"data:{content_type};base64,{audio_data}"
+
+    # For compatibility with audio processing tools, save to temp file
+    audio_file = io.BytesIO(response.content)
+    extension = os.path.splitext(filename)[1].lower() or ".mp3"
+    audio_file.name = f"audio{extension}"  # Some libraries need filename
+
+    return {
+        "file_type": "audio",
+        "filename": filename,
+        "content_type": content_type,
+        "data_url": data_url,
+        "audio_buffer": audio_file,  # Include buffer for processing
+    }
+
+
+def process_video(response, filename, content_type):
+    """Process video files - save to buffer and extract frames"""
+    video_buffer = io.BytesIO(response.content)
+
+    # Option to extract frames - similar to what run_video does
+    try:
+        frames = sample_frames(video_buffer, n_frames=4)  # Reuse existing function
+        frame_urls = [pil_to_data_url(img) for img in frames]
+        frame_extraction_success = True
+    except Exception:
+        frame_urls = []
+        frame_extraction_success = False
+
+    return {
+        "file_type": "video",
+        "filename": filename,
+        "content_type": content_type,
+        "video_buffer": video_buffer,
+        "frame_urls": frame_urls,
+        "frames_extracted": frame_extraction_success,
+    }
+
+
+def process_tabular(response, filename, content_type):
+    """Process spreadsheet files using pandas"""
+    excel_buffer = io.BytesIO(response.content)
+
+    try:
+        # Determine format based on extension
+        if filename.lower().endswith(".csv"):
+            df = pd.read_csv(excel_buffer)
+        else:  # Excel formats
+            df = pd.read_excel(excel_buffer)
+
+        return {
+            "file_type": "tabular",
+            "filename": filename,
+            "content_type": content_type,
+            "data": df.to_dict(orient="records"),
+            "columns": df.columns.tolist(),
+            "shape": df.shape,
+        }
+    except Exception as e:
+        # Fallback if parsing fails
+        return {
+            "file_type": "tabular",
+            "filename": filename,
+            "content_type": content_type,
+            "error": f"Failed to parse tabular data: {e}",
+            "raw_data": base64.b64encode(response.content).decode("utf-8"),
+        }
+
+
+def process_text(response, filename, content_type):
+    """Process text files (code, plain text, etc.)"""
+    try:
+        text_content = response.text
+        return {
+            "file_type": "text",
+            "filename": filename,
+            "content_type": content_type,
+            "content": text_content,
+            "extension": os.path.splitext(filename)[1],  # Useful for syntax highlighting
+        }
+    except Exception as e:
+        return {
+            "file_type": "text",
+            "filename": filename,
+            "content_type": content_type,
+            "error": f"Failed to decode text: {e}",
+            "raw_data": base64.b64encode(response.content).decode("utf-8"),
+        }
+
+
+def process_json(response, filename, content_type):
+    """Process JSON data"""
+    try:
+        json_data = response.json()
+        return {
+            "file_type": "json",
+            "filename": filename,
+            "content_type": content_type,
+            "data": json_data,
+        }
+    except Exception:
+        # Try as text if JSON parsing fails
+        return process_text(response, filename, content_type)
+
+
+def process_pdf(response, filename, content_type):
+    """Process PDF files - return as binary with metadata"""
+    # Simple version - just return binary for now.
+    # Could be enhanced with PDF text extraction libraries.
+    pdf_data = base64.b64encode(response.content).decode("utf-8")
+
+    return {
+        "file_type": "pdf",
+        "filename": filename,
+        "content_type": content_type,
+        "data": pdf_data,
+    }
+
+
+def process_binary(response, filename, content_type):
+    """Process other binary files (fallback handler)"""
+    binary_data = base64.b64encode(response.content).decode("utf-8")
+
+    return {
+        "file_type": "binary",
+        "filename": filename,
+        "content_type": content_type,
+        "data": binary_data,
+    }
+
+
+@tool
+def fetch_task_files(task_id: str) -> Dict[str, Any]:
+    """
+    Download files associated with a specific task from the API.
+
+    Args:
+        task_id (str): The Task-ID of the task to download files for.
+
+    Returns:
+        dict: A dictionary containing file information and data in the appropriate format for the file type.
+    """
+    api_base_url: str = "https://agents-course-unit4-scoring.hf.space"
+    files_url = f"{api_base_url}/files/{task_id}"
+
+    try:
+        response = requests.get(files_url, timeout=15)
+        response.raise_for_status()
+
+        # Extract metadata
+        content_type = response.headers.get("Content-Type", "").lower()
+        filename = response.headers.get("content-disposition", "")
+        if "filename=" in filename:
+            filename = filename.split("filename=")[-1].strip('"')
+        else:
+            filename = f"{task_id}.bin"  # Default filename
+
+        print(f"Received file: {filename}, type: {content_type}")
+
+        # Route to appropriate helper based on content type or file extension
+        if "image/" in content_type or any(
+            filename.lower().endswith(ext) for ext in [".png", ".jpg", ".jpeg", ".gif"]
+        ):
+            return process_image(response, filename, content_type)
+
+        elif "audio/" in content_type or any(
+            filename.lower().endswith(ext) for ext in [".mp3", ".wav", ".ogg"]
+        ):
+            return process_audio(response, filename, content_type)
+
+        elif "video/" in content_type or any(
+            filename.lower().endswith(ext) for ext in [".mp4", ".avi", ".mov"]
+        ):
+            return process_video(response, filename, content_type)
+
+        elif (
+            "spreadsheet" in content_type
+            or "excel" in content_type
+            or any(filename.lower().endswith(ext) for ext in [".xlsx", ".xls", ".csv"])
+        ):
+            return process_tabular(response, filename, content_type)
+
+        elif (
+            "text/" in content_type
+            or "code" in content_type
+            or any(
+                filename.lower().endswith(ext)
+                for ext in [".txt", ".py", ".js", ".html", ".md"]
+            )
+        ):
+            return process_text(response, filename, content_type)
+
+        elif "application/json" in content_type or filename.lower().endswith(".json"):
+            return process_json(response, filename, content_type)
+
+        elif "application/pdf" in content_type or filename.lower().endswith(".pdf"):
+            return process_pdf(response, filename, content_type)
+
+        else:
+            # Default fallback for binary files
+            return process_binary(response, filename, content_type)
+
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching files for task {task_id}: {e}")
+        return {"error": f"Error fetching files: {e}"}
+    except Exception as e:
+        print(f"An unexpected error occurred fetching files for task {task_id}: {e}")
+        return {"error": f"An unexpected error occurred: {e}"}
+
+
+@tool
+def search_wikipedia(query: str) -> str:
+    """
+    Get the contents of the Wikipedia page retrieved by a search query.
+
+    Args:
+        query (str): A search term to search within Wikipedia. Ideally it should be one word or a group of a few words.
+
+    Returns:
+        str: The text content of the Wikipedia page.
+    """
+    get_wiki = wikipediaapi.Wikipedia(
+        language="en",
+        user_agent="test_tokki",
+        extract_format=wikipediaapi.ExtractFormat.WIKI,
+    )
+    page_content = get_wiki.page(query)
+    text_content = page_content.text
+
+    # Truncate to roughly the first 25,000 whitespace-separated tokens
+    cutoff = 25000
+    text_content = " ".join(text_content.split(" ")[:cutoff])
+    return text_content
+
+
+if __name__ == "__main__":
+    # Simple test for fetch_task_files
+    task_ids = [
+        "cca530fc-4052-43b2-b130-b30968d8aa44",
+        "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
+        "7bd855d8-463d-4ed5-93ca-5fe35145f733",
+    ]
+    for task_id in task_ids:
+        print(
+            "=" * 20
+            + " "
+            + f"Testing fetch_task_files with task_id: {task_id}"
+            + " "
+            + "=" * 20
+        )
+
+        result = fetch_task_files(task_id)
+        print(f"File type: {result.get('file_type')}")
+        print(f"Filename: {result.get('filename')}")