"""Fetch all questions from the API and download their associated files.

Questions are saved to "data/question_set/new_gaia_questions.json" and the
files attached to questions are downloaded into "data/downloaded_files".

The API base URL is read from the ``BASE_URL`` environment variable; a
``.env`` file is loaded first via python-dotenv.
"""

import json
import os

import requests
from dotenv import load_dotenv

load_dotenv()

BASE_URL = os.getenv("BASE_URL")

# Single source of truth for the output locations, so the directory that is
# created is always the directory that is written to.
DOWNLOAD_DIR = "data/downloaded_files"
QUESTIONS_PATH = "data/question_set/new_gaia_questions.json"

# Seconds to wait for the server before giving up; without a timeout,
# requests can block forever on an unresponsive server.
REQUEST_TIMEOUT = 30


def download_file(task_id, file_name):
    """Download the file attached to a task into ``DOWNLOAD_DIR``.

    Args:
        task_id: Identifier used to build the ``/files/{task_id}`` endpoint.
        file_name: Name under which the file is stored locally. Only its
            final path component is used.

    Returns:
        True if the server answered with status 200 and the file was written,
        False if the server answered with any other status code.

    Raises:
        requests.RequestException: On connection problems or timeout.
        OSError: If the file cannot be written.
    """
    file_endpoint = f"{BASE_URL}/files/{task_id}"
    file_response = requests.get(file_endpoint, timeout=REQUEST_TIMEOUT)

    if file_response.status_code != 200:
        print(f"Failed to download file for task_id {task_id}. Status code: {file_response.status_code}")
        return False

    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    # The file name comes from the remote API: keep only the base name so a
    # value such as "../../x" cannot escape the download directory.
    file_path = os.path.join(DOWNLOAD_DIR, os.path.basename(file_name))
    with open(file_path, "wb") as f:
        f.write(file_response.content)
    print(f"Downloaded file for task_id {task_id} to {file_path}")
    return True


def get_all_questions():
    """Fetch every question from the API and download any attached files.

    A file download is attempted for each question that has a "task_id" key
    and a non-empty "file_name" value.

    Returns:
        The decoded JSON body of the ``/questions`` endpoint (presumably a
        list of question dicts; confirm against the API).

    Raises:
        Exception: If the ``/questions`` request does not return status 200.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(f"{BASE_URL}/questions", timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        raise Exception(f"API request failed with status code {response.status_code}")

    questions = response.json()
    downloaded_file_counter = 0
    for question in questions:
        if "task_id" in question and "file_name" in question and question["file_name"]:
            if download_file(question["task_id"], question["file_name"]):
                downloaded_file_counter += 1
    print(f"Total downloaded files: {downloaded_file_counter}")
    return questions


questions = get_all_questions()
print(f"Total questions retrieved: {len(questions)}")

# Make sure the target directory exists; open() does not create it.
os.makedirs(os.path.dirname(QUESTIONS_PATH), exist_ok=True)
with open(QUESTIONS_PATH, "w", encoding="utf-8") as file:
    json.dump(questions, file, indent=4)
print("Questions successfully saved to new_gaia_questions.json")