import json import os import random from datasets import Dataset PRETRAIN_PHRASES = [ "What is happening in the given music ?", "Describe the sound. ", "Describe the music. ", " Provide a description of the music.", " Provide a description of the sound.", "Can you interpret ?", "Please explain what's happening in ", "What does represent?", "Could you describe for me?", "What's the content of ?", "Can you depict ?", "What is ?", "In the music clip, , what is happening?", "Provide a description of the music. ", "Provide a description of the sound. ", "Provide a caption for the sound. ", "Provide a caption for the music. ", ] def convert_json_to_dataset(input_file, output_folder, train_ratio=0.8): with open(input_file, 'r') as f: data = [json.loads(line.strip()) for line in f] os.makedirs(output_folder, exist_ok=True) cache_path = os.path.join(output_folder, "gpt-cache.jsonl") cache = open(cache_path, "a") # Shuffle the data random.shuffle(data) train_size = int(len(data) * train_ratio) train_data = data[:train_size] val_data = data[train_size:] def gen(entries): for idx, entry in enumerate(entries): sound_location = entry["location"] main_caption = entry["main_caption"] alt_caption = entry["alt_caption"] # Randomly select a pretrain phrase for user content user_content = random.choice(PRETRAIN_PHRASES) # Construct the main example example_1 = { "id": f"{2*idx + 1:07}", "sounds": [sound_location], "messages": [ {"role": "user", "content": user_content}, {"role": "assistant", "content": main_caption} ] } cache.write(json.dumps(example_1) + "\n") yield example_1 # Construct the alt example example_2 = { "id": f"{2*idx+2:07}", "sounds": [sound_location], "messages": [ {"role": "user", "content": user_content}, {"role": "assistant", "content": alt_caption} ] } cache.write(json.dumps(example_2) + "\n") yield example_2 train_ds = Dataset.from_generator( gen, num_proc=1, # Set num_proc to adjust parallel processing gen_kwargs={"entries": train_data}, ) train_ds.save_to_disk(os.path.join(output_folder, "train")) val_ds = Dataset.from_generator( gen, num_proc=1, # Set num_proc to adjust parallel processing gen_kwargs={"entries": val_data}, ) val_ds.save_to_disk(os.path.join(output_folder, "val")) cache.close() if __name__ == "__main__": input_file = "/Users/anuradhachopra/Downloads/MusicBench_train.json" # Change this to your input JSON file path output_folder = "/Users/anuradhachopra/data/musicbench_multitoken" # Change this to your desired output folder path convert_json_to_dataset(input_file, output_folder, train_ratio=0.8)