"""Split a JSONL dataset into train/valid/test files, stratified by "target".

Each input line is a JSON object with a "target" field.  Entries whose target
equals 0 and entries whose target equals 1 are shuffled and split separately
with the same ratios, so every output file keeps roughly the input class
balance.  Entries with any other target value are dropped.

No random seed is set, so the assignment of entries to splits differs from
run to run.
"""
import json
import random

# Input and output file paths
input_filepath = 'commit_data_hpc_modified.jsonl'
train_filepath = 'train.jsonl'
valid_filepath = 'valid.jsonl'
test_filepath = 'test.jsonl'

# Split ratios; whatever is left after train + valid (0.1) goes to test.
train_ratio, valid_ratio = 0.8, 0.1


def group_by_target(lines):
    """Parse JSONL lines and bucket the entries by their "target" value.

    Args:
        lines: Iterable of strings, one JSON object per line (for example an
            open text file).

    Returns:
        A ``(data_0, data_1)`` tuple of lists holding the entries whose
        "target" equals 0 and 1 respectively, in input order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry has no "target" key.
    """
    data_0 = []
    data_1 = []
    for line in lines:
        text = line.strip()
        if not text:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # records; json.loads('') would raise on them.
            continue
        entry = json.loads(text)
        if entry['target'] == 0:
            data_0.append(entry)
        elif entry['target'] == 1:
            data_1.append(entry)
    return data_0, data_1


def split_by_ratio(items, train_ratio, valid_ratio):
    """Cut *items* into consecutive train/valid/test slices.

    Boundaries are truncated towards zero, so rounding leftovers end up in
    the test slice.

    Args:
        items: Sequence to split; it is sliced, not modified.
        train_ratio: Fraction of items for the train slice.
        valid_ratio: Fraction of items for the valid slice.

    Returns:
        A ``(train, valid, test)`` tuple of slices that together contain
        every item exactly once, in the original order.
    """
    count = len(items)
    train_end = int(count * train_ratio)
    valid_end = int(count * (train_ratio + valid_ratio))
    return items[:train_end], items[train_end:valid_end], items[valid_end:]


def write_jsonl(path, entries):
    """Write *entries* to *path*, one JSON document per line."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(entry) + '\n' for entry in entries)


def main():
    """Read the input file, split it per class and write the three outputs."""
    # Read and collect entries based on "target" value
    with open(input_filepath, 'r', encoding='utf-8') as infile:
        data_0, data_1 = group_by_target(infile)

    # Shuffle within each class so the slices below are random samples.
    random.shuffle(data_0)
    random.shuffle(data_1)

    splits_0 = split_by_ratio(data_0, train_ratio, valid_ratio)
    splits_1 = split_by_ratio(data_1, train_ratio, valid_ratio)

    # Write data to respective files
    output_paths = (train_filepath, valid_filepath, test_filepath)
    for path, part_0, part_1 in zip(output_paths, splits_0, splits_1):
        merged = part_0 + part_1
        # Interleave the two classes; otherwise each file would list every
        # target-0 entry before the first target-1 entry.
        random.shuffle(merged)
        write_jsonl(path, merged)

    print("File splitting complete!")


if __name__ == "__main__":
    main()