"""Split a JSONL dataset into train/validation/test files, stratified by label.

Each non-blank line of the input file is parsed as a JSON object and bucketed
by its ``target`` field.  Records whose target equals 0 and records whose
target equals 1 are shuffled and split separately with the same ratios, so
each output file receives the same fraction of both classes.  Records with
any other target value are not written to any output file.
"""

import json
import random

# Input dataset and the three output files (one JSON object per line).
input_filepath = 'commit_data_hpc_modified.jsonl'
train_filepath = 'train.jsonl'
valid_filepath = 'valid.jsonl'
test_filepath = 'test.jsonl'

# Fractions of each class assigned to the train and validation splits; the
# remainder of each class goes to the test split.
train_ratio, valid_ratio = 0.8, 0.1

# Set to an integer to make the shuffle (and therefore the split)
# reproducible.  None gives a different split on every run.
random_seed = None


def load_entries_by_target(path):
    """Read a JSONL file and bucket its records by ``entry['target']``.

    Args:
        path: Path of the JSONL file to read.

    Returns:
        A ``(data_0, data_1)`` tuple of lists holding, in file order, the
        records whose target equals 0 and 1 respectively.  Records with any
        other target value are dropped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'target'`` key.
    """
    data_0 = []
    data_1 = []
    with open(path, 'r', encoding='utf-8') as infile:
        for line in infile:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF);
                # json.loads('') would raise otherwise.
                continue
            entry = json.loads(line)
            target = entry['target']
            if target == 0:
                data_0.append(entry)
            elif target == 1:
                data_1.append(entry)
    return data_0, data_1


def split_three_way(items, train_ratio, valid_ratio):
    """Cut ``items`` into consecutive train/validation/test slices.

    The cut points are ``int(n * train_ratio)`` and
    ``int(n * (train_ratio + valid_ratio))`` where ``n == len(items)``;
    everything after the second cut point is the test slice.

    Args:
        items: Sequence to split (not modified).
        train_ratio: Fraction of ``items`` for the train slice.
        valid_ratio: Fraction of ``items`` for the validation slice.

    Returns:
        A ``(train, valid, test)`` tuple of slices of ``items``.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    # Small tolerance so sums such as 0.7 + 0.3 are not rejected because of
    # floating-point rounding.
    if train_ratio < 0 or valid_ratio < 0 or train_ratio + valid_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and valid_ratio must be non-negative and sum to at most 1"
        )
    count = len(items)
    train_end = int(count * train_ratio)
    valid_end = int(count * (train_ratio + valid_ratio))
    return items[:train_end], items[train_end:valid_end], items[valid_end:]


def stratified_split(data_0, data_1, train_ratio, valid_ratio, seed=None):
    """Shuffle each class independently and split both with the same ratios.

    Args:
        data_0: Records of the first class (not modified; a copy is shuffled).
        data_1: Records of the second class (not modified; a copy is shuffled).
        train_ratio: Fraction of each class for the train split.
        valid_ratio: Fraction of each class for the validation split.
        seed: Optional seed for the shuffle.  ``None`` seeds from system
            entropy, so the result differs between runs.

    Returns:
        A ``(train_data, valid_data, test_data)`` tuple of lists.  Within each
        list the ``data_0`` records come first, followed by the ``data_1``
        records.
    """
    rng = random.Random(seed)

    shuffled_0 = list(data_0)
    rng.shuffle(shuffled_0)
    shuffled_1 = list(data_1)
    rng.shuffle(shuffled_1)

    train_0, valid_0, test_0 = split_three_way(shuffled_0, train_ratio, valid_ratio)
    train_1, valid_1, test_1 = split_three_way(shuffled_1, train_ratio, valid_ratio)

    return train_0 + train_1, valid_0 + valid_1, test_0 + test_1


def write_jsonl(path, entries):
    """Write ``entries`` to ``path`` as one JSON document per line.

    Args:
        path: Destination file; it is created or truncated.
        entries: Iterable of JSON-serializable objects.
    """
    with open(path, 'w', encoding='utf-8') as f:
        for entry in entries:
            f.write(json.dumps(entry) + '\n')


def main():
    """Run the split using the module-level configuration."""
    data_0, data_1 = load_entries_by_target(input_filepath)

    train_data, valid_data, test_data = stratified_split(
        data_0, data_1, train_ratio, valid_ratio, seed=random_seed
    )

    write_jsonl(train_filepath, train_data)
    write_jsonl(valid_filepath, valid_data)
    write_jsonl(test_filepath, test_data)

    print("File splitting complete!")


if __name__ == "__main__":
    main()