# ManojAlexender's picture
# Upload folder using huggingface_hub
# 1007d88 verified
import json
import random
# Input and output file paths
input_filepath = 'commit_data_hpc_modified.jsonl'
train_filepath = 'train.jsonl'
valid_filepath = 'valid.jsonl'
test_filepath = 'test.jsonl'

# Split ratios; whatever remains after train + valid goes to the test split.
train_ratio, valid_ratio = 0.8, 0.1  # test_ratio = 0.1


def read_entries_by_target(filepath):
    """Read a JSONL file and group its entries by their "target" value.

    Args:
        filepath: Path of the JSONL file to read (one JSON object per line).

    Returns:
        A ``(data_0, data_1)`` tuple: the entries whose ``"target"`` equals 0
        and those whose ``"target"`` equals 1, each in file order. Entries
        with any other target value are ignored.

    Raises:
        KeyError: If an entry has no ``"target"`` key.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data_0 = []
    data_1 = []
    with open(filepath, 'r', encoding='utf-8') as infile:
        for line in infile:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) carry no entry; json.loads('') would raise on them.
                continue
            entry = json.loads(line)
            if entry['target'] == 0:
                data_0.append(entry)
            elif entry['target'] == 1:
                data_1.append(entry)
    return data_0, data_1


def split_by_ratio(items, train_ratio, valid_ratio):
    """Cut *items* into consecutive train / valid / test slices.

    Args:
        items: The list to split; it is not modified.
        train_ratio: Fraction of the items assigned to the train slice.
        valid_ratio: Fraction of the items assigned to the valid slice.

    Returns:
        A ``(train, valid, test)`` tuple of lists. Cut points are truncated
        with ``int()``, so any remainder ends up in the test slice.
    """
    # Compute each cut point once instead of repeating the arithmetic.
    train_end = int(len(items) * train_ratio)
    valid_end = int(len(items) * (train_ratio + valid_ratio))
    return items[:train_end], items[train_end:valid_end], items[valid_end:]


def write_jsonl(filepath, entries):
    """Write *entries* to *filepath*, one JSON document per line."""
    with open(filepath, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(entry) + '\n' for entry in entries)


def main(seed=None):
    """Split the input file into train/valid/test files, stratified by target.

    Each target class is shuffled and split separately with the module-level
    ratios, so every output file keeps the class proportions of the input.
    Within each output file the class-0 entries come first, followed by the
    class-1 entries.

    Args:
        seed: Optional seed for the shuffle. Pass a value to get the same
            split on every run; the default ``None`` gives a different
            shuffle each run.
    """
    rng = random.Random(seed)

    # Read and collect entries based on "target" value
    data_0, data_1 = read_entries_by_target(input_filepath)

    # Shuffle the data
    rng.shuffle(data_0)
    rng.shuffle(data_1)

    # Split each class on its own, then concatenate the matching slices
    train_0, valid_0, test_0 = split_by_ratio(data_0, train_ratio, valid_ratio)
    train_1, valid_1, test_1 = split_by_ratio(data_1, train_ratio, valid_ratio)

    # Write data to respective files
    write_jsonl(train_filepath, train_0 + train_1)
    write_jsonl(valid_filepath, valid_0 + valid_1)
    write_jsonl(test_filepath, test_0 + test_1)

    print("File splitting complete!")


if __name__ == "__main__":
    main()