#!/bin/bash
#
# Site 1 - embedding extraction and federated-learning file preparation.
#
# Steps:
#   1. Run extract-embeddings.py on the site-1 train and test splits.
#   2. Copy each split's all_embeddings.parquet into ${OUTPUT_BASE}/fl_ready/.
#   3. Write a reduced label CSV per split (columns 1 and 14-20 of the split
#      CSV) into the same directory.
#
# extract-embeddings.py, OUTPUT_BASE and SITE_SPLITS are relative paths, so
# they are resolved against the current working directory.
#
# NOTE(review): the status glyphs in the messages below were restored from
# mis-encoded bytes; confirm they match the intended originals.

set -euo pipefail

readonly ROOT_DIR="/roshare/nlst_global/data_23Dec2024/manifest-NLST_allCT/NLST/"
readonly OUTPUT_BASE="site1_data"
readonly SITE_SPLITS="../subsets/site_splits"
readonly FL_READY_DIR="${OUTPUT_BASE}/fl_ready"

# Fields kept in the FL label files.
# NOTE(review): presumably the patient id plus the label columns - confirm
# against the header of the split CSVs.
readonly LABEL_FIELDS="1,14-20"

# Print a message to stderr and abort.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Print the path of the split CSV for the given split.
# Arguments: $1 - split name ("train" or "test")
split_csv_path() {
  printf '%s/%s_pid_labelsT0T7_split_1.csv' "${SITE_SPLITS}" "$1"
}

# Run the embedding extractor for one split.
# Arguments: $1 - split name ("train" or "test")
run_extraction() {
  local split=$1
  local pid_csv
  pid_csv=$(split_csv_path "${split}")

  python extract-embeddings.py \
    --root-dir "${ROOT_DIR}" \
    --pid-csv "${pid_csv}" \
    --output-dir "${OUTPUT_BASE}/${split}" \
    --num-workers 8 \
    --checkpoint-interval 500
}

# Copy one split's embeddings into the FL-ready directory.
# Arguments: $1 - split name ("train" or "test")
publish_embeddings() {
  local split=$1
  cp -- "${OUTPUT_BASE}/${split}/all_embeddings.parquet" \
    "${FL_READY_DIR}/site1_embeddings_${split}.parquet"
}

# Write the reduced label CSV for one split.
# A single cut over the whole file yields the header and the data rows
# together, so no separate head/tail passes are needed.
# Arguments: $1 - split name ("train" or "test")
write_labels() {
  local split=$1
  local pid_csv
  pid_csv=$(split_csv_path "${split}")

  cut -d, -f"${LABEL_FIELDS}" -- "${pid_csv}" \
    > "${FL_READY_DIR}/site1_labels-${split}.csv"
}

main() {
  local split pid_csv

  echo "============================================"
  echo "SITE 1 - Embedding Extraction"
  echo "============================================"
  echo ""

  # Fail fast: a missing test CSV should not be discovered only after the
  # whole training extraction has already run.
  for split in train test; do
    pid_csv=$(split_csv_path "${split}")
    [[ -r "${pid_csv}" ]] || die "split CSV not readable: ${pid_csv}"
  done

  mkdir -p -- "${OUTPUT_BASE}/train"
  mkdir -p -- "${OUTPUT_BASE}/test"

  echo "📦 Extracting TRAINING embeddings..."
  run_extraction train
  echo ""
  echo "✓ Training embeddings complete!"
  echo ""

  echo "📦 Extracting TEST embeddings..."
  run_extraction test
  echo ""
  echo "✓ Test embeddings complete!"
  echo ""

  echo "📁 Preparing files for federated learning..."
  mkdir -p -- "${FL_READY_DIR}"

  publish_embeddings train
  publish_embeddings test

  echo "Creating site1_labels-train.csv..."
  write_labels train

  echo "Creating site1_labels-test.csv..."
  write_labels test

  echo ""
  echo "============================================"
  echo "SITE 1 - COMPLETE! ✅"
  echo "============================================"
  echo ""
  echo "FL-ready files in: ${OUTPUT_BASE}/fl_ready/"
  ls -lh -- "${FL_READY_DIR}/"
  echo ""
  echo "Files ready for federated learning:"
  echo "  ✓ site1_embeddings_train.parquet"
  echo "  ✓ site1_embeddings_test.parquet"
  echo "  ✓ site1_labels-train.csv"
  echo "  ✓ site1_labels-test.csv"
  echo ""
}

main