In [1]:
import mediapipe as mp
import cv2
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score, confusion_matrix
from sklearn.calibration import CalibratedClassifierCV

import warnings
# Silence every warning category for the rest of the session. This keeps the
# cell outputs short, but it also hides anything the libraries used below
# might warn about while fitting or transforming data.
warnings.filterwarnings('ignore')

# Drawing helpers: short aliases for MediaPipe's drawing utilities and pose solution
mp_drawing = mp.solutions.drawing_utils
mp_pose = mp.solutions.pose

objc[58344]: Class CaptureDelegate is implemented in both /Users/fuixlabsdev1/Programming/PP/graduation-thesis/env/lib/python3.8/site-packages/mediapipe/.dylibs/libopencv_videoio.3.4.16.dylib (0x10ae08860) and /Users/fuixlabsdev1/Programming/PP/graduation-thesis/env/lib/python3.8/site-packages/cv2/cv2.abi3.so (0x15eece480). One of the two will be used. Which one is undefined.
objc[58344]: Class CVWindow is implemented in both /Users/fuixlabsdev1/Programming/PP/graduation-thesis/env/lib/python3.8/site-packages/mediapipe/.dylibs/libopencv_highgui.3.4.16.dylib (0x105baca68) and /Users/fuixlabsdev1/Programming/PP/graduation-thesis/env/lib/python3.8/site-packages/cv2/cv2.abi3.so (0x15eece4d0). One of the two will be used. Which one is undefined.
objc[58344]: Class CVView is implemented in both /Users/fuixlabsdev1/Programming/PP/graduation-thesis/env/lib/python3.8/site-packages/mediapipe/.dylibs/libopencv_highgui.3.4.16.dylib (0x105baca90) and /Users/fuixlabsdev1/Programming/PP/graduation-th

## 1. Set up important functions

In [2]:
def rescale_frame(frame, percent=50):
    '''
    Rescale a frame to a given percentage of its original size.

    Args:
        frame: image array whose ``shape`` starts with (height, width).
        percent: target size as a percentage of the original size
            (50 halves both sides). Must be greater than zero.

    Returns:
        The resized frame produced by ``cv2.resize`` with ``cv2.INTER_AREA``.

    Raises:
        ValueError: if ``percent`` is not greater than zero.
    '''
    if percent <= 0:
        raise ValueError(f"percent must be greater than 0, got {percent}")

    # Clamp each side to at least one pixel: the integer truncation below can
    # yield 0 for small frames or small percentages, and cv2.resize cannot
    # produce an image with an empty target size.
    width = max(1, int(frame.shape[1] * percent / 100))
    height = max(1, int(frame.shape[0] * percent / 100))

    # cv2.resize expects the target size as (width, height)
    return cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)


def describe_dataset(dataset_path: str):
    '''
    Load a CSV dataset and print a short summary of it.

    Prints the column headers, the row and column counts, the per-class
    counts of the ``label`` column, whether any value is missing, and the
    number of duplicated rows.

    Args:
        dataset_path: path to a CSV file that contains a ``label`` column.

    Returns:
        The loaded ``pandas.DataFrame``.
    '''

    data = pd.read_csv(dataset_path)
    print(f"Headers: {list(data.columns.values)}")
    print(f'Number of rows: {data.shape[0]} \nNumber of columns: {data.shape[1]}\n')
    print(f"Labels: \n{data['label'].value_counts()}\n")
    print(f"Missing values: {data.isnull().values.any()}\n")

    # Count duplicates straight from the boolean mask. Summing the duplicated
    # rows themselves would try to add the text label to the float columns,
    # which is both unnecessary and an error for mixed str/float rows.
    duplicate_count = int(data.duplicated().sum())
    print(f"Duplicate Rows : {duplicate_count}")

    return data


def round_up_metric_results(results) -> list:
    '''Return the metric values (precision, recall, ...) rounded to 3 decimal places.'''
    return [round(value, 3) for value in results]

## 2. Describe and process data

In [3]:
# Locations of the CSV datasets read by the cells below
TRAIN_SET_PATH = "./err.train.csv"
TEST_SET_PATH = "./err.test.csv"

In [6]:
df = describe_dataset(TRAIN_SET_PATH)

# Encode the text labels as integers: "L" -> 0, "C" -> 1
for text_label, numeric_label in (("L", 0), ("C", 1)):
    df.loc[df["label"] == text_label, "label"] = numeric_label

df.tail(3)

Headers: ['label', 'nose_x', 'nose_y', 'nose_z', 'nose_v', 'left_shoulder_x', 'left_shoulder_y', 'left_shoulder_z', 'left_shoulder_v', 'right_shoulder_x', 'right_shoulder_y', 'right_shoulder_z', 'right_shoulder_v', 'left_hip_x', 'left_hip_y', 'left_hip_z', 'left_hip_v', 'right_hip_x', 'right_hip_y', 'right_hip_z', 'right_hip_v', 'left_knee_x', 'left_knee_y', 'left_knee_z', 'left_knee_v', 'right_knee_x', 'right_knee_y', 'right_knee_z', 'right_knee_v', 'left_ankle_x', 'left_ankle_y', 'left_ankle_z', 'left_ankle_v', 'right_ankle_x', 'right_ankle_y', 'right_ankle_z', 'right_ankle_v', 'left_heel_x', 'left_heel_y', 'left_heel_z', 'left_heel_v', 'right_heel_x', 'right_heel_y', 'right_heel_z', 'right_heel_v', 'left_foot_index_x', 'left_foot_index_y', 'left_foot_index_z', 'left_foot_index_v', 'right_foot_index_x', 'right_foot_index_y', 'right_foot_index_z', 'right_foot_index_v']
Number of rows: 17907 
Number of columns: 53

Labels: 
L    9114
C    8793
Name: label, dtype: int64

Missing values:

Unnamed: 0,label,nose_x,nose_y,nose_z,nose_v,left_shoulder_x,left_shoulder_y,left_shoulder_z,left_shoulder_v,right_shoulder_x,...,right_heel_z,right_heel_v,left_foot_index_x,left_foot_index_y,left_foot_index_z,left_foot_index_v,right_foot_index_x,right_foot_index_y,right_foot_index_z,right_foot_index_v
17904,1,0.647438,0.442268,0.004114,0.999985,0.615798,0.51717,0.151706,0.999579,0.631354,...,-0.034228,0.979719,0.701826,0.880516,0.134222,0.979319,0.50488,0.881748,-0.027911,0.986165
17905,1,0.649652,0.419057,0.008783,0.999983,0.617577,0.503514,0.158545,0.999529,0.631972,...,-0.061176,0.980431,0.704606,0.880248,0.071476,0.979932,0.504513,0.881766,-0.088832,0.986975
17906,1,0.653556,0.400394,0.014852,0.99998,0.620734,0.486522,0.169807,0.999556,0.631171,...,-0.138678,0.979078,0.705475,0.878981,0.00369,0.979199,0.504067,0.882642,-0.183304,0.986824


In [5]:
# Load the input scaler that was saved to disk earlier.
# NOTE: pickle.load can execute arbitrary code, so only open files you trust.
with open("./model/input_scaler.pkl", "rb") as scaler_file:
    sc = pickle.load(scaler_file)

In [10]:
# Separate the feature columns from the target column
feature_columns = df.drop(columns="label")
y = df["label"].astype("int")

# Run the features through the loaded scaler and wrap the result in a new DataFrame
X = pd.DataFrame(sc.transform(feature_columns))

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)
y_test.head(3)

10827    0
11395    0
3742     1
Name: label, dtype: int64

## 3. Train & Evaluate Model

### 3.1. Train and evaluate models on a hold-out split of the train set

In [12]:
# Seed for the estimators that draw random numbers while fitting, so that
# re-running this cell reproduces the same models, scores and ranking.
MODEL_RANDOM_STATE = 1234

# (short name, estimator) pairs to compare
algorithms = [
    ("LR", LogisticRegression()),
    ("SVC", SVC(probability=True, random_state=MODEL_RANDOM_STATE)),
    ("KNN", KNeighborsClassifier()),
    ("DTC", DecisionTreeClassifier(random_state=MODEL_RANDOM_STATE)),
    # NOTE(review): the calibration wrapper presumably exists to give the SGD
    # model probability outputs like the other estimators - confirm.
    ("SGDC", CalibratedClassifierCV(SGDClassifier(random_state=MODEL_RANDOM_STATE))),
    ("NB", GaussianNB()),
    ("RF", RandomForestClassifier(random_state=MODEL_RANDOM_STATE)),
]

models = {}
final_results = []

for name, model in algorithms:
    trained_model = model.fit(X_train, y_train)
    models[name] = trained_model

    # Evaluate the fitted model on the hold-out split
    model_results = trained_model.predict(X_test)

    # Per-class scores, reported in the order [class 1, class 0]
    p_score = precision_score(y_test, model_results, average=None, labels=[1, 0])
    a_score = accuracy_score(y_test, model_results)
    r_score = recall_score(y_test, model_results, average=None, labels=[1, 0])
    f1_score_result = f1_score(y_test, model_results, average=None, labels=[1, 0])
    cm = confusion_matrix(y_test, model_results, labels=[1, 0])
    final_results.append((
        name,
        round_up_metric_results(p_score),
        a_score,
        round_up_metric_results(r_score),
        round_up_metric_results(f1_score_result),
        cm,
    ))

# Sort results by the sum of the per-class F1 scores, best first
final_results.sort(key=lambda k: sum(k[4]), reverse=True)
pd.DataFrame(final_results, columns=["Model", "Precision Score", "Accuracy score", "Recall Score", "F1 score", "Confusion Matrix"])

Unnamed: 0,Model,Precision Score,Accuracy score,Recall Score,F1 score,Confusion Matrix
0,SVC,"[1.0, 0.999]",0.999721,"[0.999, 1.0]","[1.0, 1.0]","[[1713, 1], [0, 1868]]"
1,KNN,"[1.0, 0.998]",0.999162,"[0.998, 1.0]","[0.999, 0.999]","[[1711, 3], [0, 1868]]"
2,RF,"[0.999, 0.999]",0.999162,"[0.999, 0.999]","[0.999, 0.999]","[[1712, 2], [1, 1867]]"
3,DTC,"[0.997, 0.997]",0.997208,"[0.997, 0.997]","[0.997, 0.997]","[[1709, 5], [5, 1863]]"
4,LR,"[0.992, 0.987]",0.989391,"[0.986, 0.993]","[0.989, 0.99]","[[1690, 24], [14, 1854]]"
5,SGDC,"[0.992, 0.988]",0.98995,"[0.987, 0.993]","[0.989, 0.99]","[[1692, 22], [14, 1854]]"
6,NB,"[0.963, 0.952]",0.957286,"[0.947, 0.967]","[0.955, 0.959]","[[1623, 91], [62, 1806]]"


### 3.2. Test set evaluation

In [15]:
test_df = describe_dataset(TEST_SET_PATH)
# Shuffle the rows and renumber the index from zero
test_df = test_df.sample(frac=1).reset_index(drop=True)

# Encode the text labels as integers: "L" -> 0, "C" -> 1
for text_label, numeric_label in (("L", 0), ("C", 1)):
    test_df.loc[test_df["label"] == text_label, "label"] = numeric_label

# Split off the target, then run the remaining columns through the loaded scaler
test_y = test_df["label"].astype("int")
test_features = test_df.drop(columns="label")
test_x = pd.DataFrame(sc.transform(test_features))

Headers: ['label', 'nose_x', 'nose_y', 'nose_z', 'nose_v', 'left_shoulder_x', 'left_shoulder_y', 'left_shoulder_z', 'left_shoulder_v', 'right_shoulder_x', 'right_shoulder_y', 'right_shoulder_z', 'right_shoulder_v', 'left_hip_x', 'left_hip_y', 'left_hip_z', 'left_hip_v', 'right_hip_x', 'right_hip_y', 'right_hip_z', 'right_hip_v', 'left_knee_x', 'left_knee_y', 'left_knee_z', 'left_knee_v', 'right_knee_x', 'right_knee_y', 'right_knee_z', 'right_knee_v', 'left_ankle_x', 'left_ankle_y', 'left_ankle_z', 'left_ankle_v', 'right_ankle_x', 'right_ankle_y', 'right_ankle_z', 'right_ankle_v', 'left_heel_x', 'left_heel_y', 'left_heel_z', 'left_heel_v', 'right_heel_x', 'right_heel_y', 'right_heel_z', 'right_heel_v', 'left_foot_index_x', 'left_foot_index_y', 'left_foot_index_z', 'left_foot_index_v', 'right_foot_index_x', 'right_foot_index_y', 'right_foot_index_z', 'right_foot_index_v']
Number of rows: 1107 
Number of columns: 53

Labels: 
L    561
C    546
Name: label, dtype: int64

Missing values: Fa

In [17]:
testset_final_results = []

# Shared options: per-class scores, reported in the order [class 1, class 0]
per_class_options = {"average": None, "labels": [1, 0]}

for name, model in models.items():
    # Predict on the separate test set with the already-fitted model
    model_results = model.predict(test_x)

    p_score = precision_score(test_y, model_results, **per_class_options)
    a_score = accuracy_score(test_y, model_results)
    r_score = recall_score(test_y, model_results, **per_class_options)
    f1_score_result = f1_score(test_y, model_results, **per_class_options)
    cm = confusion_matrix(test_y, model_results, labels=[1, 0])

    testset_final_results.append((
        name,
        round_up_metric_results(p_score),
        a_score,
        round_up_metric_results(r_score),
        round_up_metric_results(f1_score_result),
        cm,
    ))

# Rank the models by the sum of their per-class F1 scores, best first
testset_final_results.sort(key=lambda row: sum(row[4]), reverse=True)
pd.DataFrame(testset_final_results, columns=["Model", "Precision Score", "Accuracy score", "Recall Score", "F1 score", "Confusion Matrix"])

Unnamed: 0,Model,Precision Score,Accuracy score,Recall Score,F1 score,Confusion Matrix
0,LR,"[0.948, 0.998]",0.971996,"[0.998, 0.947]","[0.972, 0.972]","[[545, 1], [30, 531]]"
1,SGDC,"[0.922, 0.998]",0.957543,"[0.998, 0.918]","[0.959, 0.956]","[[545, 1], [46, 515]]"
2,DTC,"[0.95, 0.889]",0.916893,"[0.877, 0.955]","[0.912, 0.921]","[[479, 67], [25, 536]]"
3,RF,"[0.786, 0.921]",0.841915,"[0.934, 0.752]","[0.854, 0.828]","[[510, 36], [139, 422]]"
4,NB,"[0.79, 0.751]",0.768744,"[0.723, 0.813]","[0.755, 0.781]","[[395, 151], [105, 456]]"
5,KNN,"[0.737, 0.799]",0.765131,"[0.815, 0.717]","[0.774, 0.756]","[[445, 101], [159, 402]]"
6,SVC,"[0.659, 0.842]",0.719964,"[0.894, 0.551]","[0.759, 0.666]","[[488, 58], [252, 309]]"


## 4. Dump Models 

According to the test-set evaluation above, LR and SGDC generalize best, so these two models are chosen for further evaluation.

In [18]:
# Persist the whole {name: fitted model} dictionary in a single file
with open("./model/sklearn/err_all_sklearn.pkl", "wb") as all_models_file:
    pickle.dump(models, all_models_file)

In [19]:
# Persist the fitted SGDC model on its own
with open("./model/sklearn/err_SGDC_model.pkl", "wb") as sgdc_file:
    pickle.dump(models["SGDC"], sgdc_file)

In [20]:
# Persist the fitted LR model on its own
with open("./model/sklearn/err_LR_model.pkl", "wb") as lr_file:
    pickle.dump(models["LR"], lr_file)