Spaces:
Sleeping
Sleeping
## Necessary Packages
import os

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
def display_scores(results, confidence=0.95):
    """Print and return the mean score and its confidence-interval half-width.

    The half-width is the standard error of the mean scaled by the two-sided
    Student-t critical value for ``len(results) - 1`` degrees of freedom.

    Args:
    - results: sequence of numeric scores, one per repeated run
    - confidence: two-sided confidence level of the interval (default 0.95)

    Returns:
    - mean: arithmetic mean of results
    - sigma: half-width of the confidence interval around the mean
    """
    mean = np.mean(results)
    # Degrees of freedom follow the actual number of runs; a fixed value is
    # only correct for one particular sample size.
    dof = len(results) - 1
    t_critical = scipy.stats.t.ppf((1 + confidence) / 2.0, dof)
    sigma = scipy.stats.sem(results) * t_critical
    print("Final Score: ", f"{mean} \xB1 {sigma}")
    return mean, sigma
def train_test_divide(data_x, data_x_hat, data_t, data_t_hat, train_rate=0.8):
    """Randomly split original and synthetic data into train and test parts.

    The original pair (data_x, data_t) and the synthetic pair
    (data_x_hat, data_t_hat) are shuffled independently; within a pair the
    same shuffled indices are applied to the samples and to their times.

    Args:
    - data_x: original data
    - data_x_hat: generated data
    - data_t: original time
    - data_t_hat: generated time
    - train_rate: ratio of training data from the original data

    Returns:
    - tuple (train_x, train_x_hat, test_x, test_x_hat,
      train_t, train_t_hat, test_t, test_t_hat) of Python lists
    """

    def _shuffle_split(samples, times):
        # One permutation per data set; the first `cut` indices form the
        # training part, the remainder the test part.
        count = len(samples)
        order = np.random.permutation(count)
        cut = int(count * train_rate)
        head, tail = order[:cut], order[cut:]
        return (
            [samples[j] for j in head],
            [samples[j] for j in tail],
            [times[j] for j in head],
            [times[j] for j in tail],
        )

    # Original data first, synthetic second (keeps the random draw order).
    train_x, test_x, train_t, test_t = _shuffle_split(data_x, data_t)
    train_x_hat, test_x_hat, train_t_hat, test_t_hat = _shuffle_split(
        data_x_hat, data_t_hat
    )

    return (
        train_x,
        train_x_hat,
        test_x,
        test_x_hat,
        train_t,
        train_t_hat,
        test_t,
        test_t_hat,
    )
def extract_time(data):
    """Collect the length of every sequence and the longest length.

    Args:
    - data: original data; each element is indexed as ``element[:, 0]``

    Returns:
    - time: list with the length of each sequence, in input order
    - max_seq_len: largest sequence length (0 when data is empty)
    """
    # The length is taken from the first column of each sequence.
    time = [len(sequence[:, 0]) for sequence in data]
    max_seq_len = max(time, default=0)
    return time, max_seq_len
def visualization(ori_data, generated_data, analysis, compare=3000, output_label=""):
    """Plot original against generated data using PCA, t-SNE or a kernel density.

    Every sequence is reduced to one value per time step by averaging over
    its features. The figure is shown and then written to
    ``./figures/{output_label}_{analysis}.pdf`` (the directory is created
    when missing).

    Args:
    - ori_data: original data, array-like of shape (samples, seq_len, dim)
    - generated_data: generated synthetic data, same layout as ori_data
    - analysis: tsne or pca or kernel; any other value draws nothing
    - compare: maximum number of samples taken from each data set
    - output_label: prefix of the saved PDF file name
    """
    ori_data = np.asarray(ori_data)
    generated_data = np.asarray(generated_data)

    # Analysis sample size (for faster computation). Both sets contribute the
    # same number of rows, so the smaller set bounds the sample size.
    anal_sample_no = min(compare, ori_data.shape[0], generated_data.shape[0])
    idx = np.random.permutation(ori_data.shape[0])[:anal_sample_no]
    if generated_data.shape[0] >= ori_data.shape[0]:
        # Every index drawn for the original data is valid here as well.
        idx_hat = idx
    else:
        # Fewer generated samples than original ones: draw indices that are
        # guaranteed to be in range instead of failing with an IndexError.
        idx_hat = np.random.permutation(generated_data.shape[0])[:anal_sample_no]

    # Data preprocessing: mean over the feature axis -> (samples, seq_len).
    prep_data = np.mean(ori_data[idx], axis=2)
    prep_data_hat = np.mean(generated_data[idx_hat], axis=2)

    # Visualization parameter
    color_original = "#F4A582"
    color_generated = "#0571B0"

    fig = None
    if analysis in ("pca", "tsne"):
        if analysis == "pca":
            # PCA is fitted on the original data only; the generated data is
            # projected onto the same components.
            pca = PCA(n_components=2)
            pca.fit(prep_data)
            points = pca.transform(prep_data)
            points_hat = pca.transform(prep_data_hat)
            title = "PCA plot"
        else:
            # t-SNE has no separate transform step, so both sets are
            # embedded together and split again afterwards.
            # NOTE(review): newer scikit-learn releases rename `n_iter` to
            # `max_iter` - confirm against the installed version.
            tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
            embedded = tsne.fit_transform(
                np.concatenate((prep_data, prep_data_hat), axis=0)
            )
            points = embedded[:anal_sample_no]
            points_hat = embedded[anal_sample_no:]
            title = "t-SNE plot"

        fig, ax = plt.subplots(1, figsize=(8, 6))
        ax.scatter(
            points[:, 0],
            points[:, 1],
            color=color_original,
            alpha=0.5,
            label="Original",
        )
        ax.scatter(
            points_hat[:, 0],
            points_hat[:, 1],
            color=color_generated,
            alpha=0.5,
            label="Generated",
        )
        ax.legend()
        ax.set_title(title)
        ax.set_xlabel("x")
        ax.set_ylabel("y")
    elif analysis == "kernel":
        fig, ax = plt.subplots(1, figsize=(8, 6))
        sns.distplot(
            prep_data,
            hist=False,
            kde=True,
            kde_kws={"linewidth": 2},
            label="Original",
            color=color_original,
            ax=ax,
        )
        sns.distplot(
            prep_data_hat,
            hist=False,
            kde=True,
            kde_kws={"linewidth": 2, "linestyle": "--"},
            label="Generated",
            color=color_generated,
            ax=ax,
        )
        # Plot formatting
        ax.legend()
        ax.set_xlabel("Data Value")
        ax.set_ylabel("Data Density Estimate")

    if fig is not None:
        plt.show()
        os.makedirs("./figures", exist_ok=True)
        # The context manager closes the PDF even if saving raises.
        with PdfPages(f"./figures/{output_label}_{analysis}.pdf") as pdf:
            pdf.savefig(fig)
    plt.close()
def visualization_control(data, analysis, compare=3000, output_label=""):
    """Plot original data against several generated sets (PCA, t-SNE or kernel).

    Every sequence is reduced to one value per time step by averaging over
    its features. The figure is shown and then written to
    ``./figures/{output_label}_{analysis}.pdf`` (the directory is created
    when missing). The ``data`` dictionary itself is left unmodified.

    Args:
    - data: dictionary of original and generated data; the original data is
      stored under "ori_data", every other key names one generated set, each
      array-like of shape (samples, seq_len, dim)
    - analysis: tsne or pca or kernel; any other value draws nothing
    - compare: maximum number of samples taken from each data set
    - output_label: prefix of the saved PDF file name

    Raises:
    - ValueError: if data has no "ori_data" entry
    """
    if "ori_data" not in data:
        raise ValueError('data must contain an "ori_data" entry')
    ori_data = np.asarray(data["ori_data"])
    keys = [key for key in data if key != "ori_data"]
    # Work on local references so the caller's dictionary keeps its full
    # arrays and can be passed in again for another analysis.
    generated = {key: np.asarray(data[key]) for key in keys}

    # Analysis sample size (for faster computation). All sets contribute the
    # same number of rows, so the smallest set bounds the sample size.
    anal_sample_no = min(
        [compare, ori_data.shape[0]] + [arr.shape[0] for arr in generated.values()]
    )
    idx = np.random.permutation(ori_data.shape[0])[:anal_sample_no]

    # Data preprocessing: mean over the feature axis -> (samples, seq_len).
    prep_data = np.mean(ori_data[idx], axis=2)
    preprossed_data = {}
    for key, arr in generated.items():
        if arr.shape[0] >= ori_data.shape[0]:
            # Every index drawn for the original data is valid here as well.
            key_idx = idx
        else:
            # Smaller set: draw indices that are guaranteed to be in range.
            key_idx = np.random.permutation(arr.shape[0])[:anal_sample_no]
        preprossed_data[key] = np.mean(arr[key_idx], axis=2)

    # Visualization parameter; the palette is cycled so any number of
    # generated sets gets a color.
    palette = [
        "#CA0020",
        "#F4A582",
        "#92C5DE",
        "#0571B0",
        "#5E4FA2",
        "#54278F",
        "#6A3D9A",
        "#9E0142",
        "#D53E4F",
        "#F46D43",
        "#FDAE61",
        "#FEE08B",
    ]

    fig = None
    if analysis in ("pca", "tsne"):
        if analysis == "pca":
            # PCA is fitted on the original data only; every generated set
            # is projected onto the same components.
            pca = PCA(n_components=2)
            pca.fit(prep_data)
            series = [("Original", pca.transform(prep_data))]
            series += [(key, pca.transform(preprossed_data[key])) for key in keys]
            title = "PCA plot"
        else:
            # t-SNE has no separate transform step, so all sets are embedded
            # together and split into equally sized slices afterwards.
            # NOTE(review): newer scikit-learn releases rename `n_iter` to
            # `max_iter` - confirm against the installed version.
            stacked = np.concatenate(
                [prep_data] + [preprossed_data[key] for key in keys], axis=0
            )
            tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
            embedded = tsne.fit_transform(stacked)
            labels = ["Original"] + keys
            series = [
                (label, embedded[k * anal_sample_no:(k + 1) * anal_sample_no])
                for k, label in enumerate(labels)
            ]
            title = "t-SNE plot"

        fig, ax = plt.subplots(1, figsize=(8, 6))
        for k, (label, points) in enumerate(series):
            ax.scatter(
                points[:, 0],
                points[:, 1],
                color=palette[k % len(palette)],
                alpha=0.5,
                label=label,
            )
        ax.legend()
        ax.set_title(title)
        ax.set_xlabel("x")
        ax.set_ylabel("y")
    elif analysis == "kernel":
        fig, ax = plt.subplots(1, figsize=(8, 6))
        sns.distplot(
            prep_data,
            hist=False,
            kde=True,
            kde_kws={"linewidth": 2},
            label="Original",
            color=palette[0],
            ax=ax,
        )
        for i, key in enumerate(keys):
            sns.distplot(
                preprossed_data[key],
                hist=False,
                kde=True,
                kde_kws={"linewidth": 2, "linestyle": "--"},
                label=key,
                color=palette[(i + 1) % len(palette)],
                ax=ax,
            )
        # Plot formatting
        ax.legend()
        ax.set_xlabel("Data Value")
        ax.set_ylabel("Data Density Estimate")

    if fig is not None:
        plt.show()
        os.makedirs("./figures", exist_ok=True)
        # The context manager closes the PDF even if saving raises.
        with PdfPages(f"./figures/{output_label}_{analysis}.pdf") as pdf:
            pdf.savefig(fig)
    plt.close()
def save_pdf(fig, path):
    """Write a figure to ``path`` as a PDF with a tight bounding box.

    Args:
    - fig: object exposing ``savefig`` (presumably a matplotlib Figure)
    - path: destination of the PDF file
    """
    save_options = {"format": "pdf", "bbox_inches": "tight"}
    fig.savefig(path, **save_options)
# Script entry point: intentionally empty, so running this module directly
# performs no work; the functions above are meant to be imported.
if __name__ == "__main__":
    pass