# TSEditor / utils / metric_utils.py
# PeterYu's picture
# update
# 2875fe6
# raw
# history blame
# 14.9 kB
## Necessary Packages
import scipy.stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
def display_scores(results):
    """Print and return the mean score with its 95% confidence half-width.

    The half-width is the standard error of the mean scaled by the two-sided
    95% Student-t quantile with ``len(results) - 1`` degrees of freedom, so
    the interval is correct for any number of repeated runs (the degrees of
    freedom were previously fixed at 4, i.e. exactly five runs).

    Args:
    - results: sequence of numeric scores, one per repeated run

    Returns:
    - mean: average of the scores
    - sigma: half-width of the 95% confidence interval around the mean
    """
    mean = np.mean(results)
    # Standard error of the mean (scipy.stats.sem).
    std_err = scipy.stats.sem(results)
    # Degrees of freedom follow the actual sample size.
    dof = len(results) - 1
    sigma = std_err * scipy.stats.t.ppf((1 + 0.95) / 2.0, dof)
    print("Final Score: ", f"{mean} \xB1 {sigma}")
    return mean, sigma
def _split_by_rate(samples, times, train_rate):
    """Randomly split paired samples/times into train and test lists."""
    no = len(samples)
    idx = np.random.permutation(no)
    # Compute the split point once; the first `cut` shuffled indices train.
    cut = int(no * train_rate)
    train_idx, test_idx = idx[:cut], idx[cut:]
    return (
        [samples[i] for i in train_idx],
        [samples[i] for i in test_idx],
        [times[i] for i in train_idx],
        [times[i] for i in test_idx],
    )


def train_test_divide(data_x, data_x_hat, data_t, data_t_hat, train_rate=0.8):
    """Divide train and test data for both original and synthetic data.

    The original and the synthetic data are shuffled independently (original
    first, then synthetic), and each sample stays paired with its own time
    entry.

    Args:
    - data_x: original data
    - data_x_hat: generated data
    - data_t: original time
    - data_t_hat: generated time
    - train_rate: fraction of each data set placed in the training split

    Returns:
    - train_x, train_x_hat, test_x, test_x_hat,
      train_t, train_t_hat, test_t, test_t_hat: lists holding the split data
    """
    # Divide train/test index (original data)
    train_x, test_x, train_t, test_t = _split_by_rate(data_x, data_t, train_rate)
    # Divide train/test index (synthetic data)
    train_x_hat, test_x_hat, train_t_hat, test_t_hat = _split_by_rate(
        data_x_hat, data_t_hat, train_rate
    )
    return (
        train_x,
        train_x_hat,
        test_x,
        test_x_hat,
        train_t,
        train_t_hat,
        test_t,
        test_t_hat,
    )
def extract_time(data):
    """Returns Maximum sequence length and each sequence length.

    Args:
    - data: original data; an iterable of sequences, each indexed as
      ``seq[:, 0]`` (so every sequence must be at least two-dimensional)

    Returns:
    - time: list with the length of every sequence, in input order
    - max_seq_len: maximum sequence length (0 when data is empty)
    """
    # Measure each sequence once, via the length of its first column.
    time = [len(seq[:, 0]) for seq in data]
    max_seq_len = max(time, default=0)
    return time, max_seq_len
def visualization(ori_data, generated_data, analysis, compare=3000, output_label=""):
    """Using PCA, tSNE or a kernel density plot to compare generated and original data.

    At most ``compare`` samples are drawn at random. Every sample is reduced
    to one value per position of its second axis by averaging over its last
    axis. The resulting figure is shown with ``plt.show()``, written to
    ``./figures/{output_label}_{analysis}.pdf`` (the directory is created if
    needed) and then closed.

    Args:
    - ori_data: original data, a 3-D array
    - generated_data: generated synthetic data; it is indexed with the sample
      indices drawn for ori_data, so it should hold at least as many samples
    - analysis: "tsne", "pca" or "kernel"; any other value plots nothing
    - compare: maximum number of samples to analyse (for faster computation)
    - output_label: prefix of the output PDF file name
    """
    import os

    def _export_pdf(figure, suffix):
        # Save the figure under ./figures and release it afterwards.
        from matplotlib.backends.backend_pdf import PdfPages

        path = f"./figures/{output_label}_{suffix}.pdf"
        # Make sure the target directory exists before opening the file.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # The context manager closes the PDF even if savefig raises.
        with PdfPages(path) as pdf:
            pdf.savefig(figure)
        # Close only the figure created here so repeated calls do not pile up.
        plt.close(figure)

    # Analysis sample size (for faster computation)
    anal_sample_no = min(compare, ori_data.shape[0])
    idx = np.random.permutation(ori_data.shape[0])[:anal_sample_no]

    # Data preprocessing: average over the last axis for all samples in one
    # vectorized call instead of concatenating one row per sample.
    prep_data = np.mean(ori_data[idx], axis=2)
    prep_data_hat = np.mean(generated_data[idx], axis=2)

    # Visualization parameter
    colors = [
        "#F4A582",
        "#0571B0",
        "#5E4FA2",
        "#54278F",
    ]

    if analysis == "pca":
        # PCA Analysis: fit on the original data only, then project both sets.
        pca = PCA(n_components=2)
        pca.fit(prep_data)
        pca_results = pca.transform(prep_data)
        pca_hat_results = pca.transform(prep_data_hat)

        # Plotting
        fig, ax = plt.subplots(1, figsize=(8, 6))
        ax.scatter(
            pca_results[:, 0],
            pca_results[:, 1],
            color=colors[0],
            alpha=0.5,
            label="Original",
        )
        ax.scatter(
            pca_hat_results[:, 0],
            pca_hat_results[:, 1],
            color=colors[1],
            alpha=0.5,
            label="Generated",
        )
        ax.legend()
        ax.set_title("PCA plot")
        ax.set_xlabel("x")
        ax.set_ylabel("y")
        plt.show()
        _export_pdf(fig, "pca")

    elif analysis == "tsne":
        # Do t-SNE Analysis together: original rows first, generated rows after.
        prep_data_final = np.concatenate((prep_data, prep_data_hat), axis=0)

        # TSNE analysis
        tsne_kwargs = dict(n_components=2, verbose=1, perplexity=40)
        try:
            # Newer scikit-learn releases call the iteration budget `max_iter`.
            tsne = TSNE(max_iter=300, **tsne_kwargs)
        except TypeError:
            # Older releases only know the former name `n_iter`.
            tsne = TSNE(n_iter=300, **tsne_kwargs)
        tsne_results = tsne.fit_transform(prep_data_final)

        # Plotting
        fig, ax = plt.subplots(1, figsize=(8, 6))
        ax.scatter(
            tsne_results[:anal_sample_no, 0],
            tsne_results[:anal_sample_no, 1],
            color=colors[0],
            alpha=0.5,
            label="Original",
        )
        ax.scatter(
            tsne_results[anal_sample_no:, 0],
            tsne_results[anal_sample_no:, 1],
            color=colors[1],
            alpha=0.5,
            label="Generated",
        )
        ax.legend()
        ax.set_title("t-SNE plot")
        ax.set_xlabel("x")
        ax.set_ylabel("y")
        plt.show()
        _export_pdf(fig, "tsne")

    elif analysis == "kernel":
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # it is kept here so the rendered output stays the same.
        fig, ax = plt.subplots(1, figsize=(8, 6))
        sns.distplot(
            prep_data,
            hist=False,
            kde=True,
            kde_kws={"linewidth": 2},
            label="Original",
            color=colors[0],
        )
        sns.distplot(
            prep_data_hat,
            hist=False,
            kde=True,
            kde_kws={"linewidth": 2, "linestyle": "--"},
            label="Generated",
            color=colors[1],
        )
        # Plot formatting
        plt.legend()
        plt.xlabel("Data Value")
        plt.ylabel("Data Density Estimate")
        plt.show()
        _export_pdf(fig, "kernel")
def visualization_control(data, analysis, compare=3000, output_label=""):
    """Using PCA, tSNE or a kernel density plot to compare several data sets.

    The entry ``"ori_data"`` is plotted as "Original"; every other entry of
    ``data`` is plotted as its own series labelled with its key. At most
    ``compare`` samples are drawn at random, and the same sample indices are
    applied to every entry. Every sample is reduced to one value per position
    of its second axis by averaging over its last axis. The figure is shown
    with ``plt.show()``, written to
    ``./figures/{output_label}_{analysis}.pdf`` (the directory is created if
    needed) and then closed. The ``data`` dictionary itself is left untouched.

    Args:
    - data: dictionary of original and generated data; must contain the key
      "ori_data", and all values are 3-D arrays with at least as many samples
      as "ori_data"
    - analysis: "tsne", "pca" or "kernel"; any other value plots nothing
    - compare: maximum number of samples to analyse (for faster computation)
    - output_label: prefix of the output PDF file name

    Raises:
    - ValueError: if data has no "ori_data" entry
    """
    import os

    def _export_pdf(figure, suffix):
        # Save the figure under ./figures and release it afterwards.
        from matplotlib.backends.backend_pdf import PdfPages

        path = f"./figures/{output_label}_{suffix}.pdf"
        # Make sure the target directory exists before opening the file.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # The context manager closes the PDF even if savefig raises.
        with PdfPages(path) as pdf:
            pdf.savefig(figure)
        # Close only the figure created here so repeated calls do not pile up.
        plt.close(figure)

    if "ori_data" not in data:
        raise ValueError('data must contain an "ori_data" entry')
    ori_data = data["ori_data"]
    keys = [key for key in data if key != "ori_data"]

    # Analysis sample size (for faster computation)
    anal_sample_no = min(compare, ori_data.shape[0])
    idx = np.random.permutation(ori_data.shape[0])[:anal_sample_no]

    # Data preprocessing: subsample and average over the last axis in one
    # vectorized call per entry. Results go into local variables so the
    # caller's dictionary is not overwritten with the subsampled arrays.
    prep_data = np.mean(ori_data[idx], axis=2)
    preprossed_data = {key: np.mean(data[key][idx], axis=2) for key in keys}

    # Visualization parameter: index 0 is the original data, series i uses
    # index i + 1 and wraps around when there are more series than colors.
    colors = [
        "#CA0020",
        "#F4A582",
        "#92C5DE",
        "#0571B0",
        "#5E4FA2",
        "#54278F",
        "#6A3D9A",
        "#9E0142",
        "#D53E4F",
        "#F46D43",
        "#FDAE61",
        "#FEE08B",
    ]

    def _series_color(position):
        return colors[(position + 1) % len(colors)]

    if analysis == "pca":
        # PCA Analysis: fit on the original data only, then project every set.
        pca = PCA(n_components=2)
        pca.fit(prep_data)
        pca_results = pca.transform(prep_data)
        pca_control_results = {
            key: pca.transform(preprossed_data[key]) for key in keys
        }

        # Plotting
        fig, ax = plt.subplots(1, figsize=(8, 6))
        ax.scatter(
            pca_results[:, 0],
            pca_results[:, 1],
            color=colors[0],
            alpha=0.5,
            label="Original",
        )
        for i, key in enumerate(keys):
            ax.scatter(
                pca_control_results[key][:, 0],
                pca_control_results[key][:, 1],
                color=_series_color(i),
                alpha=0.5,
                label=key,
            )
        ax.legend()
        ax.set_title("PCA plot")
        ax.set_xlabel("x")
        ax.set_ylabel("y")
        plt.show()
        _export_pdf(fig, "pca")

    elif analysis == "tsne":
        # Do t-SNE Analysis together: original rows first, then one block of
        # anal_sample_no rows per key, in key order.
        prep_data_final = np.concatenate(
            [prep_data] + [preprossed_data[key] for key in keys], axis=0
        )

        # TSNE analysis
        tsne_kwargs = dict(n_components=2, verbose=1, perplexity=40)
        try:
            # Newer scikit-learn releases call the iteration budget `max_iter`.
            tsne = TSNE(max_iter=300, **tsne_kwargs)
        except TypeError:
            # Older releases only know the former name `n_iter`.
            tsne = TSNE(n_iter=300, **tsne_kwargs)
        tsne_results = tsne.fit_transform(prep_data_final)

        # Plotting
        fig, ax = plt.subplots(1, figsize=(8, 6))
        ax.scatter(
            tsne_results[:anal_sample_no, 0],
            tsne_results[:anal_sample_no, 1],
            color=colors[0],
            alpha=0.5,
            label="Original",
        )
        for i, key in enumerate(keys):
            start = (i + 1) * anal_sample_no
            stop = (i + 2) * anal_sample_no
            ax.scatter(
                tsne_results[start:stop, 0],
                tsne_results[start:stop, 1],
                color=_series_color(i),
                alpha=0.5,
                label=key,
            )
        ax.legend()
        ax.set_title("t-SNE plot")
        ax.set_xlabel("x")
        ax.set_ylabel("y")
        plt.show()
        _export_pdf(fig, "tsne")

    elif analysis == "kernel":
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # it is kept here so the rendered output stays the same.
        fig, ax = plt.subplots(1, figsize=(8, 6))
        sns.distplot(
            prep_data,
            hist=False,
            kde=True,
            kde_kws={"linewidth": 2},
            label="Original",
            color=colors[0],
        )
        for i, key in enumerate(keys):
            sns.distplot(
                preprossed_data[key],
                hist=False,
                kde=True,
                kde_kws={"linewidth": 2, "linestyle": "--"},
                label=key,
                color=_series_color(i),
            )
        # Plot formatting
        plt.legend()
        plt.xlabel("Data Value")
        plt.ylabel("Data Density Estimate")
        plt.show()
        _export_pdf(fig, "kernel")
def save_pdf(fig, path):
    """Save a figure as a PDF file with a tight bounding box.

    Args:
    - fig: object providing ``savefig`` (presumably a matplotlib figure)
    - path: destination handed unchanged to ``fig.savefig``
    """
    fig.savefig(path, bbox_inches="tight", format="pdf")
if __name__ == "__main__":
    # Nothing to execute: the module only provides helper functions.
    pass