# TSEditor / plot.py (revision 2875fe6)
import os
import torch
import numpy as np
os.environ["WANDB_ENABLED"] = "false"
from engine.solver import Trainer
from data.build_dataloader import build_dataloader
from utils.metric_utils import visualization, save_pdf
# from utils.metric_utils import visualization
from utils.io_utils import load_yaml_config, instantiate_from_config
from models.model_utils import unnormalize_to_zero_to_one
from scipy.signal import find_peaks, peak_prominences
# disable user warnings
import warnings
warnings.simplefilter("ignore", UserWarning)
import scipy.stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
class Arguments:
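    """Lightweight stand-in for parsed command-line arguments, as consumed by the dataloaders and Trainer."""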
def __init__(self, config_path) -> None:
self.config_path = config_path
# self.config_path = "./config/control/revenue-baseline-sine.yaml"
self.save_dir = (
"../../../data/" + os.path.basename(self.config_path).split(".")[0]
)
self.gpu = 0
os.makedirs(self.save_dir, exist_ok=True)
self.mode = "infill"
self.missing_ratio = 0.95
self.milestone = 10
import matplotlib as mpl
def create_color_gradient(sorting_value=None, start_color='#FFFF00', end_color='#00008B'):
"""Create color gradient using matplotlib color interpolation."""
def color_fader(c1, c2, mix=0):
"""Fade from color c1 to c2 with mix ratio."""
c1 = np.array(mpl.colors.to_rgb(c1))
c2 = np.array(mpl.colors.to_rgb(c2))
return mpl.colors.to_hex((1-mix)*c1 + mix*c2)
if sorting_value is not None:
# Normalize values between 0-1
values = np.array(list(sorting_value.values()))
normalized = (values - values.min()) / (values.max() - values.min())
# Create color mapping
return {
key: color_fader(start_color, end_color, mix=norm_val)
for key, norm_val in zip(sorting_value.keys(), normalized)
}
else:
# Return middle point color
return color_fader(start_color, end_color, mix=0.5)
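# NOTE: the three-color variant defined next reuses the same name and overrides
# this two-color version once the module finishes loading.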
def create_color_gradient(sorting_value=None, start_color='#FFFF00', middle_color='#00FF00', end_color='#00008B'):
"""Create color gradient using matplotlib interpolation with middle color."""
def color_fader(c1, c2, mix=0):
"""Fade from color c1 to c2 with mix ratio."""
c1 = np.array(mpl.colors.to_rgb(c1))
c2 = np.array(mpl.colors.to_rgb(c2))
return mpl.colors.to_hex((1-mix)*c1 + mix*c2)
if sorting_value is not None:
values = np.array(list(sorting_value.values()))
normalized = (values - values.min()) / (values.max() - values.min())
colors = {}
for key, norm_val in zip(sorting_value.keys(), normalized):
if norm_val <= 0.5:
# Interpolate between start and middle
mix = norm_val * 2 # Scale 0-0.5 to 0-1
colors[key] = color_fader(start_color, middle_color, mix)
else:
# Interpolate between middle and end
mix = (norm_val - 0.5) * 2 # Scale 0.5-1 to 0-1
colors[key] = color_fader(middle_color, end_color, mix)
return colors
else:
return middle_color # Return middle color directly
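# Rough usage sketch for create_color_gradient (illustrative keys/values, not taken
# from the datasets): the smallest value maps to start_color, the midpoint to
# middle_color, and the largest value to end_color.
# >>> create_color_gradient({"low": 0.1, "mid": 0.5, "high": 0.9})
# {'low': '#ffff00', 'mid': '#00ff00', 'high': '#00008b'}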
def evaluate_peak_detection(data, target_peaks, window_size=7, min_distance=5, prominence_threshold=0.1):
"""
Evaluate peak detection accuracy by comparing detected peaks with target peaks.
Parameters:
data: numpy array of shape (batch_size, seq_length, features)
The generated sequences to analyze
The indices where peaks should occur (e.g., every 7 steps for weekly peaks)
target_peak: list
List of indices where peaks should occur
window_size: int
Size of window to consider a peak match
"""
batch_size, seq_length, features = data.shape
    overall_detected = 0  # total number of peaks detected across all sequences
accuracy_metrics = {}
# Create figure for visualization
fig, axes = plt.subplots(4, 2, figsize=(20, 12))
axes = axes.flatten()
# Analyze first 8 batches and first feature (revenue)
overall_matched = 0
overall_targets = 0
for i in range(8):
sequence = data[i, :, 0] # batch i, all timepoints, revenue feature
# Find peaks using scipy
peaks, properties = find_peaks(sequence,
distance=min_distance,
prominence=prominence_threshold)
# Plot original sequence and detected peaks
axes[i].plot(sequence, label='Generated Sequence')
axes[i].plot(peaks, sequence[peaks], "x", label='Detected Peaks')
# Plot target peak positions
target_positions = target_peaks # np.arange(0, seq_length, 7) # Weekly peaks
axes[i].plot(target_positions, sequence[target_positions], "o",
label='Target Peak Positions')
axes[i].set_title(f'Sequence {i+1} Peak Detection Analysis')
axes[i].legend()
axes[i].grid(True)
# Count matches within window for this sequence
matched_peaks = 0
for target in target_positions:
# Check if any detected peak is within the window of the target
matches = np.any((peaks >= target - window_size//2) &
(peaks <= target + window_size//2))
if matches:
matched_peaks += 1
        overall_detected += len(peaks)
        overall_matched += matched_peaks
        overall_targets += len(target_positions)
for i in range(8, batch_size):
peaks, properties = find_peaks(data[i, :, 0], distance=min_distance, prominence=prominence_threshold)
matched_peaks = 0
for target in target_peaks:
matches = np.any((peaks >= target - window_size//2) &
(peaks <= target + window_size//2))
if matches:
matched_peaks += 1
        overall_detected += len(peaks)
        overall_matched += matched_peaks
        overall_targets += len(target_peaks)
    # Calculate overall metrics across all sequences
    accuracy = overall_matched / overall_targets if overall_targets > 0 else 0
    precision = overall_matched / overall_detected if overall_detected > 0 else 0
    accuracy_metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'total_targets': overall_targets,
        'detected_peaks': overall_detected,
        'matched_peaks': overall_matched
    }
plt.tight_layout()
plt.show()
return accuracy_metrics, peaks
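# A minimal, optional sanity check for evaluate_peak_detection on synthetic data.
# It is not called anywhere by default; the sequence length, peak spacing, and
# batch size below are illustrative assumptions, not values from the datasets.
def _peak_detection_sanity_check(seq_length=140, gap=28, batch_size=16):
    t = np.arange(seq_length)
    # Cosine sequences whose maxima fall exactly every `gap` steps.
    base = 0.5 + 0.5 * np.cos(2 * np.pi * t / gap)
    synthetic = np.tile(base[None, :, None], (batch_size, 1, 1))
    metrics, _ = evaluate_peak_detection(
        synthetic,
        target_peaks=np.arange(0, seq_length, gap),
        window_size=max(1, gap // 2),
        min_distance=max(1, gap - 1),
    )
    return metrics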
for config_path in [
"./config/modified/sines.yaml",
"./config/modified/revenue-baseline-365.yaml",
"./config/modified/energy.yaml",
"./config/modified/fmri.yaml",
]:
args = Arguments(config_path)
configs = load_yaml_config(args.config_path)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu)
dl_info = build_dataloader(configs, args)
model = instantiate_from_config(configs["model"]).to(device)
trainer = Trainer(config=configs, args=args, model=model, dataloader=dl_info)
# trainer.load(args.milestone, from_folder="../../../data/ckpt_baseline_240")
# trainer.train()
from data.build_dataloader import build_dataloader_cond
# args.milestone
trainer.load("10")
test_dl_info = build_dataloader_cond(configs, args)
test_dataloader, test_dataset = test_dl_info["dataloader"], test_dl_info["dataset"]
coef = configs["dataloader"]["test_dataset"]["coefficient"]
stepsize = configs["dataloader"]["test_dataset"]["step_size"]
sampling_steps = configs["dataloader"]["test_dataset"]["sampling_steps"]
seq_length, feature_dim = test_dataset.window, test_dataset.var_num
# samples, ori_data, masks = trainer.restore(
# test_dataloader,
# [seq_length, feature_dim],
# coef,
# stepsize,
# sampling_steps,
# control_signal={},
# # test=
# )
# if test_dataset.auto_norm:
# samples = unnormalize_to_zero_to_one(samples)
# ori_data = np.load(os.path.join(dataset.dir, f"sine_ground_truth_{seq_length}_test.npy"))
dataset_name = os.path.basename(args.config_path).split(".")[0].split("-")[0]
mapper = {
"sines": "sines",
"revenue": "revenue",
"energy": "energy",
"fmri": "fMRI",
}
gap = seq_length // 5
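    # `gap` is the spacing (in timesteps) between anchor peaks used by the
    # peak-control experiments further below (currently commented out).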
ori_data = np.load(
os.path.join("../../../data/train/", dataset_name, "samples", f"{mapper[dataset_name]}_norm_truth_{seq_length}_train.npy")
)
masks = np.load(os.path.join("../../../data/train/", dataset_name, "samples", f"{mapper[dataset_name]}_masking_{seq_length}.npy"))
sample_num, seq_len, feat_dim = masks.shape
observed = ori_data[:sample_num] * masks
ori_data = ori_data[:sample_num]
import pickle
from pathlib import Path
# Cache file path
cache_dir = Path(f"../../../data/cache_{dataset_name}")
cache_dir.mkdir(exist_ok=True)
def load_cached_results():
results = {'unconditional': None, 'sum_controlled': {}, 'anchor_controlled': {}}
for cache_file in cache_dir.glob('*.pkl'):
with open(cache_file, 'rb') as f:
key = cache_file.stem
if key == 'unconditional':
results['unconditional'] = pickle.load(f)
elif key.startswith('sum_'):
param = key[4:] # Remove 'sum_' prefix
results['sum_controlled'][param] = pickle.load(f)
elif key.startswith('anchor_'):
param = key[7:] # Remove 'anchor_' prefix
results['anchor_controlled'][param] = pickle.load(f)
return results
def save_result(key, subkey, data):
if subkey:
filename = f"{key}_{subkey}.pkl"
else:
filename = f"{key}.pkl"
with open(cache_dir / filename, 'wb') as f:
pickle.dump(data, f)
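    # Cache layout implied by the key prefixes above: unconditional.pkl,
    # sum_auc_{auc}_weight_{weight}.pkl, and anchor_peak_{peak}_weight_{weight}.pkl,
    # all stored under cache_dir.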
results = load_cached_results()
dataset = dl_info["dataset"]
seq_length, feature_dim = dataset.window, dataset.var_num
coef = configs["dataloader"]["test_dataset"]["coefficient"]
stepsize = configs["dataloader"]["test_dataset"]["step_size"]
# Unconditional sampling
if results['unconditional'] is None:
print("Generating unconditional data...")
results['unconditional'] = trainer.sample(
num=min(1000, len(dataset)), size_every=500, shape=[seq_length, feature_dim]
)
save_result('unconditional', None, results['unconditional'])
    # First sweep: vary the AUC target with auc_weight fixed at 10
auc_weights = [10,]
auc_values = [-200, -150, -100, 0, 20, 30, 50, 100, 150]
for auc in auc_values:
for weight in auc_weights:
key = f"auc_{auc}_weight_{weight}"
if key not in results['sum_controlled']:
print(f"Generating sum controlled data - AUC: {auc}, Weight: {weight}")
results['sum_controlled'][key] = trainer.control_sample(
num=min(1000, len(dataset)), size_every=500, shape=[seq_length, feature_dim],
model_kwargs={
"gradient_control_signal": {"auc": auc, "auc_weight": weight},
"coef": coef,
"learning_rate": stepsize
}
)
save_result('sum', key, results['sum_controlled'][key])
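    # Second sweep: fix the AUC target at -200 and vary auc_weight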
auc_weights = [1, 10, 50, 100]
auc_values = [-200,]
for auc in auc_values:
for weight in auc_weights:
key = f"auc_{auc}_weight_{weight}"
if key not in results['sum_controlled']:
print(f"Generating sum controlled data - AUC: {auc}, Weight: {weight}")
results['sum_controlled'][key] = trainer.control_sample(
num=min(1000, len(dataset)), size_every=500, shape=[seq_length, feature_dim],
model_kwargs={
"gradient_control_signal": {"auc": auc, "auc_weight": weight},
"coef": coef,
"learning_rate": stepsize
}
)
save_result('sum', key, results['sum_controlled'][key])
# Different weekly peaks
peak_values = [0.8, 1.0]
peak_weights = [0.1, 0.5, 1.0]
# import matplotlib.pyplot as plt
# for peak in peak_values:
# for weight in peak_weights:
# key = f"peak_{peak}_weight_{weight}"
# if key not in results['anchor_controlled']:
# mask = np.zeros((seq_length, feature_dim), dtype=np.float32)
# mask[::gap, 0] = weight
# target = np.zeros((seq_length, feature_dim), dtype=np.float32)
# target[::gap, 0] = peak
# print(f"Generating anchor controlled data - Peak: {peak}, Weight: {weight}")
# results['anchor_controlled'][key] = trainer.control_sample(
# num=min(1000, len(dataset)), size_every=500, shape=[seq_length, feature_dim],
# model_kwargs={
# "gradient_control_signal": {"auc": -50, "auc_weight": 10.0},
# "coef": coef,
# "learning_rate": stepsize
# },
# target=target,
# partial_mask=mask
# )
# save_result('anchor', key, results['anchor_controlled'][key])
# # plot mask, target, and generated sequence
# plt.figure(figsize=(12, 6))
# plt.plot(mask[:, 0], label='Mask')
# plt.plot(target[:, 0], label='Target')
# plt.plot(results['anchor_controlled'][key][0, :, 0], label='Generated Sequence')
# plt.title(f"Anchor Controlled Data - Peak: {peak}, Weight: {weight}")
# plt.legend()
# plt.show()
# Unnormalize results if needed
if dataset.auto_norm:
for key, data in results.items():
if isinstance(data, dict):
for subkey, subdata in data.items():
results[key][subkey] = unnormalize_to_zero_to_one(subdata)
else:
results[key] = unnormalize_to_zero_to_one(data)
# Store the results in variables for compatibility with existing code
unconditional_data = results['unconditional']
sum_controled_data = results['sum_controlled']# ['auc_0_weight_10.0'] # default values
anchor_controled_data = results['anchor_controlled'] # ['peak_0.8_weight_0.1'] # default values
# Sum control
samples = 1000
data = {
"ori_data": ori_data[:samples, :, :1],
"Unconditional": unconditional_data[:samples, :, :1],
}
# for key, value in sum_controled_data.items():
# if "weight_10" in key:
# data[key] = value
# print(key)
keys = [
# "auc_-200_weight_10",
"auc_-100_weight_10",
# "auc_0_weight_10",
"auc_20_weight_10",
# "auc_30_weight_10",
"auc_50_weight_10",
# "auc_100_weight_10",
"auc_150_weight_10",
]
for key in keys:
data[key] = sum_controled_data[key][:samples, :, :1]
        # Print the mean per-sample sum (area under the curve) for this control setting
print(key, " ==> ", sum_controled_data[key][:samples, :, :1].sum() / sum_controled_data[key][:samples, :, :1].shape[0])
# visualization_control(
# data=data,
# analysis="kernel",
# compare=ori_data.shape[0],
# output_label="revenue"
# )
def visualization_control_subplots(data, analysis="kernel", compare=100, output_label="", highlight=None):
# from scipy import integrate
# Calculate area under curve for each distribution
def get_auc(data_array):
return data_array.sum(-1).mean()
# Get AUC values
auc_orig = get_auc(data["ori_data"])
auc_uncond = get_auc(data["Unconditional"])
# Setup subplots
keys = [k for k in data.keys() if k not in ["ori_data", "Unconditional"]]
l = len(keys)
n_cols = min(4, len(keys))
n_rows = (len(keys) + n_cols - 1) // n_cols
fig, axes = plt.subplots(n_rows, n_cols, figsize=(6*n_cols, 4*n_rows))
fig.set_dpi(300)
if n_rows == 1:
axes = axes.reshape(1, -1)
def beautiful_text(key):
print(key)
if "auc" in key:
auc = key.split("_")[1]
weight = key.split("_")[3]
if highlight is None:
return f"AUC: $\\mathbf{{{auc}}}$ Weight: {weight}"
else:
return f"AUC: {auc} Weight: $\\mathbf{{{weight}}}$"
if "peak" in key:
peak = key.split("_")[1]
weight = key.split("_")[3]
return f"Peak: {peak} Weight: {weight}"
return key
# Plot distributions
# colors = create_color_gradient({key: get_auc(data[key]) for key in keys}, '#004225','#F02147', '#4B0082')
def get_alpha(idx, n_plots):
"""Generate alpha value between 0.3-0.8 based on plot index"""
return 0.5 + (0.4 * idx / (n_plots - 1)) if n_plots > 1 else 0.8
for idx, key in enumerate(keys):
row, col = idx // n_cols, idx % n_cols
ax = axes[row, col]
# Plot distributions
sns.distplot(data["ori_data"], hist=False, kde=True,
kde_kws={"linewidth": 2, "alpha": 0.9 - get_alpha(idx, l) * 0.5}, color='red',
ax=ax, label=f'Original\n$\overline{{Area}}={auc_orig:.3f}$')
sns.distplot(data["Unconditional"], hist=False, kde=True,
kde_kws={"linewidth": 2, "linestyle":"--", "alpha": 0.9 - get_alpha(idx, l) * 0.5},
color='#15B01A', ax=ax, #FF4500 GREEN:15B01A
label=f'Unconditional\n$\overline{{Area}}= {auc_uncond:.3f}$')
auc_control = get_auc(data[key])
sns.distplot(data[key], hist=False, kde=True,
kde_kws={"linewidth": 2, "alpha": get_alpha(idx, l), "linestyle": "--"}, color="#9A0EEA",
                     ax=ax, label=f'{beautiful_text(key)}\n$\overline{{Area}}= {auc_control:.3f}$')
# ax.set_title(f'{beautiful_text(key)}')
ax.legend()
# Set labels only for first column and last row
if col == 0: ax.set_ylabel('Density')
else: ax.set_ylabel('')
if row == n_rows - 1: ax.set_xlabel('Value')
else: ax.set_xlabel('')
fig.suptitle(f"Kernel Density Estimation of {output_label}", fontsize=16)#, fontweight='bold')
plt.tight_layout()
plt.show()
# save pdf
# plt.savefig(f"./figures/{output_label}_kde.pdf", bbox_inches='tight')
save_pdf(fig, f"./figures/{output_label}_kde.pdf")
plt.close()
ds_name_display = {
"sines": "Synthetic Sine Waves",
"revenue": "Revenue",
"energy": "ETTh",
"fmri": "fMRI",
}
visualization_control_subplots(
data=data,
analysis="kernel",
compare=ori_data.shape[0],
output_label=f"{ds_name_display[dataset_name]} Dataset with Summation Control"
)
# peak control
# data = {
# "ori_data": ori_data[:samples, :, :1],
# "Unconditional": unconditional_data[:samples, :, :1],
# }
# keys = [
# "peak_0.8_weight_0.1",
# "peak_0.8_weight_0.5",
# "peak_0.8_weight_1.0",
# "peak_1.0_weight_0.1",
# "peak_1.0_weight_0.5",
# "peak_1.0_weight_1.0",
# ]
# for key in keys:
# data[key] = anchor_controled_data[key][:samples, :, :1]
# # print peak
# print(key, " ==> ", anchor_controled_data[key][:samples, :, :1].max())
# visualization_control(
# data=data,
# analysis="kernel",
# compare=ori_data.shape[0],
# output_label="revenue"
# )
# # config_mapping = {
# # "sines": {
# # }
# # "revenue": "revenue",
# # "energy": "energy",
# # "fmri": "fMRI",
# # }
# # Evaluate peak detection for different control settings
# peak_accuracies = {}
# for key, data in anchor_controled_data.items():
# print(f"\nEvaluating {key}")
# metrics, peaks = evaluate_peak_detection(
# data,
# target_peaks=range(0, seq_length, gap),
# window_size=max(1, gap//2),
# min_distance=max(1, gap - 1)
# )
# peak_accuracies[key] = metrics
# print(f"Accuracy: {metrics['accuracy']:.3f}")
# print(f"Precision: {metrics['precision']:.3f}")
# print(f"Matched peaks: {metrics['matched_peaks']} / {metrics['total_targets']}")
print("="*50)