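"""Visualization utilities: render generated motions as animated GIF grids.

Covers text-conditioned generation (viz_clip_text), latent interpolation between
two motions (viz_clip_interp) and CLIP-space editing / analogies (viz_clip_edit),
plus helpers that stack per-sample animations into a single video and retrieve
dataset motions by their text labels.
"""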
import os
import imageio
import sys
sys.path.append('.')
import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm
from src.visualize.anim import plot_3d_motion_dico, load_anim
import clip
from PIL import Image
import pickle
import src.utils.rotation_conversions as geometry
from textwrap import wrap
import shutil
import subprocess as sp
from copy import deepcopy
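# Minimum free GPU memory, in MiB as reported by nvidia-smi, required by get_gpu_device().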
GPU_MINIMUM_MEMORY = 5500
def stack_images(real, real_gens, gen):
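    """Stack real, reconstructed and generated animations into one frame grid,
    with the real/reconstructed samples separated from the generated ones by a
    thin black border."""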
nleft_cols = len(real_gens) + 1
print("Stacking frames..")
allframes = np.concatenate((real[:, None, ...], *[x[:, None, ...] for x in real_gens], gen), 1)
nframes, nspa, nats, h, w, pix = allframes.shape
blackborder = np.zeros((w//30, h*nats, pix), dtype=allframes.dtype)
frames = []
for frame_idx in tqdm(range(nframes)):
columns = np.vstack(allframes[frame_idx].transpose(1, 2, 3, 4, 0)).transpose(3, 1, 0, 2)
frame = np.concatenate((*columns[0:nleft_cols], blackborder, *columns[nleft_cols:]), 0).transpose(1, 0, 2)
frames.append(frame)
return np.stack(frames)
def stack_gen_and_images(gen, images):
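    """Stack the conditioning images next to the generated animations, frame by frame."""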
# nleft_cols = len(real_gens) + 1
print("Stacking frames..")
allframes = np.concatenate((images, gen), 2)
nframes, nspa, nats, h, w, pix = allframes.shape
blackborder = np.zeros((w//30, h*nats, pix), dtype=allframes.dtype)
frames = []
for frame_idx in tqdm(range(nframes)):
columns = np.vstack(allframes[frame_idx].transpose(1, 2, 3, 4, 0)).transpose(3, 1, 0, 2)
frame = np.concatenate((columns[:]), 0).transpose(1, 0, 2)
frames.append(frame)
return np.stack(frames)
def stack_gen_only(gen):
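    """Stack only the generated animations into a frame grid (no real samples)."""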
# nleft_cols = len(real_gens) + 1
print("Stacking frames..")
# allframes = np.concatenate((real[:, None, ...], *[x[:, None, ...] for x in real_gens], gen), 1)
allframes = gen
nframes, nspa, nats, h, w, pix = allframes.shape
blackborder = np.zeros((w//30, h*nats, pix), dtype=allframes.dtype)
frames = []
for frame_idx in tqdm(range(nframes)):
columns = np.vstack(allframes[frame_idx].transpose(1, 2, 3, 4, 0)).transpose(3, 1, 0, 2)
frame = np.concatenate((columns[:]), 0).transpose(1, 0, 2)
frames.append(frame)
return np.stack(frames)
def generate_by_video(visualization, reconstructions, generation,
label_to_action_name, params, nats, nspa, tmp_path, image_pathes=None, mode=None):
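    """Render the visualization / reconstruction / generation outputs to per-sample
    GIFs in parallel worker processes, then assemble the loaded animations into a
    single (nframes, H, W, 3) array ready for imageio.mimsave.

    If image_pathes is given (one image per generated sample), the conditioning
    images are stacked next to the generated animations instead of real samples.
    """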
# shape : (17, 3, 4, 480, 640, 3)
# (nframes, row, column, h, w, 3)
fps = params["fps"]
params = params.copy()
if "output_xyz" in visualization or "output_xyz" in generation:
outputkey = "output_xyz"
params["pose_rep"] = "xyz"
else:
outputkey = "poses"
keep = [outputkey, "lengths", "y"]
def _to_np(x):
if type(x).__module__ == np.__name__:
return x
else: # assume tensor
return x.data.cpu().numpy()
visu = {key: _to_np(visualization[key]) for key in keep if key in visualization.keys()}
recons = {mode: {key: _to_np(reconstruction[key]) for key in keep if key in reconstruction.keys()}
for mode, reconstruction in reconstructions.items()}
gener = {key: _to_np(generation[key]) for key in keep if key in generation.keys()}
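    # Per-sample palette: the three inputs of an edit and the two endpoints of an
    # interpolation are drawn in orange, everything else in blue.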
def get_palette(i, nspa):
if mode == 'edit' and i < 3:
return 'orange'
elif mode == 'interp' and i in [0, nspa-1]:
return 'orange'
return 'blue'
    if len(visu) > 0:
lenmax = max(gener["lengths"].max(),
visu["lengths"].max())
else:
lenmax = gener["lengths"].max()
timesize = lenmax + 5
# if params['appearance_mode'] == 'motionclip':
# timesize = lenmax + 20
import multiprocessing
def pool_job_with_desc(pool, iterator, desc, max_, save_path_format, isij):
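        """Render every item of `iterator` with plot_3d_motion_dico in the worker pool,
        then load the resulting GIFs back into an array (indexed by (i, j) when isij)."""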
with tqdm(total=max_, desc=desc.format("Render")) as pbar:
for _ in pool.imap_unordered(plot_3d_motion_dico, iterator):
pbar.update()
if isij:
array = np.stack([[load_anim(save_path_format.format(i, j), timesize)
for j in range(nats)]
for i in tqdm(range(nspa), desc=desc.format("Load"))])
return array.transpose(2, 0, 1, 3, 4, 5)
else:
array = np.stack([load_anim(save_path_format.format(i), timesize)
for i in tqdm(range(nats), desc=desc.format("Load"))])
return array.transpose(1, 0, 2, 3, 4)
with multiprocessing.Pool() as pool:
# Generated samples
save_path_format = os.path.join(tmp_path, "gen_{}_{}.gif")
iterator = ((gener[outputkey][i, j],
gener["lengths"][i, j],
save_path_format.format(i, j),
# params, {"title": f"gen: {label_to_action_name(gener['y'][i, j])}", "interval": 1000/fps})
params, {"title": f"{label_to_action_name(gener['y'][i, j])}", "interval": 1000/fps, "palette": get_palette(i, nspa)})
for j in range(nats) for i in range(nspa))
gener["frames"] = pool_job_with_desc(pool, iterator,
"{} the generated samples",
nats*nspa,
save_path_format,
True)
# Make frames with no title blank
frames_no_title = gener['y'] == ''
gener["frames"][:, frames_no_title] = gener["frames"][:, 0, 0:1, 0:1, 0:1] # cast the corner pixel value for all blank box
# Real samples
if len(visu) > 0:
save_path_format = os.path.join(tmp_path, "real_{}.gif")
iterator = ((visu[outputkey][i],
visu["lengths"][i],
save_path_format.format(i),
params, {"title": f"real: {label_to_action_name(visu['y'][i])}", "interval": 1000/fps})
for i in range(nats))
visu["frames"] = pool_job_with_desc(pool, iterator,
"{} the real samples",
nats,
save_path_format,
False)
for mode, recon in recons.items():
# Reconstructed samples
save_path_format = os.path.join(tmp_path, f"reconstructed_{mode}_" + "{}.gif")
iterator = ((recon[outputkey][i],
recon["lengths"][i],
save_path_format.format(i),
params, {"title": f"recons: {label_to_action_name(recon['y'][i])}",
"interval": 1000/fps})
for i in range(nats))
recon["frames"] = pool_job_with_desc(pool, iterator,
"{} the reconstructed samples",
nats,
save_path_format,
False)
if image_pathes is not None:
# visu["frames"] -> [timesize(65), nspa(n_samples), nats(1), h(290), w(260), n_ch(3)]
assert nats == 1
assert nspa == len(image_pathes)
h, w = gener["frames"].shape[3:5]
image_frames = []
for im_path in image_pathes:
im = Image.open(im_path).resize((w, h))
image_frames.append(np.tile(np.expand_dims(np.asarray(im)[..., :3], axis=(0, 1, 2)), (timesize, 1, 1, 1, 1, 1)))
image_frames = np.concatenate(image_frames, axis=1)
assert image_frames.shape == gener["frames"].shape
return stack_gen_and_images(gener["frames"], image_frames)
if len(visu) == 0:
frames = stack_gen_only(gener["frames"])
else:
frames = stack_images(visu["frames"], [recon["frames"] for recon in recons.values()], gener["frames"])
return frames
def generate_by_video_sequences(visualization, label_to_action_name, params, nats, nspa, tmp_path):
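    """Render an nspa x nats grid of real samples to GIFs and stack them into video frames."""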
# shape : (17, 3, 4, 480, 640, 3)
# (nframes, row, column, h, w, 3)
fps = params["fps"]
if "output_xyz" in visualization:
outputkey = "output_xyz"
params["pose_rep"] = "xyz"
else:
outputkey = "poses"
keep = [outputkey, "lengths", "y"]
visu = {key: visualization[key].data.cpu().numpy() for key in keep}
lenmax = visu["lengths"].max()
timesize = lenmax + 5
import multiprocessing
def pool_job_with_desc(pool, iterator, desc, max_, save_path_format):
with tqdm(total=max_, desc=desc.format("Render")) as pbar:
for _ in pool.imap_unordered(plot_3d_motion_dico, iterator):
pbar.update()
array = np.stack([[load_anim(save_path_format.format(i, j), timesize)
for j in range(nats)]
for i in tqdm(range(nspa), desc=desc.format("Load"))])
return array.transpose(2, 0, 1, 3, 4, 5)
with multiprocessing.Pool() as pool:
# Real samples
save_path_format = os.path.join(tmp_path, "real_{}_{}.gif")
iterator = ((visu[outputkey][i, j],
visu["lengths"][i, j],
save_path_format.format(i, j),
params, {"title": f"real: {label_to_action_name(visu['y'][i, j])}", "interval": 1000/fps})
for j in range(nats) for i in range(nspa))
visu["frames"] = pool_job_with_desc(pool, iterator,
"{} the real samples",
nats,
save_path_format)
frames = stack_images_sequence(visu["frames"])
return frames
def viz_clip_text(model, text_grid, epoch, params, folder):
""" Generate & viz samples """
# visualize with joints3D
model.outputxyz = True
print(f"Visualization of the epoch {epoch}")
# noise_same_action = params["noise_same_action"]
# noise_diff_action = params["noise_diff_action"]
fact = params["fact_latent"]
figname = params["figname"].format(epoch)
classes = np.array(text_grid, dtype=str)
h, w = classes.shape
texts = classes.reshape([-1])
clip_tokens = clip.tokenize(texts).to(params['device'])
clip_features = model.clip_model.encode_text(clip_tokens).float().unsqueeze(0)
gendurations = torch.ones((h*w, 1), dtype=int) * params['num_frames']
# generate the repr (joints3D/pose etc)
model.eval()
with torch.no_grad():
generation = model.generate(clip_features, gendurations,
is_amass=True,
is_clip_features=True)
generation['y'] = texts
for key, val in generation.items():
if len(generation[key].shape) == 1:
generation[key] = val.reshape(h, w)
else:
generation[key] = val.reshape(h, w, *val.shape[1:])
f_name = params['input_file']
if os.path.isfile(params['input_file']):
f_name = os.path.basename(params['input_file'].replace('.txt', ''))
finalpath = os.path.join(folder, 'clip_text_{}_{}'.format(f_name, 'trans_' if params['vertstrans'] else '') + figname + ".gif")
tmp_path = os.path.join(folder, f"clip_text_subfigures_{figname}")
os.makedirs(tmp_path, exist_ok=True)
# save_pkl(generation['output'], generation['output_xyz'], texts, finalpath.replace('.gif', '.pkl'))
print("Generate the videos..")
frames = generate_by_video({}, {}, generation,
lambda x: str(x), params, w, h, tmp_path, mode='text')
print(f"Writing video [{finalpath}]")
imageio.mimsave(finalpath, frames, fps=params["fps"])
def viz_clip_interp(model, datasets, interp_csv, num_stops, epoch, params, folder):
""" Generate & viz samples """
# visualize with joints3D
model.outputxyz = True
print(f"Visualization of the epoch {epoch}")
figname = params["figname"].format(epoch)
motion_collection = get_motion_text_mapping(datasets)
# prepare motion representations
all_clip_features = []
all_texts = []
for line in interp_csv:
# Get CLIP features
texts = [line['start'], line['end']]
retrieved_motions = retrieve_motions(datasets, motion_collection, texts, params['device'])
clip_features = encode_motions(model, retrieved_motions, params['device'])
# Make interp
end_factor = np.linspace(0., 1., num=num_stops)
start_factor = 1. - end_factor
interp_features = [(start_factor[i]*clip_features[0]) + (end_factor[i]*clip_features[1]) for i in range(num_stops)]
all_clip_features.append(torch.stack(interp_features))
texts = texts[:1] + [' '] * (num_stops-2) + texts[-1:]
all_texts.append(texts)
all_clip_features = torch.transpose(torch.stack(all_clip_features, axis=0), 0, 1)
all_texts = np.array(all_texts).T
h, w = all_clip_features.shape[:2]
gendurations = torch.ones((h*w, 1), dtype=int) * params['num_frames']
# generate the repr (joints3D/pose etc)
model.eval()
with torch.no_grad():
generation = model.generate(all_clip_features, gendurations,
is_amass=True,
is_clip_features=True)
generation['y'] = all_texts.reshape([-1])
for key, val in generation.items():
if len(generation[key].shape) == 1:
generation[key] = val.reshape(h, w)
else:
generation[key] = val.reshape(h, w, *val.shape[1:])
    f_name = params['input_file']
    if os.path.isfile(params['input_file']):
        f_name = os.path.basename(params['input_file'].replace('.csv', ''))
    finalpath = os.path.join(folder, f'clip_interp_{f_name}_' + figname + ".gif")
    tmp_path = os.path.join(folder, f"clip_interp_subfigures_{figname}")
os.makedirs(tmp_path, exist_ok=True)
print("Generate the videos..")
frames = generate_by_video({}, {}, generation,
lambda x: str(x), params, w, h, tmp_path, mode='interp')
print(f"Writing video [{finalpath}]")
imageio.mimsave(finalpath, frames, fps=params["fps"])
def viz_clip_edit(model, datasets, edit_csv, epoch, params, folder):
""" Generate & viz samples """
# visualize with joints3D
model.outputxyz = True
print(f"Visualization of the epoch {epoch}")
figname = params["figname"].format(epoch)
motion_collection = get_motion_text_mapping(datasets)
# prepare motion representations
all_clip_features = []
all_texts = []
for line in edit_csv:
# Get CLIP features
texts = [line['base'], line['v_start'], line['v_end']]
if line['motion_source'] == 'data':
retrieved_motions = retrieve_motions(datasets, motion_collection, texts, params['device'])
clip_features = encode_motions(model, retrieved_motions, params['device'])
elif line['motion_source'] == 'text':
clip_tokens = clip.tokenize(texts).to(params['device'])
clip_features = model.clip_model.encode_text(clip_tokens).float()
else:
raise ValueError
# Make edit
result_features = clip_features[0] - clip_features[1] + clip_features[2]
all_clip_features.append(torch.cat([clip_features, result_features.unsqueeze(0)]))
texts.append('Result')
all_texts.append(texts)
all_clip_features = torch.transpose(torch.stack(all_clip_features, axis=0), 0, 1)
all_texts = np.array(all_texts).T
h, w = all_clip_features.shape[:2]
gendurations = torch.ones((h*w, 1), dtype=int) * params['num_frames']
# generate the repr (joints3D/pose etc)
model.eval()
with torch.no_grad():
generation = model.generate(all_clip_features, gendurations,
is_amass=True,
is_clip_features=True)
generation['y'] = all_texts.reshape([-1])
for key, val in generation.items():
if len(generation[key].shape) == 1:
generation[key] = val.reshape(h, w)
else:
generation[key] = val.reshape(h, w, *val.shape[1:])
    f_name = params['input_file']
    if os.path.isfile(params['input_file']):
        f_name = os.path.basename(params['input_file'].replace('.csv', ''))
finalpath = os.path.join(folder, f'clip_edit_{f_name}_' + figname + ".gif")
tmp_path = os.path.join(folder, f"clip_edit_subfigures_{figname}")
os.makedirs(tmp_path, exist_ok=True)
print("Generate the videos..")
frames = generate_by_video({}, {}, generation,
lambda x: str(x), params, w, h, tmp_path, mode='edit')
print(f"Writing video [{finalpath}]")
imageio.mimsave(finalpath, frames, fps=params["fps"])
def stack_images_sequence(visu):
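    """Stack a (nframes, nspa, nats, h, w, 3) array of animations into single video frames."""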
print("Stacking frames..")
allframes = visu
nframes, nspa, nats, h, w, pix = allframes.shape
frames = []
for frame_idx in tqdm(range(nframes)):
columns = np.vstack(allframes[frame_idx].transpose(1, 2, 3, 4, 0)).transpose(3, 1, 0, 2)
frame = np.concatenate(columns).transpose(1, 0, 2)
frames.append(frame)
return np.stack(frames)
def get_gpu_device():
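    """Return the index of the first GPU with more than GPU_MINIMUM_MEMORY MiB free
    (queried via nvidia-smi); raise if no GPU qualifies."""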
command = "nvidia-smi --query-gpu=memory.free --format=csv"
memory_free_info = sp.check_output(command.split()).decode('ascii').split('\n')[:-1][1:]
    memory_free_values = [int(x.split()[0]) for x in memory_free_info]
for gpu_idx, free_mem in enumerate(memory_free_values):
if free_mem > GPU_MINIMUM_MEMORY:
return gpu_idx
    raise Exception('No GPU with required memory')
def get_motion_text_mapping(datasets):
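    """Build (or load from a cached .npy file) a mapping from each 'clip_text' label
    to the list of (split_name, index) locations of matching motions in the datasets."""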
print('Building text-motion mapping...')
split_names = list(datasets.keys())
collection_path = datasets[split_names[0]].datapath.replace('.pt', '_text_labels.txt')
if len(split_names) > 1:
assert split_names[0] in os.path.basename(collection_path)
_base = os.path.basename(collection_path).replace(split_names[0], 'all')
collection_path = os.path.join(os.path.dirname(collection_path), _base)
cache_path = collection_path.replace('.txt', '.npy')
# load if exists
    word = 'Loading' if os.path.isfile(cache_path) else 'Saving'
    print('{} the list of text labels for the current dataset at [{}].'.format(word, collection_path))
    print('Consult it next time you want to retrieve motions by their textual labels.')
if os.path.isfile(cache_path):
return np.load(cache_path, allow_pickle=True)[None][0]
motion_collection = {}
for split_name, data in datasets.items():
for i, d in tqdm(enumerate(data)):
motion_collection[d['clip_text']] = motion_collection.get(d['clip_text'], []) + [(split_name, i)]
with open(collection_path, 'w') as fw:
text_labels = sorted(list(motion_collection.keys()))
fw.write('\n'.join(text_labels) + '\n')
np.save(cache_path, motion_collection)
return motion_collection
def retrieve_motions(datasets, motion_collection, texts, device):
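    """For every text label, fetch the first motion mapped to it and stack them on `device`."""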
retrieved_motions = []
for txt in texts:
_split, _index = motion_collection[txt][0]
retrieved_motions.append(datasets[_split][_index]['inp'].unsqueeze(0).to(device))
return torch.cat(retrieved_motions, axis=0)
def encode_motions(model, motions, device):
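    """Encode motions with the model's motion encoder and return the latent means ("mu").

    Note: the attention mask is built with a fixed length of 60 frames for every sequence.
    """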
return model.encoder({'x': motions,
'y': torch.zeros(motions.shape[0], dtype=int, device=device),
'mask': model.lengths_to_mask(torch.ones(motions.shape[0], dtype=int, device=device) * 60)})["mu"]