smog

File size: 20,266 Bytes

fe64bad

import os
import imageio
import sys
sys.path.append('.')

import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm
from src.visualize.anim import plot_3d_motion_dico, load_anim
import clip
from PIL import Image
import pickle
import src.utils.rotation_conversions as geometry
from textwrap import wrap
import shutil
import subprocess as sp
from copy import deepcopy

GPU_MINIMUM_MEMORY = 5500

def stack_images(real, real_gens, gen):
    """Tile real, reconstructed and generated clips into one frame sequence.

    ``real`` and every entry of ``real_gens`` get a new axis inserted at
    position 1 and are concatenated with ``gen`` along that axis. For each
    frame the resulting grid is flattened into a single image, with a black
    separator placed after the real/reconstructed columns.

    Returns the stacked frames as one array (first axis is the frame index).
    """
    left_count = 1 + len(real_gens)
    print("Stacking frames..")

    panels = [real[:, None, ...]]
    panels.extend(recon[:, None, ...] for recon in real_gens)
    panels.append(gen)
    grid = np.concatenate(panels, 1)

    nframes, nspa, nats, h, w, pix = grid.shape
    # Black strip separating the left (real/reconstructed) columns from the generated ones.
    separator = np.zeros((w//30, h*nats, pix), dtype=grid.dtype)

    stacked = []
    for idx in tqdm(range(nframes)):
        # (nspa, nats, h, w, pix) -> (nats, h, w, pix, nspa) -> (nats*h, w, pix, nspa)
        # -> (nspa, w, nats*h, pix): one transposed image strip per entry of axis 1.
        strips = np.vstack(grid[idx].transpose(1, 2, 3, 4, 0)).transpose(3, 1, 0, 2)
        pieces = [*strips[:left_count], separator, *strips[left_count:]]
        stacked.append(np.concatenate(pieces, 0).transpose(1, 0, 2))
    return np.stack(stacked)

def stack_gen_and_images(gen, images):
    """Tile reference images and generated clips into one frame sequence.

    ``images`` and ``gen`` are concatenated along axis 2 (images first) and,
    for every frame, the resulting 5-D grid is flattened into a single image.

    Args:
        gen: 6-D array of generated frames; the first axis is the frame index.
        images: array that can be concatenated with ``gen`` along axis 2.

    Returns:
        Array of stacked frames (first axis is the frame index).
    """
    print("Stacking frames..")
    allframes = np.concatenate((images, gen), 2)
    # Unpacking doubles as a check that the input really is 6-D.
    nframes, _nspa, _nats, _h, _w, _pix = allframes.shape
    frames = []
    for frame_idx in tqdm(range(nframes)):
        # (nspa, nats, h, w, pix) -> (nspa, w, nats*h, pix): one transposed strip per axis-0 entry.
        columns = np.vstack(allframes[frame_idx].transpose(1, 2, 3, 4, 0)).transpose(3, 1, 0, 2)
        # Join the strips side by side, then flip back to (height, width, channels).
        frames.append(np.concatenate(columns, 0).transpose(1, 0, 2))
    return np.stack(frames)

def stack_gen_only(gen):
    """Tile generated clips only into one frame sequence.

    This is the same operation ``stack_images_sequence`` performs on its
    input (same progress output, same per-frame flattening), so the work is
    delegated instead of being duplicated here.

    Args:
        gen: 6-D array of generated frames; the first axis is the frame index.

    Returns:
        Array of stacked frames (first axis is the frame index).
    """
    return stack_images_sequence(gen)


def generate_by_video(visualization, reconstructions, generation,
                      label_to_action_name, params, nats, nspa, tmp_path, image_pathes=None, mode=None):
    """Render motions to GIFs in a worker pool and tile them into one video.

    Every sample is rendered by ``plot_3d_motion_dico`` into ``tmp_path``,
    read back with ``load_anim`` and finally stacked frame by frame.

    Args:
        visualization: dict with the real samples (may be empty). Only the
            output key ("output_xyz" or "poses"), "lengths" and "y" are read.
        reconstructions: dict mapping a name to a dict shaped like
            ``visualization``; one set of reconstructed clips is rendered per
            entry.
        generation: dict with the generated samples, indexed as ``[i, j]``
            with ``i < nspa`` and ``j < nats``.
        label_to_action_name: callable turning a ``y`` entry into the title.
        params: rendering parameters; ``params["fps"]`` is read and a copy is
            passed to the renderer (the caller's object is left untouched).
        nats: size of the second index of the generated grid; also the number
            of real / reconstructed samples rendered.
        nspa: size of the first index of the generated grid.
        tmp_path: directory receiving the intermediate GIF files.
        image_pathes: optional list of image files. When given, ``nats`` must
            be 1 and there must be one image per ``nspa`` entry; the images
            are shown next to the generated clips.
        mode: 'edit' or 'interp' select which generated clips are drawn with
            the 'orange' palette; any other value uses 'blue' everywhere.

    Returns:
        Array of stacked frames produced by one of the ``stack_*`` helpers.
    """
    # shape : (17, 3, 4, 480, 640, 3)
    # (nframes, row, column, h, w, 3)
    fps = params["fps"]

    # Shallow copy so that setting "pose_rep" below does not leak to the caller.
    params = params.copy()

    if "output_xyz" in visualization or "output_xyz" in generation:
        outputkey = "output_xyz"
        params["pose_rep"] = "xyz"
    else:
        outputkey = "poses"

    keep = [outputkey, "lengths", "y"]

    def _to_np(x):
        # Anything coming from the numpy module is passed through unchanged.
        if type(x).__module__ == np.__name__:
            return x
        # assume tensor
        return x.data.cpu().numpy()

    visu = {key: _to_np(visualization[key]) for key in keep if key in visualization}
    recons = {name: {key: _to_np(reconstruction[key]) for key in keep if key in reconstruction}
              for name, reconstruction in reconstructions.items()}
    gener = {key: _to_np(generation[key]) for key in keep if key in generation}

    def get_palette(i, nspa):
        if mode == 'edit' and i < 3:
            return 'orange'
        elif mode == 'interp' and i in [0, nspa-1]:
            return 'orange'
        return 'blue'

    lenmax = gener["lengths"].max()
    if visu:
        lenmax = max(lenmax, visu["lengths"].max())
    timesize = lenmax + 5

    import multiprocessing

    def pool_job_with_desc(pool, iterator, desc, max_, save_path_format, isij):
        # Render every job, then load the written GIFs back in a fixed order.
        with tqdm(total=max_, desc=desc.format("Render")) as pbar:
            for _ in pool.imap_unordered(plot_3d_motion_dico, iterator):
                pbar.update()
        if isij:
            array = np.stack([[load_anim(save_path_format.format(i, j), timesize)
                               for j in range(nats)]
                              for i in tqdm(range(nspa), desc=desc.format("Load"))])
            # Move the time axis of the loaded clips to the front.
            return array.transpose(2, 0, 1, 3, 4, 5)
        else:
            array = np.stack([load_anim(save_path_format.format(i), timesize)
                              for i in tqdm(range(nats), desc=desc.format("Load"))])
            return array.transpose(1, 0, 2, 3, 4)

    with multiprocessing.Pool() as pool:
        # Generated samples
        save_path_format = os.path.join(tmp_path, "gen_{}_{}.gif")
        iterator = ((gener[outputkey][i, j],
                     gener["lengths"][i, j],
                     save_path_format.format(i, j),
                     params, {"title": f"{label_to_action_name(gener['y'][i, j])}", "interval": 1000/fps, "palette": get_palette(i, nspa)})
                    for j in range(nats) for i in range(nspa))
        gener["frames"] = pool_job_with_desc(pool, iterator,
                                             "{} the generated samples",
                                             nats*nspa,
                                             save_path_format,
                                             True)

        # Make frames with no title blank
        frames_no_title = gener['y'] == ''
        gener["frames"][:, frames_no_title] = gener["frames"][:, 0, 0:1, 0:1, 0:1] # cast the corner pixel value for all blank box

        # Real samples
        if visu:
            save_path_format = os.path.join(tmp_path, "real_{}.gif")
            iterator = ((visu[outputkey][i],
                         visu["lengths"][i],
                         save_path_format.format(i),
                         params, {"title": f"real: {label_to_action_name(visu['y'][i])}", "interval": 1000/fps})
                        for i in range(nats))
            visu["frames"] = pool_job_with_desc(pool, iterator,
                                                "{} the real samples",
                                                nats,
                                                save_path_format,
                                                False)

        # Reconstructed samples. The loop variable must not be called `mode`:
        # that would rebind the parameter captured by get_palette above.
        for recon_name, recon in recons.items():
            save_path_format = os.path.join(tmp_path, f"reconstructed_{recon_name}_" + "{}.gif")
            iterator = ((recon[outputkey][i],
                         recon["lengths"][i],
                         save_path_format.format(i),
                         params, {"title": f"recons: {label_to_action_name(recon['y'][i])}",
                                  "interval": 1000/fps})
                        for i in range(nats))
            recon["frames"] = pool_job_with_desc(pool, iterator,
                                                 "{} the reconstructed samples",
                                                 nats,
                                                 save_path_format,
                                                 False)

    if image_pathes is not None:
        # gener["frames"] -> [timesize, nspa(n_samples), nats(1), h, w, n_ch(3)]
        assert nats == 1
        assert nspa == len(image_pathes)
        h, w = gener["frames"].shape[3:5]
        image_frames = []
        for im_path in image_pathes:
            with Image.open(im_path) as im:
                # Resize first, then force three channels: same pixels as the
                # former `[..., :3]` slice for RGB/RGBA files, and it also
                # works for grayscale or palette images.
                rgb = np.asarray(im.resize((w, h)).convert("RGB"))
            # Repeat the still image for every time step of the video.
            image_frames.append(np.tile(np.expand_dims(rgb, axis=(0, 1, 2)), (timesize, 1, 1, 1, 1, 1)))
        image_frames = np.concatenate(image_frames, axis=1)
        assert image_frames.shape == gener["frames"].shape
        return stack_gen_and_images(gener["frames"], image_frames)

    if not visu:
        return stack_gen_only(gener["frames"])
    return stack_images(visu["frames"], [recon["frames"] for recon in recons.values()], gener["frames"])


def generate_by_video_sequences(visualization, label_to_action_name, params, nats, nspa, tmp_path):
    """Render a grid of real motion sequences to GIFs and tile them into one video.

    Args:
        visualization: dict of tensors; the output key ("output_xyz" or
            "poses"), "lengths" and "y" are read and indexed as ``[i, j]``
            with ``i < nspa`` and ``j < nats``.
        label_to_action_name: callable turning a ``y`` entry into the title.
        params: rendering parameters; ``params["fps"]`` is read and a copy is
            handed to the renderer.
        nats: size of the second index of the grid.
        nspa: size of the first index of the grid.
        tmp_path: directory receiving the intermediate GIF files.

    Returns:
        Array of stacked frames from ``stack_images_sequence``.
    """
    # shape : (17, 3, 4, 480, 640, 3)
    # (nframes, row, column, h, w, 3)
    fps = params["fps"]

    # Copy before editing, as generate_by_video does, so the caller's params
    # do not silently gain a "pose_rep" entry.
    params = params.copy()

    if "output_xyz" in visualization:
        outputkey = "output_xyz"
        params["pose_rep"] = "xyz"
    else:
        outputkey = "poses"

    keep = [outputkey, "lengths", "y"]
    visu = {key: visualization[key].data.cpu().numpy() for key in keep}
    lenmax = visu["lengths"].max()

    timesize = lenmax + 5
    import multiprocessing

    def pool_job_with_desc(pool, iterator, desc, max_, save_path_format):
        # Render every job, then load the written GIFs back in a fixed order.
        with tqdm(total=max_, desc=desc.format("Render")) as pbar:
            for _ in pool.imap_unordered(plot_3d_motion_dico, iterator):
                pbar.update()
        array = np.stack([[load_anim(save_path_format.format(i, j), timesize)
                           for j in range(nats)]
                          for i in tqdm(range(nspa), desc=desc.format("Load"))])
        # Move the time axis of the loaded clips to the front.
        return array.transpose(2, 0, 1, 3, 4, 5)

    with multiprocessing.Pool() as pool:
        # Real samples
        save_path_format = os.path.join(tmp_path, "real_{}_{}.gif")
        iterator = ((visu[outputkey][i, j],
                     visu["lengths"][i, j],
                     save_path_format.format(i, j),
                     params, {"title": f"real: {label_to_action_name(visu['y'][i, j])}", "interval": 1000/fps})
                    for j in range(nats) for i in range(nspa))
        # nats*nspa jobs are rendered, so that is the progress-bar total.
        visu["frames"] = pool_job_with_desc(pool, iterator,
                                            "{} the real samples",
                                            nats*nspa,
                                            save_path_format)
    return stack_images_sequence(visu["frames"])


def viz_clip_text(model, text_grid, epoch, params, folder):
    """Generate motions from a 2-D grid of text prompts and save them as a GIF.

    Args:
        model: model exposing ``clip_model.encode_text``, ``generate`` and
            ``eval``; its ``outputxyz`` attribute is set to True.
        text_grid: 2-D nested sequence of prompts; its layout is the layout
            of the rendered grid.
        epoch: value formatted into ``params["figname"]`` and the log line.
        params: dict; reads "figname", "device", "num_frames", "input_file",
            "vertstrans" and "fps".
        folder: output directory for the GIF and the per-sample sub-figures.
    """
    # visualize with joints3D
    model.outputxyz = True
    # Put the model in eval mode before any forward pass of this function.
    model.eval()

    print(f"Visualization of the epoch {epoch}")

    figname = params["figname"].format(epoch)

    classes = np.array(text_grid, dtype=str)
    h, w = classes.shape
    texts = classes.reshape([-1])

    gendurations = torch.ones((h*w, 1), dtype=int) * params['num_frames']

    # Text encoding and generation are inference only: no autograd graph needed.
    with torch.no_grad():
        clip_tokens = clip.tokenize(texts).to(params['device'])
        clip_features = model.clip_model.encode_text(clip_tokens).float().unsqueeze(0)

        # generate the repr (joints3D/pose etc)
        generation = model.generate(clip_features, gendurations,
                                    is_amass=True,
                                    is_clip_features=True)
    generation['y'] = texts

    # Fold the flat batch axis back into the (h, w) grid for every output.
    for key, val in generation.items():
        if len(val.shape) == 1:
            generation[key] = val.reshape(h, w)
        else:
            generation[key] = val.reshape(h, w, *val.shape[1:])

    # The input is either a prompt file (use its base name) or used verbatim.
    f_name = params['input_file']
    if os.path.isfile(params['input_file']):
        f_name = os.path.basename(params['input_file'].replace('.txt', ''))
    finalpath = os.path.join(folder, 'clip_text_{}_{}'.format(f_name, 'trans_' if params['vertstrans'] else '') + figname + ".gif")
    tmp_path = os.path.join(folder, f"clip_text_subfigures_{figname}")
    os.makedirs(tmp_path, exist_ok=True)

    print("Generate the videos..")
    frames = generate_by_video({}, {}, generation,
                               lambda x: str(x), params, w, h, tmp_path, mode='text')

    print(f"Writing video [{finalpath}]")
    imageio.mimsave(finalpath, frames, fps=params["fps"])


def viz_clip_interp(model, datasets, interp_csv, num_stops, epoch, params, folder):
    """Interpolate between pairs of motions in latent space and save a GIF.

    For every row of ``interp_csv`` the motions labelled ``row['start']`` and
    ``row['end']`` are retrieved and encoded, ``num_stops`` linearly spaced
    blends of the two encodings are generated, and all results are rendered
    into one grid.

    Args:
        model: model exposing ``encoder``, ``lengths_to_mask``, ``generate``
            and ``eval``; its ``outputxyz`` attribute is set to True.
        datasets: dict of dataset splits used to look up motions by text.
        interp_csv: iterable of rows with 'start' and 'end' entries.
        num_stops: number of interpolation steps including both end points;
            must be at least 2.
        epoch: value formatted into ``params["figname"]`` and the log line.
        params: dict; reads "figname", "device", "num_frames", "input_file"
            and "fps".
        folder: output directory for the GIF and the per-sample sub-figures.

    Raises:
        ValueError: if ``num_stops`` is smaller than 2.
    """
    if num_stops < 2:
        # Fewer stops than end points would desynchronise titles and features.
        raise ValueError(f"num_stops must be at least 2 (start and end), got {num_stops}")

    # visualize with joints3D
    model.outputxyz = True
    # Eval mode must be active before the motions are encoded below, not only
    # before generation.
    model.eval()

    print(f"Visualization of the epoch {epoch}")
    figname = params["figname"].format(epoch)
    motion_collection = get_motion_text_mapping(datasets)

    # Blend weights are the same for every row.
    end_factor = np.linspace(0., 1., num=num_stops)
    start_factor = 1. - end_factor

    # prepare motion representations
    all_clip_features = []
    all_texts = []
    with torch.no_grad():
        for line in interp_csv:
            # Get CLIP features
            texts = [line['start'], line['end']]
            retrieved_motions = retrieve_motions(datasets, motion_collection, texts, params['device'])
            clip_features = encode_motions(model, retrieved_motions, params['device'])

            # Make interp
            interp_features = [(a*clip_features[0]) + (b*clip_features[1])
                               for a, b in zip(start_factor, end_factor)]
            all_clip_features.append(torch.stack(interp_features))
            # Only the two end points carry a title; intermediate stops get ' '.
            all_texts.append(texts[:1] + [' '] * (num_stops-2) + texts[-1:])

        all_clip_features = torch.transpose(torch.stack(all_clip_features, dim=0), 0, 1)
        all_texts = np.array(all_texts).T
        h, w = all_clip_features.shape[:2]
        gendurations = torch.ones((h*w, 1), dtype=int) * params['num_frames']

        # generate the repr (joints3D/pose etc)
        generation = model.generate(all_clip_features, gendurations,
                                    is_amass=True,
                                    is_clip_features=True)
    generation['y'] = all_texts.reshape([-1])

    # Fold the flat batch axis back into the (h, w) grid for every output.
    for key, val in generation.items():
        if len(val.shape) == 1:
            generation[key] = val.reshape(h, w)
        else:
            generation[key] = val.reshape(h, w, *val.shape[1:])

    # Pure string operation: no need for the file to exist, and f_name is
    # always bound.
    f_name = os.path.basename(params['input_file'].replace('.csv', ''))
    # NOTE(review): the output names reuse the 'clip_edit_' prefix, so an
    # interpolation run and an edit run with the same input name and figname
    # write to the same paths. Kept for compatibility; confirm whether a
    # dedicated prefix was intended.
    finalpath = os.path.join(folder, f'clip_edit_{f_name}_' + figname + ".gif")
    tmp_path = os.path.join(folder, f"clip_edit_subfigures_{figname}")
    os.makedirs(tmp_path, exist_ok=True)

    print("Generate the videos..")
    frames = generate_by_video({}, {}, generation,
                               lambda x: str(x), params, w, h, tmp_path, mode='interp')

    print(f"Writing video [{finalpath}]")
    imageio.mimsave(finalpath, frames, fps=params["fps"])


def viz_clip_edit(model, datasets, edit_csv, epoch, params, folder):
    """Apply latent-space arithmetic edits to motions and save a GIF.

    For every row of ``edit_csv`` three latent codes are obtained for
    ``row['base']``, ``row['v_start']`` and ``row['v_end']`` and the edited
    code ``base - v_start + v_end`` is appended as a fourth entry titled
    'Result'. All four are generated and rendered into one grid.

    Args:
        model: model exposing ``encoder``, ``lengths_to_mask``,
            ``clip_model.encode_text``, ``generate`` and ``eval``; its
            ``outputxyz`` attribute is set to True.
        datasets: dict of dataset splits used to look up motions by text.
        edit_csv: iterable of rows with 'base', 'v_start', 'v_end' and
            'motion_source' entries. 'motion_source' is 'data' (encode the
            retrieved motions) or 'text' (encode the texts with CLIP).
        epoch: value formatted into ``params["figname"]`` and the log line.
        params: dict; reads "figname", "device", "num_frames", "input_file"
            and "fps".
        folder: output directory for the GIF and the per-sample sub-figures.

    Raises:
        ValueError: if a row has an unknown 'motion_source'.
    """
    # visualize with joints3D
    model.outputxyz = True
    # Eval mode must be active before the features are encoded below, not
    # only before generation.
    model.eval()

    print(f"Visualization of the epoch {epoch}")
    figname = params["figname"].format(epoch)
    motion_collection = get_motion_text_mapping(datasets)

    # prepare motion representations
    all_clip_features = []
    all_texts = []
    with torch.no_grad():
        for line in edit_csv:
            # Get CLIP features
            texts = [line['base'], line['v_start'], line['v_end']]
            if line['motion_source'] == 'data':
                retrieved_motions = retrieve_motions(datasets, motion_collection, texts, params['device'])
                clip_features = encode_motions(model, retrieved_motions, params['device'])
            elif line['motion_source'] == 'text':
                clip_tokens = clip.tokenize(texts).to(params['device'])
                clip_features = model.clip_model.encode_text(clip_tokens).float()
            else:
                raise ValueError(
                    "motion_source must be 'data' or 'text', got {!r}".format(line['motion_source']))

            # Make edit
            result_features = clip_features[0] - clip_features[1] + clip_features[2]
            all_clip_features.append(torch.cat([clip_features, result_features.unsqueeze(0)]))
            texts.append('Result')
            all_texts.append(texts)

        all_clip_features = torch.transpose(torch.stack(all_clip_features, dim=0), 0, 1)
        all_texts = np.array(all_texts).T
        h, w = all_clip_features.shape[:2]
        gendurations = torch.ones((h*w, 1), dtype=int) * params['num_frames']

        # generate the repr (joints3D/pose etc)
        generation = model.generate(all_clip_features, gendurations,
                                    is_amass=True,
                                    is_clip_features=True)
    generation['y'] = all_texts.reshape([-1])

    # Fold the flat batch axis back into the (h, w) grid for every output.
    for key, val in generation.items():
        if len(val.shape) == 1:
            generation[key] = val.reshape(h, w)
        else:
            generation[key] = val.reshape(h, w, *val.shape[1:])

    # Pure string operation: no need for the file to exist, and f_name is
    # always bound.
    f_name = os.path.basename(params['input_file'].replace('.csv', ''))
    finalpath = os.path.join(folder, f'clip_edit_{f_name}_' + figname + ".gif")
    tmp_path = os.path.join(folder, f"clip_edit_subfigures_{figname}")
    os.makedirs(tmp_path, exist_ok=True)

    print("Generate the videos..")
    frames = generate_by_video({}, {}, generation,
                               lambda x: str(x), params, w, h, tmp_path, mode='edit')

    print(f"Writing video [{finalpath}]")
    imageio.mimsave(finalpath, frames, fps=params["fps"])


def stack_images_sequence(visu):
    """Flatten a 6-D grid of clips into one image per frame.

    ``visu`` is indexed by frame first; for every frame the remaining 5-D
    grid is merged into a single image. Returns the images stacked along a
    new leading frame axis.
    """
    print("Stacking frames..")
    # Unpacking doubles as a check that the input really is 6-D.
    nframes, _nspa, _nats, _h, _w, _pix = visu.shape
    stacked = []
    for idx in tqdm(range(nframes)):
        # (nspa, nats, h, w, pix) -> (nats, h, w, pix, nspa)
        tiles = visu[idx].transpose(1, 2, 3, 4, 0)
        # Merge the first two axes, then bring the last axis to the front:
        # (nspa, w, nats*h, pix), one transposed strip per entry.
        strips = np.vstack(tiles).transpose(3, 1, 0, 2)
        image = np.concatenate(strips).transpose(1, 0, 2)
        stacked.append(image)
    return np.stack(stacked)


def get_gpu_device():
    """Return the index of the first GPU with enough free memory.

    Runs ``nvidia-smi --query-gpu=memory.free --format=csv`` and compares the
    leading number of every data row against ``GPU_MINIMUM_MEMORY``.

    Returns:
        Zero-based position of the first row whose free memory exceeds
        ``GPU_MINIMUM_MEMORY``.

    Raises:
        RuntimeError: if no GPU has more free memory than the threshold.
            (Previously the exception object was created but never raised,
            so the function silently returned None.)
    """
    command = "nvidia-smi --query-gpu=memory.free --format=csv"
    output = sp.check_output(command.split()).decode('ascii')
    # The first line is the CSV header; drop it and any blank lines.
    rows = [row for row in output.splitlines()[1:] if row.strip()]
    for gpu_idx, row in enumerate(rows):
        # The number is the first whitespace-separated token of the row.
        if int(row.split()[0]) > GPU_MINIMUM_MEMORY:
            return gpu_idx
    raise RuntimeError('No GPU with required memory')


def get_motion_text_mapping(datasets):
    """Build (or load from cache) the mapping from text label to motions.

    The mapping sends every ``d['clip_text']`` found in the datasets to a
    list of ``(split_name, index)`` pairs. It is cached next to the first
    split's ``datapath`` as a ``.npy`` file, together with a sorted plain-text
    list of all labels.

    Args:
        datasets: dict mapping split names to datasets. Each dataset needs a
            ``datapath`` attribute and must yield items with a 'clip_text'
            entry when iterated.

    Returns:
        The text-to-motions mapping (loaded from the cache file if present).
    """
    print('Building text-motion mapping...')
    split_names = list(datasets.keys())
    collection_path = datasets[split_names[0]].datapath.replace('.pt', '_text_labels.txt')
    if len(split_names) > 1:
        # Several splits share one file: swap the first split's name for 'all'.
        assert split_names[0] in os.path.basename(collection_path)
        _base = os.path.basename(collection_path).replace(split_names[0], 'all')
        collection_path = os.path.join(os.path.dirname(collection_path), _base)
    cache_path = collection_path.replace('.txt', '.npy')

    # load if exists (checked once, so the message and the action agree)
    cache_exists = os.path.isfile(cache_path)
    word = 'Loading' if cache_exists else 'Saving'
    print('{} list of text labels in current dataset to [{}]:'.format(word, collection_path))
    print('Look it up next time you want to retrieve new motions using textual labels.')

    if cache_exists:
        # NOTE: allow_pickle=True unpickles the file; only load caches you
        # trust. The [None][0] indexing unwraps the saved object from the
        # array np.save stored it in.
        return np.load(cache_path, allow_pickle=True)[None][0]

    motion_collection = {}
    for split_name, data in datasets.items():
        for i, d in tqdm(enumerate(data)):
            # Append in place instead of rebuilding the list on every hit.
            motion_collection.setdefault(d['clip_text'], []).append((split_name, i))

    with open(collection_path, 'w') as fw:
        fw.write('\n'.join(sorted(motion_collection)) + '\n')
    np.save(cache_path, motion_collection)

    return motion_collection

def retrieve_motions(datasets, motion_collection, texts, device):
    """Fetch one motion per text label and batch them on ``device``.

    For every text the first ``(split, index)`` pair registered in
    ``motion_collection`` is used; the 'inp' entry of that dataset item gets
    a leading batch axis and is moved to ``device``. The results are
    concatenated along that batch axis, in the order of ``texts``.
    """
    batch = []
    for label in texts:
        split_name, item_index = motion_collection[label][0]
        motion = datasets[split_name][item_index]['inp']
        batch.append(motion.unsqueeze(0).to(device))
    return torch.cat(batch, dim=0)

def encode_motions(model, motions, device, num_frames=60):
    """Encode a batch of motions and return the encoder's "mu" output.

    Args:
        model: object exposing ``encoder`` and ``lengths_to_mask``.
        motions: batch of motions; ``motions.shape[0]`` is the batch size.
        device: device on which the label and length tensors are created.
        num_frames: sequence length passed to ``model.lengths_to_mask`` for
            every motion. Defaults to 60, the value that used to be
            hard-coded here.

    Returns:
        The "mu" entry of the dict returned by ``model.encoder``.
    """
    batch_size = motions.shape[0]
    # Every motion in the batch is given the same length.
    lengths = torch.ones(batch_size, dtype=int, device=device) * num_frames
    batch = {'x': motions,
             # All-zero labels, one per motion.
             'y': torch.zeros(batch_size, dtype=int, device=device),
             'mask': model.lengths_to_mask(lengths)}
    return model.encoder(batch)["mu"]