diff --git a/README.md b/README.md
index 6fafda79804e471dfe58871843d3fdaa2d1fee30..076481736c15d48a1c69998a075f72561721293c 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,13 @@
 ---
-title: OneLLM
+title: Tar
 emoji: πŸš€
 colorFrom: red
 colorTo: indigo
 sdk: gradio
-sdk_version: 4.7.1
+sdk_version: 5.34.0
 app_file: app.py
 pinned: false
-python_version: 3.9.18
+python_version: 3.10.18
 ---
 
-# OneLLM: One Framework to Align All Modalities with Language
+# Tar: Unifying Visual Understanding and Generation via Text-Aligned Representations
\ No newline at end of file
diff --git a/app.py b/app.py
index 1fa0e489394435324252b12acf750ba0cb5a5e46..b17609953435f771aad30f9b763fb665cb26064e 100644
--- a/app.py
+++ b/app.py
@@ -1,457 +1,111 @@
-import sys
 import os
-import argparse
-import multiprocessing as mp
-import numpy as np
-from typing import List, Optional
-
-import torch
-import torch.distributed as dist
-
-from fairscale.nn.model_parallel import initialize as fs_init
-
 import gradio as gr
-from util.misc import setup_for_distributed
-from util.misc import default_tensor_type
-from model.meta import MetaModel
-from data.conversation_lib import conv_templates, SeparatorStyle
-from PIL import Image
-import torchvision.transforms as transforms
-from data.fintune_dataset import make_audio_features
-from data import video_utils
-from dataclasses import dataclass
-from huggingface_hub import hf_hub_download
-import plotly.graph_objects as go
-from data.fintune_dataset import pc_norm
-from functools import partial
-import glob
-import torchvision.transforms.functional as F
+from torchvision.transforms.functional import to_tensor
+from huggingface_hub import hf_hub_download, login
 
-T_random_resized_crop = transforms.Compose([
-    transforms.RandomResizedCrop(size=(224, 224), scale=(0.9, 1.0), ratio=(0.75, 1.3333), interpolation=3,
-                                 antialias=None),  # 3 is bicubic
-    transforms.ToTensor(),
-    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])])
+from t2i_inference import T2IConfig, TextToImageInference
 
-class PairRandomResizedCrop(transforms.RandomResizedCrop):
-    def forward(self, imgs):
-        i, j, h, w = self.get_params(imgs[0], self.scale, self.ratio)
-        return [F.resized_crop(img, i, j, h, w, self.size, self.interpolation, antialias=self.antialias) for img in imgs]
-
-class PairToTensor(transforms.ToTensor):
-    def __call__(self, pics):
-        return [F.to_tensor(pic) for pic in pics]
-
-class PairNormalize(transforms.Normalize):
-    def forward(self, tensors):
-        return [F.normalize(tensor, self.mean, self.std, self.inplace) for tensor in tensors]
+def generate_text(self, image: str, prompt: str) -> str:
+    image = image.convert('RGB')
+    image = to_tensor(image).unsqueeze(0).to(self.device)
 
-transform_pairimg_train = transforms.Compose([
-    PairRandomResizedCrop(size=(224, 224), scale=(0.99, 1.0), ratio=(0.75, 1.3333), interpolation=3, antialias=None),  # 3 is bicubic
-    PairToTensor(),
-    PairNormalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])])
-
-def load_audio(audio_path):
-    fbank = make_audio_features(audio_path, mel_bins=128)
-    fbank = fbank.transpose(0, 1)[None] #[1, 128, 1024]
-    return fbank
+    image_code = self.visual_tokenizer.encoder(image)['bottleneck_rep']
+    image_text = "".join([f"" for x in image_code[0].cpu().tolist()])
 
-def load_video(video_path):
-    video_feats = video_utils.load_and_transform_video_data(video_path, video_path, clip_duration=1, clips_per_video=5)
-    return video_feats[:, :, 0]
-
-def load_point(point_path):
-    point_feat = np.load(point_path)
-    point_feat = torch.tensor(point_feat)
-    point_feat = pc_norm(point_feat)
-    return point_feat
-
-def load_fmri(fmri_path):
-    data = np.load(fmri_path)
-    data = data.mean(axis=0)
-    data = torch.tensor(data[None])
-    return data
-
-def load_rgbx(image_path, x_image_path):
-    # trick: replace path if 'depth_scaled' in path
-    x_image_path = x_image_path.replace('depth_scaled', 'depth')
-
-    image = Image.open(image_path).convert('RGB')
-    x_image = Image.open(x_image_path).convert('RGB')
-    x_image = x_image.resize(image.size[-2:])
-
-    image, x_image = transform_pairimg_train([image, x_image])
-
-    # [2, 3, H, W]
-    image = torch.stack([image, x_image], dim=0)
-    return image
-
-
-class Ready: pass
-
-
-def model_worker(
-    rank: int, args: argparse.Namespace, barrier: mp.Barrier,
-    request_queue: mp.Queue, response_queue: Optional[mp.Queue] = None,
-) -> None:
-    """
-    The worker function that manipulates the GPU to run the inference.
-    Exact n_gpu workers are started, with each one operating on a separate GPU.
-
-    Args:
-        rank (int): Distributed rank of the worker.
-        args (argparse.Namespace): All command line arguments.
-        barrier (multiprocessing.Barrier): A barrier used to delay the start
-            of Web UI to be after the start of the model.
-    """
-
-    world_size = len(args.gpu_ids)
-    gpu_id = args.gpu_ids[rank]
-    dist.init_process_group(
-        backend="nccl", rank=rank, world_size=world_size,
-        init_method=f"tcp://{args.master_addr}:{args.master_port}",
-    )
-    print(f"| distributed init on worker {rank}/{world_size}. "
-          f"using gpu: {gpu_id}")
-    fs_init.initialize_model_parallel(world_size)
-    torch.cuda.set_device(gpu_id)
-
-    torch.manual_seed(1)
-    np.random.seed(1)
-
-    # set the print behavior.
-    setup_for_distributed(rank == 0)
-
-    target_dtype = {
-        "bf16": torch.bfloat16,
-        "fp16": torch.float16
-    }[args.dtype]
-    with default_tensor_type(dtype=target_dtype, device="cuda"):
-        model = MetaModel(args.llama_type, args.llama_config, tokenizer_path=args.tokenizer_path)
-    for ckpt_id in range(args.num_ckpts):
-        ckpt_path = hf_hub_download(repo_id=args.pretrained_path, filename=args.ckpt_format.format(str(ckpt_id)))
-        # ckpt_path = os.path.join(args.pretrained_path, args.ckpt_format.format(str(ckpt_id)))
-        print(f"Loading pretrained weights {ckpt_path}")
-        checkpoint = torch.load(ckpt_path, map_location='cpu')
-        msg = model.load_state_dict(checkpoint, strict=False)
-        # print("load result:\n", msg)
-    model.cuda()
-    model.eval()
-    print(f"Model = {str(model)}")
-
-    barrier.wait()
-
-    while True:
-        if response_queue is not None:
-            response_queue.put(Ready())
-        img_path, audio_path, video_path, point_path, fmri_path, depth_path, depth_rgb_path, normal_path, normal_rgb_path, chatbot, max_gen_len, temperature, top_p, modality = request_queue.get()
-        try:
-            if 'image' in modality and img_path is not None:
-                image = Image.open(img_path).convert('RGB')
-                inputs = T_random_resized_crop(image)
-            elif 'video' in modality and video_path is not None:
-                inputs = load_video(video_path)
-            elif 'audio' in modality and audio_path is not None:
-                inputs = load_audio(audio_path)
-            elif 'point' in modality and point_path is not None:
-                inputs = load_point(point_path)
-            elif 'fmri' in modality and fmri_path is not None:
-                inputs = load_fmri(fmri_path)
-            elif 'rgbd' in modality and depth_path is not None and depth_rgb_path is not None:
-                inputs = load_rgbx(depth_rgb_path, depth_path)
-            elif 'rgbn' in modality and normal_path is not None and normal_rgb_path is not None:
-                inputs = load_rgbx(normal_rgb_path, normal_path)
-            else:
-                inputs = None
-        except:
-            inputs = None
-
-        if inputs is not None:
-            inputs = inputs[None].cuda().to(target_dtype)
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": f"{image_text}\n{prompt}"}
+    ]
 
-        conv = conv_templates["v1"].copy()
-        for user, bot in chatbot:
-            conv.append_message(conv.roles[0], user)
-            conv.append_message(conv.roles[1], bot)
-
-        with torch.cuda.amp.autocast(dtype=target_dtype):
-            print(conv.get_prompt())
-            for stream_response in model.stream_generate(
-                conv.get_prompt(), inputs,
-                max_gen_len=max_gen_len, temperature=temperature, top_p=top_p,
-                modal = modality
-            ):
-                conv_sep = (
-                    conv.sep
-                    if conv.sep_style == SeparatorStyle.SINGLE
-                    else conv.sep2
-                )
-                end_pos = stream_response["text"].find(conv_sep)
-                if end_pos != -1:
-                    stream_response["text"] = (
-                        stream_response['text'][:end_pos].rstrip() + "\n"
-                    )
-                    stream_response["end_of_content"] = True
-
-                # keep a few characters if not end_of_content to avoid sending
-                # part of conv_sep before all of it is generated.
-                if not stream_response["end_of_content"]:
-                    if len(stream_response["text"]) < len(conv_sep):
-                        continue
-                    stream_response["text"] = (
-                        stream_response["text"][:-len(conv_sep)]
-                    )
-
-                if response_queue is not None:
-                    response_queue.put(stream_response)
-
-                if stream_response["end_of_content"]:
-                    break
-
-
-def gradio_worker(
-    request_queues: List[mp.Queue], response_queue: mp.Queue,
-    args: argparse.Namespace, barrier: mp.Barrier,
-) -> None:
-    """
-    The gradio worker is responsible for displaying the WebUI and relay the
-    requests to model workers. It should be launched only once.
+    input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = self.tokenizer(input_text, return_tensors="pt")
+
+    gen_ids = self.model.generate(
+        inputs.input_ids.to(self.device),
+        max_new_tokens=512,
+        do_sample=True)
+    return self.tokenizer.batch_decode(gen_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]
+
+login(token=os.getenv('HF_TOKEN'))
+config = T2IConfig()
+config.ar_path = hf_hub_download("csuhan/TA-Tok", "ar_dtok_lp_512px.pth")
+config.encoder_path = hf_hub_download("csuhan/TA-Tok", "ta_tok.pth")
+config.decoder_path = hf_hub_download("peizesun/llamagen_t2i", "vq_ds16_t2i.pt")
+inference = TextToImageInference(config)
+
+def generate_image(prompt, top_p, top_k, cfg_scale):
+    config.top_p = top_p
+    config.top_k = top_k
+    config.cfg_scale = cfg_scale
+    image = inference.generate_image(prompt)
+    return image
 
-    Args:
-        request_queues (List[mp.Queue]): A list of request queues (one for
-            each model worker).
-        args (argparse.Namespace): All command line arguments.
-        barrier (multiprocessing.Barrier): A barrier used to delay the start
-            of Web UI to be after the start of the model.
- """ +def clear_inputs_t2i(): + return "", None - def show_user_input(msg, chatbot): - return "", chatbot + [[msg, None]] +def understand_image(image, prompt): + return generate_text(inference, image, prompt) - def stream_model_output(img_path, audio_path, video_path, point_path, fmri_path, depth_path, depth_rgb_path, normal_path, normal_rgb_path, chatbot, max_gen_len, gen_t, top_p, modality): - while True: - content_piece = response_queue.get() - if isinstance(content_piece, Ready): - break - for queue in request_queues: - queue.put((img_path, audio_path, video_path, point_path, fmri_path, depth_path, depth_rgb_path, normal_path, normal_rgb_path, chatbot, max_gen_len, gen_t, top_p, modality)) - while True: - content_piece = response_queue.get() - chatbot[-1][1] = content_piece["text"] - yield chatbot - if content_piece["end_of_content"]: - break +def clear_inputs_i2t(): + return None, "" - def undo(chatbot): - if len(chatbot) > 0: - chatbot = chatbot[:-1] - return chatbot +with gr.Blocks(theme=gr.themes.Soft()) as demo: + gr.Markdown( + """ +
-    def clear():
-        chatbot = []
-        msg = ""
-        return chatbot, msg
-
-    def show_point_cloud(file):
-        point = load_point(file).numpy()
-        fig = go.Figure(
-            data=[
-                go.Scatter3d(
-                    x=point[:,0], y=point[:,1], z=point[:,2],
-                    mode='markers',
-                    marker=dict(
-                        size=1.2,
-                        color=['rgb({},{},{})'.format(r, g, b) for r,g,b in zip(point[:,3], point[:,4], point[:,5])]
-                    ))],
-            layout=dict(
-                scene=dict(
-                    xaxis=dict(visible=False),
-                    yaxis=dict(visible=False),
-                    zaxis=dict(visible=False)
-                )),)
-        return fig
-
-    def change_modality(modal):
-        return modal
+        ### Tar: Unifying Visual Understanding and Generation via Text-Aligned Representations
 
-    CSS ="""
-    .contain { display: flex; flex-direction: column; }
-    #component-0 { height: 100%; }
-    #chatbot { flex-grow: 1; overflow: auto;}
-    """
+        [πŸ“„ Paper](https://arxiv.org/abs/xxxx.xxxxx) β€’ [πŸ’» Code](https://github.com/csuhan/Tar) β€’ [πŸ“¦ Model](https://huggingface.co/csuhan/TA-Tok)
 
-    header="""
-    ## OneLLM: One Framework to Align All Modalities with Language
-    [[Project Page](https://onellm.csuhan.com)] [[Paper](https://arxiv.org/abs/2312.03700)] [[Code](https://github.com/csuhan/OneLLM)]
-    """
-
-    with gr.Blocks(css=CSS, theme=gr.themes.Base()) as demo:
-        gr.Markdown(header)
-        with gr.Row(equal_height=True):
-            modality = gr.Textbox(value='image', visible=False)
+
+ """, + elem_id="title", + ) + with gr.Tab("Image Generation"): + with gr.Row(): + with gr.Column(scale=1): + prompt = gr.Textbox(label="Prompt", placeholder="Enter a prompt") + with gr.Accordion("Advanced Settings", open=False): + top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p") + top_k = gr.Slider(1, 2000, value=1200, step=10, label="Top-k") + cfg_scale = gr.Slider(1.0, 20.0, value=4.0, step=0.5, label="CFG Scale") + with gr.Row(): + generate_btn = gr.Button("Generate") + clear_btn = gr.Button("Clear") + with gr.Column(scale=2): + output_image = gr.Image(label="Generated Image") + + generate_btn.click( + generate_image, + inputs=[prompt, top_p, top_k, cfg_scale], + outputs=output_image + ) + clear_btn.click( + clear_inputs_t2i, + outputs=[prompt, output_image] + ) + + with gr.Tab("Image Understanding"): + with gr.Row(): with gr.Column(scale=1): - with gr.Tab('Image') as img_tab: - img_path = gr.Image(label='Image Input', type='filepath') - gr.Examples( - examples=[ - "examples/new_york.jpg", - "examples/food_menu.png", - ], - inputs=[img_path], - ) - with gr.Tab('Video') as video_tab: - video_path = gr.Video(label='Video Input', max_length=180) - gr.Examples( - examples=[ - "examples/flower.mp4", - "examples/star_kun.mp4", - ], - inputs=[video_path], - ) - with gr.Tab('Audio') as audio_tab: - audio_path = gr.Audio(label='Audio Input', type='filepath', sources=['upload']) - gr.Examples( - examples=[ - "examples/bell_ring.wav", - "examples/bird_audio.wav", - ], - inputs=[audio_path], - ) - with gr.Tab('Point Cloud') as point_tab: - point_path = gr.File(label='Point Cloud Input', elem_id="pointpath", elem_classes="") - point_vis = gr.Plot() - btn = gr.Button(value="Show Point Cloud") - btn.click(show_point_cloud, point_path, point_vis) - gr.Examples( - examples=glob.glob("examples/point/*.npy"), - inputs=[point_path], - examples_per_page=5, - ) - with gr.Tab('IMU') as imu_tab: - gr.Markdown('Coming soonπŸ€—') - with gr.Tab('fMRI') as fmri_tab: - fmri_path = gr.File(label='fMRI Input', elem_id="fmripath", elem_classes="") - fmri_image_path = gr.Image(label='Reference Image', interactive=False) - gr.Examples( - examples=[ - [file.replace('.jpg', '.npy'), file] - for file in glob.glob("examples/fmri/*.jpg") - ], - inputs=[fmri_path, fmri_image_path], - examples_per_page=3, - ) - with gr.Tab('Depth Map') as depth_tab: - depth_path = gr.Image(label='Depth Map', type='filepath') - depth_rgb_path = gr.Image(label='RGB Image', type='filepath') - gr.Examples( - examples=[ - [rgb_image.replace('rgb', 'depth_scaled'), rgb_image] - for rgb_image in glob.glob("examples/depth_normal/rgb/*.png")[:9] - ], - inputs=[depth_path, depth_rgb_path], - examples_per_page=3, - ) - with gr.Tab('Normal Map') as normal_tab: - normal_path = gr.Image(label='Normal Map', type='filepath') - normal_rgb_path = gr.Image(label='RGB Image', type='filepath') - gr.Examples( - examples=[ - [rgb_image.replace('rgb', 'normal'), rgb_image] - for rgb_image in glob.glob("examples/depth_normal/rgb/*.png")[9:] - ], - inputs=[normal_path, normal_rgb_path], - examples_per_page=3, - ) - with gr.Column(scale=2): - chatbot = gr.Chatbot(elem_id="chatbot") - msg = gr.Textbox() - + image_input = gr.Image(label="Upload Image", type="pil") + question_input = gr.Textbox(label="Instruction", value="Describe the image shortly.") with gr.Row(): - submit_button = gr.Button("Submit", variant="primary") - undo_button = gr.Button("Undo") - clear_button = gr.ClearButton([chatbot, msg, img_path, audio_path, video_path, point_path, 
-                with gr.Row():
-                    max_gen_len = gr.Slider(
-                        minimum=1, maximum=args.model_max_seq_len // 2,
-                        value=args.model_max_seq_len // 2, interactive=True,
-                        label="Single-turn max response length",
-                    )
-                    gen_t = gr.Slider(
-                        minimum=0, maximum=1, value=0.1, interactive=True,
-                        label="Temperature",
-                    )
-                    top_p = gr.Slider(
-                        minimum=0, maximum=1, value=0.75, interactive=True,
-                        label="Top-p",
-                    )
-
-        img_tab.select(partial(change_modality, 'image'), [], [modality])
-        video_tab.select(partial(change_modality, 'video'), [], [modality])
-        audio_tab.select(partial(change_modality, 'audio'), [], [modality])
-        point_tab.select(partial(change_modality, 'point'), [], [modality])
-        fmri_tab.select(partial(change_modality, 'fmri'), [], [modality])
-        depth_tab.select(partial(change_modality, 'rgbd'), [], [modality])
-        normal_tab.select(partial(change_modality, 'rgbn'), [], [modality])
-
-        img_path.change(clear, [], [chatbot, msg])
-        audio_path.change(clear, [], [chatbot, msg])
-        video_path.change(clear, [], [chatbot, msg])
-        point_path.change(clear, [], [chatbot, msg])
-        fmri_path.change(clear, [], [chatbot, msg])
-        depth_path.change(clear, [], [chatbot, msg])
-        normal_path.change(clear, [], [chatbot, msg])
+                    qa_btn = gr.Button("Generate")
+                    clear_btn_i2t = gr.Button("Clear")
+            with gr.Column(scale=1):
+                answer_output = gr.Textbox(label="Response", lines=4)
 
-        msg.submit(
-            show_user_input, [msg, chatbot], [msg, chatbot],
-        ).then(
-            stream_model_output, [img_path, audio_path, video_path, point_path, fmri_path, depth_path, depth_rgb_path, normal_path, normal_rgb_path, chatbot, max_gen_len, gen_t, top_p, modality], chatbot,
+        qa_btn.click(
+            understand_image,
+            inputs=[image_input, question_input],
+            outputs=answer_output
         )
-        submit_button.click(
-            show_user_input, [msg, chatbot], [msg, chatbot],
-        ).then(
-            stream_model_output, [img_path, audio_path, video_path, point_path, fmri_path, depth_path, depth_rgb_path, normal_path, normal_rgb_path, chatbot, max_gen_len, gen_t, top_p, modality], chatbot,
-        )
-        undo_button.click(undo, chatbot, chatbot)
-    barrier.wait()
-    demo.queue(api_open=True).launch(share=True, max_threads=1)
-
-
-@dataclass
-class DemoConfig:
-    gpu_ids = [0]
-    tokenizer_path = "config/llama2/tokenizer.model"
-    llama_type = "onellm"
-    llama_config = "config/llama2/7B.json"
-    model_max_seq_len = 2048
-    pretrained_path = "csuhan/OneLLM-7B-hf"
-    # pretrained_path = "/home/pgao/jiaming/weights/7B_v20_splits/"
-    ckpt_format = "consolidated.00-of-01.s{}.pth"
-    num_ckpts = 10
-    master_port = 23863
-    master_addr = "127.0.0.1"
-    dtype = "fp16"
-
-if __name__ == "__main__":
-    args = DemoConfig()
-
-    # using the default "fork" method messes up some imported libs (e.g.,
-    # pandas)
-    # mp.set_start_method("spawn")
-    # setup the queues and start the model workers
-    request_queues = []
-    response_queue = mp.Queue()
-    worker_processes = []
-    barrier = mp.Barrier(len(args.gpu_ids) + 1)
-    for rank, gpu_id in enumerate(args.gpu_ids):
-        request_queue = mp.Queue()
-        rank_response_queue = response_queue if rank == 0 else None
-        process = mp.Process(
-            target=model_worker,
-            args=(rank, args, barrier, request_queue, rank_response_queue),
+        clear_btn_i2t.click(
+            clear_inputs_i2t,
+            outputs=[image_input, question_input, answer_output]
         )
-        process.start()
-        worker_processes.append(process)
-        request_queues.append(request_queue)
-    gradio_worker(request_queues, response_queue, args, barrier)
+demo.launch(share=True)
diff --git
a/config/llama2/7B.json b/config/llama2/7B.json deleted file mode 100644 index 6523f76675b50e9cf3a57d1fb135189abcffe1c7..0000000000000000000000000000000000000000 --- a/config/llama2/7B.json +++ /dev/null @@ -1 +0,0 @@ -{"dim": 4096, "multiple_of": 256, "n_heads": 32, "n_layers": 32, "norm_eps": 1e-05, "vocab_size": -1} diff --git a/config/llama2/tokenizer.model b/config/llama2/tokenizer.model deleted file mode 100644 index 6c00c742ce03c627d6cd5b795984876fa49fa899..0000000000000000000000000000000000000000 --- a/config/llama2/tokenizer.model +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 -size 499723 diff --git a/data/__pycache__/conversation_lib.cpython-310.pyc b/data/__pycache__/conversation_lib.cpython-310.pyc deleted file mode 100644 index 7104daf11059185efd723e40160c7debf191003b..0000000000000000000000000000000000000000 Binary files a/data/__pycache__/conversation_lib.cpython-310.pyc and /dev/null differ diff --git a/data/__pycache__/conversation_lib.cpython-39.pyc b/data/__pycache__/conversation_lib.cpython-39.pyc deleted file mode 100644 index fdca3a64c8523a2b4439b3e0c894c64b2020f486..0000000000000000000000000000000000000000 Binary files a/data/__pycache__/conversation_lib.cpython-39.pyc and /dev/null differ diff --git a/data/__pycache__/fintune_dataset.cpython-310.pyc b/data/__pycache__/fintune_dataset.cpython-310.pyc deleted file mode 100644 index 2e4e08c37f7d6e25b19727d2847d34fc1fdd0c8e..0000000000000000000000000000000000000000 Binary files a/data/__pycache__/fintune_dataset.cpython-310.pyc and /dev/null differ diff --git a/data/__pycache__/fintune_dataset.cpython-39.pyc b/data/__pycache__/fintune_dataset.cpython-39.pyc deleted file mode 100644 index 45989953b86dec0a2084a59127bc1028067b8640..0000000000000000000000000000000000000000 Binary files a/data/__pycache__/fintune_dataset.cpython-39.pyc and /dev/null differ diff --git a/data/__pycache__/imu_utils.cpython-310.pyc b/data/__pycache__/imu_utils.cpython-310.pyc deleted file mode 100644 index cae8e9e22e039ecc388eb3193589fa83b5c3847b..0000000000000000000000000000000000000000 Binary files a/data/__pycache__/imu_utils.cpython-310.pyc and /dev/null differ diff --git a/data/__pycache__/imu_utils.cpython-39.pyc b/data/__pycache__/imu_utils.cpython-39.pyc deleted file mode 100644 index 46ddf01f39b937cad76df5c6201b4f62b441694d..0000000000000000000000000000000000000000 Binary files a/data/__pycache__/imu_utils.cpython-39.pyc and /dev/null differ diff --git a/data/__pycache__/video_utils.cpython-310.pyc b/data/__pycache__/video_utils.cpython-310.pyc deleted file mode 100644 index 4c007e0ea4f4c05fe867e0bc31077d4c6bc0fe79..0000000000000000000000000000000000000000 Binary files a/data/__pycache__/video_utils.cpython-310.pyc and /dev/null differ diff --git a/data/__pycache__/video_utils.cpython-39.pyc b/data/__pycache__/video_utils.cpython-39.pyc deleted file mode 100644 index 81b5c7d05231e85a9f4552385921740940514e39..0000000000000000000000000000000000000000 Binary files a/data/__pycache__/video_utils.cpython-39.pyc and /dev/null differ diff --git a/data/conversation_lib.py b/data/conversation_lib.py deleted file mode 100644 index 783fe0eb8f9dd425ec6c285e820f755d2e955a3b..0000000000000000000000000000000000000000 --- a/data/conversation_lib.py +++ /dev/null @@ -1,369 +0,0 @@ -import dataclasses -from enum import auto, Enum -from typing import List, Tuple - - -class SeparatorStyle(Enum): - """Different separator style.""" - SINGLE = auto() - TWO 
= auto() - MPT = auto() - - -@dataclasses.dataclass -class Conversation: - """A class that keeps all conversation history.""" - system: str - roles: List[str] - messages: List[List[str]] - offset: int - sep_style: SeparatorStyle = SeparatorStyle.SINGLE - sep: str = "###" - sep2: str = None - version: str = "Unknown" - - skip_next: bool = False - - def get_prompt(self): - if self.sep_style == SeparatorStyle.SINGLE: - ret = self.system + '\n\n' + self.sep - for role, message in self.messages: - if message: - if type(message) is tuple: - message, _, _ = message - ret += role + ": " + message + '\n' + self.sep - else: - ret += role + ":" - return ret - elif self.sep_style == SeparatorStyle.TWO: - seps = [self.sep, self.sep2] - ret = self.system + seps[0] - for i, (role, message) in enumerate(self.messages): - if message: - if type(message) is tuple: - message, _, _ = message - ret += role + ": " + message + seps[i % 2] - else: - ret += role + ":" - return ret - if self.sep_style == SeparatorStyle.MPT: - ret = self.system + self.sep - for role, message in self.messages: - if message: - if type(message) is tuple: - message, _, _ = message - ret += role + message + self.sep - else: - ret += role - return ret - else: - raise ValueError(f"Invalid style: {self.sep_style}") - - def append_message(self, role, message): - self.messages.append([role, message]) - - def get_images(self, return_pil=False): - images = [] - for i, (role, msg) in enumerate(self.messages[self.offset:]): - if i % 2 == 0: - if type(msg) is tuple: - import base64 - from io import BytesIO - from PIL import Image - msg, image, image_process_mode = msg - if image_process_mode == "Pad": - def expand2square(pil_img, background_color=(122, 116, 104)): - width, height = pil_img.size - if width == height: - return pil_img - elif width > height: - result = Image.new(pil_img.mode, (width, width), background_color) - result.paste(pil_img, (0, (width - height) // 2)) - return result - else: - result = Image.new(pil_img.mode, (height, height), background_color) - result.paste(pil_img, ((height - width) // 2, 0)) - return result - - image = expand2square(image) - elif image_process_mode == "Crop": - pass - elif image_process_mode == "Resize": - image = image.resize((224, 224)) - else: - raise ValueError(f"Invalid image_process_mode: {image_process_mode}") - max_hw, min_hw = max(image.size), min(image.size) - aspect_ratio = max_hw / min_hw - max_len, min_len = 800, 400 - shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) - longest_edge = int(shortest_edge * aspect_ratio) - W, H = image.size - if H > W: - H, W = longest_edge, shortest_edge - else: - H, W = shortest_edge, longest_edge - image = image.resize((W, H)) - if return_pil: - images.append(image) - else: - buffered = BytesIO() - image.save(buffered, format="JPEG") - img_b64_str = base64.b64encode(buffered.getvalue()).decode() - images.append(img_b64_str) - return images - - def to_gradio_chatbot(self): - ret = [] - for i, (role, msg) in enumerate(self.messages[self.offset:]): - if i % 2 == 0: - if type(msg) is tuple: - import base64 - from io import BytesIO - msg, image, image_process_mode = msg - max_hw, min_hw = max(image.size), min(image.size) - aspect_ratio = max_hw / min_hw - max_len, min_len = 800, 400 - shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) - longest_edge = int(shortest_edge * aspect_ratio) - W, H = image.size - if H > W: - H, W = longest_edge, shortest_edge - else: - H, W = shortest_edge, longest_edge - image = image.resize((W, H)) - # 
image = image.resize((224, 224)) - buffered = BytesIO() - image.save(buffered, format="JPEG") - img_b64_str = base64.b64encode(buffered.getvalue()).decode() - img_str = f'user upload image' - msg = msg.replace('', img_str) - ret.append([msg, None]) - else: - ret[-1][-1] = msg - return ret - - def copy(self): - return Conversation( - system=self.system, - roles=self.roles, - messages=[[x, y] for x, y in self.messages], - offset=self.offset, - sep_style=self.sep_style, - sep=self.sep, - sep2=self.sep2) - - def dict(self): - if len(self.get_images()) > 0: - return { - "system": self.system, - "roles": self.roles, - "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages], - "offset": self.offset, - "sep": self.sep, - "sep2": self.sep2, - } - return { - "system": self.system, - "roles": self.roles, - "messages": self.messages, - "offset": self.offset, - "sep": self.sep, - "sep2": self.sep2, - } - - -conv_v1 = Conversation( - system="A chat between a curious human and an artificial intelligence assistant. " - "The assistant gives helpful, detailed, and polite answers to the human's questions.", - roles=("Human", "Assistant"), - messages=( - ("Human", "Give three tips for staying healthy."), - ("Assistant", - "Sure, here are three tips for staying healthy:\n" - "1. Exercise regularly: Regular physical activity can help improve your overall health and wellbeing. " - "It can also help reduce your risk of chronic conditions such as obesity, diabetes, heart disease, " - "and certain cancers. Aim for at least 150 minutes of moderate-intensity aerobic exercise or " - "75 minutes of vigorous-intensity aerobic exercise per week, along with muscle-strengthening " - "activities at least two days per week.\n" - "2. Eat a balanced diet: Eating a balanced diet that is rich in fruits, " - "vegetables, whole grains, lean proteins, and healthy fats can help support " - "your overall health. Try to limit your intake of processed and high-sugar foods, " - "and aim to drink plenty of water throughout the day.\n" - "3. Get enough sleep: Getting enough quality sleep is essential for your physical " - "and mental health. Adults should aim for seven to nine hours of sleep per night. " - "Establish a regular sleep schedule and try to create a relaxing bedtime routine to " - "help improve the quality of your sleep.") - ), - offset=2, - sep_style=SeparatorStyle.SINGLE, - sep="###", -) - -conv_v1_2 = Conversation( - system="A chat between a curious human and an artificial intelligence assistant. " - "The assistant gives helpful, detailed, and polite answers to the human's questions.", - roles=("Human", "Assistant"), - messages=(), - - # ( - # ("Human", "What are the key differences between renewable and non-renewable energy sources?"), - # ("Assistant", - # "Renewable energy sources are those that can be replenished naturally in a relatively " - # "short amount of time, such as solar, wind, hydro, geothermal, and biomass. " - # "Non-renewable energy sources, on the other hand, are finite and will eventually be " - # "depleted, such as coal, oil, and natural gas. Here are some key differences between " - # "renewable and non-renewable energy sources:\n" - # "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable " - # "energy sources are finite and will eventually run out.\n" - # "2. 
Environmental impact: Renewable energy sources have a much lower environmental impact " - # "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, " - # "and other negative effects.\n" - # "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically " - # "have lower operational costs than non-renewable sources.\n" - # "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote " - # "locations than non-renewable sources.\n" - # "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different " - # "situations and needs, while non-renewable sources are more rigid and inflexible.\n" - # "6. Sustainability: Renewable energy sources are more sustainable over the long term, while " - # "non-renewable sources are not, and their depletion can lead to economic and social instability.\n") - # ) - offset = 2, - sep_style = SeparatorStyle.SINGLE, - sep = "###", - ) - -conv_vicuna_v1_1 = Conversation( - system="A chat between a curious user and an artificial intelligence assistant. " - "The assistant gives helpful, detailed, and polite answers to the user's questions.", - roles=("USER", "ASSISTANT"), - version="v1", - messages=(), - offset=0, - sep_style=SeparatorStyle.TWO, - sep=" ", - sep2="", -) - -conv_mpt = Conversation( - system="""<|im_start|>system -- You are a helpful language and vision assistant. -- You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language. -- You should follow the instructions carefully and explain your answers in detail.""", - roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), - version="mpt", - messages=(), - offset=0, - sep_style=SeparatorStyle.MPT, - sep="<|im_end|>", -) - -conv_mpt_text = Conversation( - system="""<|im_start|>system -- You are a helpful assistant chatbot trained by MosaicML. -- You answer questions. -- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user. -- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.""", - roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), - version="mpt", - messages=(), - offset=0, - sep_style=SeparatorStyle.MPT, - sep="<|im_end|>", -) - -conv_bair_v1 = Conversation( - system="BEGINNING OF CONVERSATION:", - roles=("USER", "GPT"), - messages=(), - offset=0, - sep_style=SeparatorStyle.TWO, - sep=" ", - sep2="", -) - -simple_conv = Conversation( - system="A chat between a curious human and an artificial intelligence assistant. " - "The assistant gives helpful, detailed, and polite answers to the human's questions.", - roles=("Human", "Assistant"), - messages=( - ("Human", "Hi!"), - ("Assistant", "Hi there! How can I help you today?") - ), - offset=2, - sep_style=SeparatorStyle.SINGLE, - sep="###", -) - -simple_conv_multimodal = Conversation( - system="You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab." - "You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." - "Follow the instructions carefully and explain your answers in detail.", - roles=("Human", "Assistant"), - messages=( - ("Human", "Hi!"), - ("Assistant", "Hi there! 
How can I help you today?\n") - ), - offset=2, - sep_style=SeparatorStyle.SINGLE, - sep="###", -) - -simple_conv_mpt_multimodal = Conversation( - system="""<|im_start|>system -- You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab. -- You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language. -- You should follow the instructions carefully and explain your answers in detail.""", - roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), - version="mpt", - messages=(), - offset=0, - sep_style=SeparatorStyle.MPT, - sep="<|im_end|>", -) - -simple_conv_legacy = Conversation( - system="You are LLaVA, a large language model trained by UW Madison WAIV Lab." - "You are designed to assist human with a variety of tasks using natural language." - "Follow the instructions carefully.", - roles=("Human", "Assistant"), - messages=( - ("Human", "Hi!\n\n### Response:"), - ("Assistant", "Hi there! How can I help you today?\n") - ), - offset=2, - sep_style=SeparatorStyle.SINGLE, - sep="###", -) - -conv_llava_v1 = Conversation( - system="You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab." - "You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." - "Follow the instructions carefully and explain your answers in detail.", - roles=("USER", "ASSISTANT"), - version="v1", - messages=(), - offset=0, - sep_style=SeparatorStyle.TWO, - sep=" ", - sep2="", -) - -default_conversation = conv_v1_2 -conv_templates = { - "default": conv_v1_2, - "simple": simple_conv, - "simple_legacy": simple_conv_legacy, - "multimodal": simple_conv_multimodal, - "mpt_multimodal": simple_conv_mpt_multimodal, - "llava_v1": conv_llava_v1, - - # fastchat - "v1": conv_v1_2, - "bair_v1": conv_bair_v1, - "vicuna_v1_1": conv_vicuna_v1_1, - "mpt": conv_mpt, - "mpt_text": conv_mpt_text, -} - -if __name__ == "__main__": - print(default_conversation.get_prompt()) diff --git a/data/fintune_dataset.py b/data/fintune_dataset.py deleted file mode 100644 index 6f787139d702fbc46ef5ca8189ade8f89a9c7df0..0000000000000000000000000000000000000000 --- a/data/fintune_dataset.py +++ /dev/null @@ -1,449 +0,0 @@ -import warnings - -import torch -import yaml -from torch.utils.data import Dataset -from PIL import Image -import json -from model.tokenizer import Tokenizer -import os -import torchvision.transforms as transforms -import random -import torchvision.transforms.functional as F -import torchaudio -from . import conversation_lib - -import numpy as np -from . 
import video_utils -from .imu_utils import get_imu_frames - - -IGNORE_INDEX = -100 - -DEFAULT_IMAGE_TOKEN = "" -try: - from torchvision.transforms import InterpolationMode - - BICUBIC = InterpolationMode.BICUBIC -except ImportError: - BICUBIC = Image.BICUBIC - -T_random_resized_crop = transforms.Compose([ - transforms.RandomResizedCrop(size=(224, 224), scale=(0.9, 1.0), ratio=(0.75, 1.3333), interpolation=BICUBIC, - antialias=None), # 3 is bicubic - transforms.ToTensor(), - transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])]) - - -# image transform -transform_img_train = transforms.Compose([ - transforms.RandomResizedCrop(size=(224, 224), scale=(0.9, 1.0), ratio=( - 0.75, 1.3333), interpolation=3, antialias=None), # 3 is bicubic - transforms.ToTensor(), - transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])]) - - -class PairRandomResizedCrop(transforms.RandomResizedCrop): - def forward(self, imgs): - i, j, h, w = self.get_params(imgs[0], self.scale, self.ratio) - return [F.resized_crop(img, i, j, h, w, self.size, self.interpolation, antialias=self.antialias) for img in imgs] - - -class PairToTensor(transforms.ToTensor): - def __call__(self, pics): - return [F.to_tensor(pic) for pic in pics] - - -class PairNormalize(transforms.Normalize): - def forward(self, tensors): - return [F.normalize(tensor, self.mean, self.std, self.inplace) for tensor in tensors] - - -transform_pairimg_train = transforms.Compose([ - PairRandomResizedCrop(size=(224, 224), scale=(0.9, 1.0), ratio=( - 0.75, 1.3333), interpolation=3, antialias=None), # 3 is bicubic - PairToTensor(), - PairNormalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])]) - - -def pc_norm(pc): - """ pc: NxC, return NxC """ - xyz = pc[:, :3] - other_feature = pc[:, 3:] - - centroid = torch.mean(xyz, dim=0) - xyz = xyz - centroid - m = torch.max(torch.sqrt(torch.sum(xyz ** 2, dim=1))) - xyz = xyz / m - - pc = torch.cat((xyz, other_feature), dim=1) - return pc - - -def make_audio_features(wav_name, mel_bins=128, target_length=1024, aug=False): - waveform, sr = torchaudio.load(wav_name) - # assert sr == 16000, 'input audio sampling rate must be 16kHz' - if sr != 16000: - trans = torchaudio.transforms.Resample(sr, 16000) - waveform = trans(waveform) - - waveform = waveform - waveform.mean() - - fbank = torchaudio.compliance.kaldi.fbank( - waveform, htk_compat=True, sample_frequency=16000, use_energy=False, - window_type='hanning', num_mel_bins=mel_bins, dither=0.0, frame_shift=10) - - n_frames = fbank.shape[0] - - p = target_length - n_frames - if p > 0: - m = torch.nn.ZeroPad2d((0, 0, 0, p)) - fbank = m(fbank) - elif p < 0: - fbank = fbank[0:target_length, :] - - if aug: - freqm = torchaudio.transforms.FrequencyMasking(48) - timem = torchaudio.transforms.TimeMasking(192) - fbank = torch.transpose(fbank, 0, 1) - fbank = fbank.unsqueeze(0) - fbank = freqm(fbank) - fbank = timem(fbank) - fbank = fbank.squeeze(0) - fbank = torch.transpose(fbank, 0, 1) - - fbank = (fbank - (-4.2677393)) / (4.5689974 * 2) - return fbank - - -class ConversationGenerator: - def __init__(self, tokenizer): - self.tokenizer = tokenizer - self.header = f"{conversation_lib.default_conversation.system}\n\n" - self._probe_tokenizer_style() - - def _probe_tokenizer_style(self): - """ - Given a sentence, e.g. 
"My darling", some tokenizers will make the space a seperate token, - while some others will merge the space into the next word, forming a token representing " darling". - Knowing which style the tokenizer takes is necessary for correct ground-truth label masking. - - """ - probe = "Probe am I" - sentence1 = self.tokenizer.encode(conversation_lib.default_conversation.roles[1] + ": " + probe, - bos=False, eos=False) - sentence2 = self.tokenizer.encode(probe, - bos=False, eos=False) - if sentence1[-len(sentence2):] == sentence2: - self.space_before_to_predict = False - else: - sentence3 = self.tokenizer.encode(" " + probe, - bos=False, eos=False) - assert sentence1[-len(sentence3):] == sentence3 - self.space_before_to_predict = True - - def add_speaker_and_signal(self, source, get_conversation=True): - """Add speaker and start/end signal on each round.""" - BEGIN_SIGNAL = "### " - END_SIGNAL = "\n" - conversation = self.header - - to_predict_list = [] - - for sentence in source: - from_str = sentence["from"] - if from_str.lower() in ["human"]: - from_str = conversation_lib.default_conversation.roles[0] - elif from_str.lower() in ["gpt", "assistant"]: - from_str = conversation_lib.default_conversation.roles[1] - else: - raise ValueError(f"unknown dialog role: {from_str.lower()}") - - value = sentence["value"] - if DEFAULT_IMAGE_TOKEN in value: - value = value.replace(DEFAULT_IMAGE_TOKEN, '').strip() - - sentence_value = BEGIN_SIGNAL + from_str + ": " + value + END_SIGNAL - - if from_str == conversation_lib.default_conversation.roles[1]: - to_predict_value = value + END_SIGNAL + "###" - if self.space_before_to_predict: - to_predict_value = " " + to_predict_value - to_predict_list.append(to_predict_value) - - if get_conversation: - conversation = conversation + sentence_value - - conversation = conversation + BEGIN_SIGNAL - return conversation, to_predict_list - - -DATASETS = dict( - image=[ - dict(path="datasets/InstructionTuning/image/llava_v1_5_mix665k_image.json", type='image'), - dict(path='datasets/InstructionTuning/image/cococap_train.json', type='image'), - dict(path="datasets/InstructionTuning/image/llava_v1_5_mix665k_text.json", type='text'), - ], - audio=[ - dict(path="datasets/InstructionTuning/audio/audiocap_train.json", type='audio'), - dict(path="datasets/InstructionTuning/audio/audiocap_val.json", type='audio'), - dict(path="datasets/InstructionTuning/audio/audio_conversation.json", type='audio'), - ], - video=[ - dict(path="datasets/InstructionTuning/video/msrvtt_cap_trainval.json", type='video'), - dict(path="datasets/InstructionTuning/video/msrvtt_cap_test.json", type='video'), - dict(path="datasets/InstructionTuning/video/msrvtt_vqa_train.json", type='video'), - dict(path="datasets/InstructionTuning/video/msrvtt_vqa_val.json", type='video'), - dict(path="datasets/InstructionTuning/video/msrvtt_vqa_test.json", type='video'), - dict(path="datasets/InstructionTuning/video/video_complex_reasoning_10k.json", type='video'), - dict(path="datasets/InstructionTuning/video/video_conversation_10k.json", type='video'), - dict(path="datasets/InstructionTuning/video/video_detail_10k.json", type='video'), - ], - point=[ - dict(path="datasets/InstructionTuning/point/pointllm_70k_formated.json", type='point'), - ], - rgbd=[ - dict(path="datasets/InstructionTuning/depth_normal/llava_instruct_50k_depth.json", type='rgbd'), - ], - rgbn=[ - dict(path="datasets/InstructionTuning/depth_normal/llava_instruct_50k_normal.json", type='rgbn'), - ], - imu=[ - 
dict(path="datasets/InstructionTuning/imu/imu_fixed_50k.json", type='imu'), - ], - fmri=[ - dict(path="datasets/InstructionTuning/fmri/fmri_fixed.json", type='fmri'), - ], -) -IMU_PATH = "/mnt/petrelfs/share_data/hanjiaming/ego4d/v2/processed_imu/" - - -class FinetuneDialogDataset(Dataset): - def __init__(self, dataset=['image'], transform=T_random_resized_crop, max_words=2048, image_words=30, tokenizer_path=None): - if isinstance(dataset, str): - dataset = [dataset] - - self.dataset = dataset - - group_ann = {} - for d in dataset: - for meta in DATASETS[d]: - meta_path, meta_type = meta['path'], meta['type'] - meta_ext = os.path.splitext(meta_path)[-1] - if meta_ext == ".json": - with open(meta_path) as f: - meta_l = json.load(f) - # add data_type - # this is a temp solution - new_meta_l = [] - for l in meta_l: - l['data_type'] = meta_type - new_meta_l.append(l) - meta_l = new_meta_l - elif meta_ext == ".jsonl": - meta_l = [] - with open(meta_path) as f: - for i, line in enumerate(f): - try: - meta_l.append(json.loads(line)) - except json.decoder.JSONDecodeError as e: - print( - f"Error decoding the following jsonl line ({i}):\n{line.rstrip()}", force=True) - raise e - else: - raise NotImplementedError( - f"Unknown meta file extension: \"{meta_ext}\". " - f"Currently, .json, .jsonl are supported. " - "If you are using a supported format, please set the file extension so that the proper parsing " - "routine can be called." - ) - if meta_type not in group_ann: - group_ann[meta_type] = [] - print(f"{meta_path}, type {meta_type}: len {len(meta_l)}") - group_ann[meta_type] += meta_l - - # sort group_ann for higher efficiency (items in one global batch with similar length) - for meta_type, meta_l in group_ann.items(): - meta_l.sort(key=lambda data_item: sum( - [len(_['value']) for _ in data_item['conversations']])) - - self.group_ann = group_ann - self.ann = sum(list(self.group_ann.values()), start=[]) - - self.group_indices = {} - start_pos = 0 - for meta_type, meta_l in self.group_ann.items(): - self.group_indices[meta_type] = list( - range(start_pos, start_pos + len(meta_l))) - start_pos = start_pos + len(meta_l) - - print(f"total length: {len(self)}") - self.transform = transform - print(f"transform:\n{self.transform}") - self.max_words = max_words - self.image_words = image_words - self.tokenizer = Tokenizer(model_path=tokenizer_path) - self.conversation_generator = ConversationGenerator(self.tokenizer) - - self.load_funcs = dict( - image=self.load_image, - audio=self.load_audio, - video=self.load_video, - point=self.load_point, - rgbd=self.load_rgbx, - rgbn=self.load_rgbx, - imu=self.load_imu, - fmri=self.load_fmri - ) - - def __len__(self): - return len(self.ann) - - def load_image(self, data): - filename = data['image'] - image = Image.open(filename).convert('RGB') - image = self.transform(image) - return image - - def load_audio(self, data): - audio_path = data['image'] - fbank = make_audio_features(audio_path, mel_bins=128) - fbank = fbank.transpose(0, 1)[None] # [1, 128, 1024] - return fbank - - def load_video(self, data): - video_path = data['image'] - video_feats = video_utils.load_and_transform_video_data( - video_path, video_path, clip_duration=1, clips_per_video=5) - return video_feats[:, :, 0] - - def load_point(self, data): - point_path = data['image'] - point_feat = torch.load(point_path, map_location='cpu') - point_feat = point_feat.transpose(0, 1) - return point_feat - - def load_rgbx(self, data): - image_path = data['image'] - x_image_path = data['depth_image'] if 
'depth_image' in data else data['normal_image'] - image = Image.open(image_path).convert('RGB') - x_image = Image.open(x_image_path).convert('RGB') - x_image = x_image.resize(image.size[-2:]) - - image, x_image = transform_pairimg_train([image, x_image]) - # [2, 3, H, W] - image = torch.stack([image, x_image], dim=0) - return image - - def load_fmri(self, data): - fmri_path = data['image'] - data = np.load(fmri_path) - data = data.mean(axis=0) - data = torch.tensor(data[None]) - return data - - def load_imu(self, data_dict): - uid = data_dict["video_uid"] - w_s = data_dict["window_start"] - w_e = data_dict["window_end"] - - imu_data = get_imu_frames( - IMU_PATH, uid, - video_start_sec=w_s, - video_end_sec=w_e, - ) - if imu_data is None: - raise ValueError - return imu_data['signal'] - - def __getitem__(self, index, expect_type=None): - if expect_type is None: - data_item = self.ann[index] - else: - # in case we want get data from specific data_type - data_item = self.group_ann[expect_type][index] - - data_type = data_item['data_type'] - if data_type != 'text': - if data_type in self.load_funcs: - try: - image = self.load_funcs[data_type](data_item) - if image == None: - raise ValueError('Data is None') - except: - print('Error', data_item) - rand_idx = random.randint( - 0, len(self.group_ann[data_type])) - return self.__getitem__(rand_idx, expect_type=data_type) - else: - raise ValueError(f'Does not support {data_type}') - else: - image = None - # warnings.warn("pure black image for examples without image") - # image = torch.zeros(3, 224, 224) - - source = data_item["conversations"] - conversation, to_predict_values = self.conversation_generator.add_speaker_and_signal( - source) - if len(to_predict_values) == 0: - warnings.warn( - f"see dialog data with nothing to predict, data: {data_item}") - return self[index-1] - - tokenzed_conversation = self.tokenizer.encode( - conversation, bos=True, eos=True) - labels = [IGNORE_INDEX for _ in tokenzed_conversation] - - check_pos = 0 - for value in to_predict_values: - tokenized_value = self.tokenizer.encode( - value, bos=False, eos=False) - value_pos = find_sublist( - tokenzed_conversation[check_pos:], tokenized_value) + check_pos - if value_pos == -1: - print( - "a sentence mismatches the corresponding piece in the conversation") - return self[index-1] - labels[value_pos:value_pos+len(tokenized_value)] = tokenized_value - assert labels[value_pos:value_pos+len( - tokenized_value)] == tokenzed_conversation[value_pos:value_pos+len(tokenized_value)] - check_pos = value_pos+len(tokenized_value) - - input2 = torch.tensor(tokenzed_conversation, dtype=torch.int64) - labels = torch.tensor(labels, dtype=torch.int64) - - if image is not None: - max_words = self.max_words - self.image_words - else: - max_words = self.max_words - padding = max_words - input2.shape[0] - if padding > 0: - input2 = torch.cat( - (input2, torch.zeros(padding, dtype=torch.int64) - 1)) - labels = torch.cat( - (labels, torch.zeros(padding, dtype=torch.int64) - 1)) - elif padding < 0: - input2 = input2[:max_words] - labels = labels[:max_words] - - input2_mask = input2.ge(0) - label_mask = labels.ge(0) - input2[~input2_mask] = 0 - labels[~label_mask] = 0 - input2_mask = input2_mask.float() - label_mask = label_mask.float() - if image is None: - return input2, labels, data_item['data_type'] - else: - return input2, labels, image, data_item['data_type'] - - def groups(self): - return list(self.group_indices.values()) - - -def find_sublist(a: list, b: list): - len_a, len_b = len(a), len(b) 
- for i in range(len_a - len_b + 1): - if a[i:i+len_b] == b: - return i - return -1 diff --git a/data/imu_utils.py b/data/imu_utils.py deleted file mode 100644 index 010563d67e603bd7ca5589672058380a79ee93d9..0000000000000000000000000000000000000000 --- a/data/imu_utils.py +++ /dev/null @@ -1,257 +0,0 @@ -import string -import numpy as np -import matplotlib.animation as animation -from matplotlib import pyplot as plt -import json -from collections import defaultdict -from bisect import bisect_left -import os -import torch -import torchaudio -torchaudio.set_audio_backend("sox_io") - - -def load_json(json_path: str): - """ - Load a json file - """ - with open(json_path, "r", encoding="utf-8") as f_name: - data = json.load(f_name) - return data - - -def check_window_signal(info_t, w_s, w_e): - length = w_e - w_s - frame_offset = int(w_s * info_t.sample_rate) - num_frames = int(length * info_t.sample_rate) - if frame_offset + num_frames > int(info_t.num_frames): - return False - else: - return True - - -def index_narrations(ann_path): - narration_raw = load_json(ann_path) - - narration_dict = defaultdict(list) - summary_dict = defaultdict(list) - avg_len = [] - for v_id, narr in narration_raw.items(): - narr_list = [] - summ_list = [] - if "narration_pass_1" in narr: - narr_list += narr["narration_pass_1"]["narrations"] - summ_list += narr["narration_pass_1"]["summaries"] - if "narration_pass_2" in narr: - narr_list += narr["narration_pass_2"]["narrations"] - summ_list += narr["narration_pass_2"]["summaries"] - - if len(narr_list) > 0: - narration_dict[v_id] = [ - ( - float(n_t["timestamp_sec"]), - n_t["narration_text"], - n_t["annotation_uid"], - n_t["timestamp_frame"], - ) - for n_t in narr_list - ] - avg_len.append(len(narration_dict[v_id])) - else: - narration_dict[v_id] = [] - if len(summ_list) > 0: - summary_dict[v_id] = [ - ( - float(s_t["start_sec"]), - float(s_t["end_sec"]), - s_t["summary_text"], - ) - for s_t in summ_list - ] - else: - summary_dict[v_id] = [] - # print(f"Number of Videos with narration {len(narration_dict)}") - # print(f"Avg. 
narration length {np.mean(avg_len)}") - # print(f"Number of Videos with summaries {len(summary_dict)}") - return narration_dict, summary_dict - - -def get_signal_info(signal_fn: str): - return torchaudio.info(signal_fn) - - -def get_signal_frames(signal_fn: str, video_start_sec: float, video_end_sec: float): - """ - Given a signal track return the frames between video_start_sec and video_end_sec - """ - info_t = get_signal_info(signal_fn) - - length = video_end_sec - video_start_sec - aframes, _ = torchaudio.load( - signal_fn, - normalize=True, - frame_offset=int(video_start_sec * info_t.sample_rate), - num_frames=int(length * info_t.sample_rate), - ) - return {"signal": aframes, "meta": info_t} - - -def tosec(value): - return value / 1000 - - -def toms(value): - return value * 1000 - - -def delta(first_num: float, second_num: float): - """Compute the absolute value of the difference of two numbers""" - return abs(first_num - second_num) - - -def padIMU(signal, duration_sec): - """ - Pad the signal if necessary - """ - expected_elements = round(duration_sec) * 200 - - if signal.shape[0] > expected_elements: - signal = signal[:expected_elements, :] - elif signal.shape[0] < expected_elements: - padding = expected_elements - signal.shape[0] - padded_zeros = np.zeros((padding, 6)) - signal = np.concatenate([signal, padded_zeros], 0) - # signal = signal[:expected_elements, :] - return signal - - -def resample( - signals: np.ndarray, - timestamps: np.ndarray, - original_sample_rate: int, - resample_rate: int, -): - """ - Resamples data to new sample rate - """ - signals = torch.as_tensor(signals) - timestamps = torch.from_numpy(timestamps).unsqueeze(-1) - signals = torchaudio.functional.resample( - waveform=signals.data.T, - orig_freq=original_sample_rate, - new_freq=resample_rate, - ).T.numpy() - - nsamples = len(signals) - - period = 1 / resample_rate - - # timestamps are expected to be shape (N, 1) - initital_seconds = timestamps[0] / 1e3 - - ntimes = (torch.arange(nsamples) * period).view(-1, 1) + initital_seconds - - timestamps = (ntimes * 1e3).squeeze().numpy() - return signals, timestamps - - -def resampleIMU(signal, timestamps): - sampling_rate = int(1000 * (1 / (np.mean(np.diff(timestamps))))) - # resample all to 200hz - if sampling_rate != 200: - signal, timestamps = resample(signal, timestamps, sampling_rate, 200) - return signal, timestamps - - -def get_imu_frames( - imu_path, - uid: str, - video_start_sec: float, - video_end_sec: float, -): - """ - Given a IMU signal return the frames between video_start_sec and video_end_sec - """ - signal = np.load(os.path.join(imu_path, f"{uid}.npy")) - signal = signal.transpose() - timestamps = np.load(os.path.join(imu_path, f"{uid}_timestamps.npy")) - - if toms(video_start_sec) > timestamps[-1] or toms(video_end_sec) > timestamps[-1]: - return None - - start_id = bisect_left(timestamps, toms(video_start_sec)) - end_id = bisect_left(timestamps, toms(video_end_sec)) - - # make sure the retrieved window interval are correct by a max of 1 sec margin - if ( - delta(video_start_sec, tosec(timestamps[start_id])) > 4 - or delta(video_end_sec, tosec(timestamps[end_id])) > 4 - ): - return None - - # get the window - if start_id == end_id: - start_id -= 1 - end_id += 1 - signal, timestamps = signal[start_id:end_id], timestamps[start_id:end_id] - - if len(signal) < 10 or len(timestamps) < 10: - return None - # resample the signal at 200hz if necessary - signal, timestamps = resampleIMU(signal, timestamps) - - # pad the signal if necessary - signal = 
padIMU(signal, video_end_sec - video_start_sec) - - sample_dict = { - "timestamp": timestamps, - "signal": torch.tensor(signal.T), - "sampling_rate": 200, - } - - return sample_dict - - -def display_animation(frames, title, save_path_gif): - fig, ax = plt.subplots() - frames = [[ax.imshow(frames[i])] for i in range(len(frames))] - plt.title(title) - ani = animation.ArtistAnimation(fig, frames) - ani.save(save_path_gif, writer="imagemagick") - plt.close() - - -def display_animation_imu(frames, imu, title, save_path_gif): - fig, (ax1, ax2, ax3) = plt.subplots(3, 1) - ax1.set_title(title) - ax2.set_title("Acc.") - ax3.set_title("Gyro.") - frames = [[ax1.imshow(frames[i])] for i in range(len(frames))] - ani = animation.ArtistAnimation(fig, frames) - - ax2.plot(imu[0].cpu().numpy(), color="red") - ax2.plot(imu[1].cpu().numpy(), color="blue") - ax2.plot(imu[2].cpu().numpy(), color="green") - ax3.plot(imu[3].cpu().numpy(), color="red") - ax3.plot(imu[4].cpu().numpy(), color="blue") - ax3.plot(imu[5].cpu().numpy(), color="green") - plt.tight_layout() - ani.save(save_path_gif, writer="imagemagick") - plt.close() - - -def filter_narration(narration_text: str) -> bool: - if "#c" in narration_text.lower(): - return True - return False - - -def clean_narration_text(narration_text: str) -> str: - return ( - narration_text.replace("#C C ", "") - .replace("#C", "") - .replace("#unsure", "something") - .strip() - .strip(string.punctuation) - .lower()[:128] - ) diff --git a/data/video_utils.py b/data/video_utils.py deleted file mode 100644 index 43ac03067e50c8570d422a057b9d9efb18e8775b..0000000000000000000000000000000000000000 --- a/data/video_utils.py +++ /dev/null @@ -1,204 +0,0 @@ -import math -import torch -import torch.nn as nn -from pytorchvideo import transforms as pv_transforms -from pytorchvideo.data.clip_sampling import ConstantClipsPerVideoSampler -from pytorchvideo.data.encoded_video import EncodedVideo -from pytorchvideo.data.encoded_video_decord import EncodedVideoDecord -from torchvision import transforms -from torchvision.transforms._transforms_video import NormalizeVideo - - -def get_clip_timepoints(clip_sampler, duration): - # Read out all clips in this video - all_clips_timepoints = [] - is_last_clip = False - end = 0.0 - while not is_last_clip: - start, end, _, _, is_last_clip = clip_sampler(end, duration, annotation=None) - all_clips_timepoints.append((start, end)) - return all_clips_timepoints - - - -def crop_boxes(boxes, x_offset, y_offset): - """ - Perform crop on the bounding boxes given the offsets. - Args: - boxes (ndarray or None): bounding boxes to perform crop. The dimension - is `num boxes` x 4. - x_offset (int): cropping offset in the x axis. - y_offset (int): cropping offset in the y axis. - Returns: - cropped_boxes (ndarray or None): the cropped boxes with dimension of - `num boxes` x 4. - """ - cropped_boxes = boxes.copy() - cropped_boxes[:, [0, 2]] = boxes[:, [0, 2]] - x_offset - cropped_boxes[:, [1, 3]] = boxes[:, [1, 3]] - y_offset - - return cropped_boxes - - -def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None): - """ - Perform uniform spatial sampling on the images and corresponding boxes. - Args: - images (tensor): images to perform uniform crop. The dimension is - `num frames` x `channel` x `height` x `width`. - size (int): size of height and weight to crop the images. - spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width - is larger than height. Or 0, 1, or 2 for top, center, and bottom - crop if height is larger than width. 
- boxes (ndarray or None): optional. Corresponding boxes to images. - Dimension is `num boxes` x 4. - scale_size (int): optinal. If not None, resize the images to scale_size before - performing any crop. - Returns: - cropped (tensor): images with dimension of - `num frames` x `channel` x `size` x `size`. - cropped_boxes (ndarray or None): the cropped boxes with dimension of - `num boxes` x 4. - """ - assert spatial_idx in [0, 1, 2] - ndim = len(images.shape) - if ndim == 3: - images = images.unsqueeze(0) - height = images.shape[2] - width = images.shape[3] - - if scale_size is not None: - if width <= height: - width, height = scale_size, int(height / width * scale_size) - else: - width, height = int(width / height * scale_size), scale_size - images = torch.nn.functional.interpolate( - images, - size=(height, width), - mode="bilinear", - align_corners=False, - ) - - y_offset = int(math.ceil((height - size) / 2)) - x_offset = int(math.ceil((width - size) / 2)) - - if height > width: - if spatial_idx == 0: - y_offset = 0 - elif spatial_idx == 2: - y_offset = height - size - else: - if spatial_idx == 0: - x_offset = 0 - elif spatial_idx == 2: - x_offset = width - size - cropped = images[:, :, y_offset : y_offset + size, x_offset : x_offset + size] - cropped_boxes = crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None - if ndim == 3: - cropped = cropped.squeeze(0) - return cropped, cropped_boxes - - -class SpatialCrop(nn.Module): - """ - Convert the video into 3 smaller clips spatially. Must be used after the - temporal crops to get spatial crops, and should be used with - -2 in the spatial crop at the slowfast augmentation stage (so full - frames are passed in here). Will return a larger list with the - 3x spatial crops as well. - """ - - def __init__(self, crop_size: int = 224, num_crops: int = 3): - super().__init__() - self.crop_size = crop_size - if num_crops == 3: - self.crops_to_ext = [0, 1, 2] - self.flipped_crops_to_ext = [] - elif num_crops == 1: - self.crops_to_ext = [1] - self.flipped_crops_to_ext = [] - else: - raise NotImplementedError("Nothing else supported yet") - - def forward(self, videos): - """ - Args: - videos: A list of C, T, H, W videos. - Returns: - videos: A list with 3x the number of elements. Each video converted - to C, T, H', W' by spatial cropping. 
- """ - assert isinstance(videos, list), "Must be a list of videos after temporal crops" - assert all([video.ndim == 4 for video in videos]), "Must be (C,T,H,W)" - res = [] - for video in videos: - for spatial_idx in self.crops_to_ext: - res.append(uniform_crop(video, self.crop_size, spatial_idx)[0]) - if not self.flipped_crops_to_ext: - continue - flipped_video = transforms.functional.hflip(video) - for spatial_idx in self.flipped_crops_to_ext: - res.append(uniform_crop(flipped_video, self.crop_size, spatial_idx)[0]) - return res - - -def load_and_transform_video_data( - video_file, - video_path, - clip_duration=2, - clips_per_video=5, - sample_rate=16000, - with_audio=False -): - video_transform = transforms.Compose( - [ - pv_transforms.ShortSideScale(224), - NormalizeVideo( - mean=(0.48145466, 0.4578275, 0.40821073), - std=(0.26862954, 0.26130258, 0.27577711), - ), - ] - ) - - clip_sampler = ConstantClipsPerVideoSampler( - clip_duration=clip_duration, clips_per_video=clips_per_video - ) - frame_sampler = pv_transforms.UniformTemporalSubsample(num_samples=clip_duration) - - if isinstance(video_file, str): - video = EncodedVideo.from_path( - video_file, - decoder="decord", - decode_audio=with_audio, - # **{"sample_rate": sample_rate}, - ) - else: - video = EncodedVideoDecord(video_file, video_name=video_path, decode_video=True, decode_audio=with_audio, sample_rate=sample_rate) - - all_clips_timepoints = get_clip_timepoints(clip_sampler, video.duration) - - all_video = [] - for clip_timepoints in all_clips_timepoints: - # Read the clip, get frames - clip = video.get_clip(clip_timepoints[0], clip_timepoints[1]) - if clip is None: - raise ValueError("No clip found") - video_clip = frame_sampler(clip["video"]) - video_clip = video_clip / 255.0 # since this is float, need 0-1 - - all_video.append(video_clip) - - all_video = [video_transform(clip) for clip in all_video] - all_video = SpatialCrop(224, num_crops=3)(all_video) - - all_video = torch.stack(all_video, dim=0) - - if not with_audio: - return all_video - else: - return all_video, clip['audio'] - -if __name__ == '__main__': - video_path = "datasets/InstructionTuning/video/music_aqa/MUSIC-AVQA-videos-Real/00000002.mp4" - video, audio = load_and_transform_video_data(video_path, video_path, clip_duration=1, clips_per_video=5, with_audio=True) - import pdb;pdb.set_trace() \ No newline at end of file diff --git a/demos/multi_turn_mm.py b/demos/multi_turn_mm.py deleted file mode 100644 index 6f354e6c68d0a09df50c87a1a53f110a4fe7321a..0000000000000000000000000000000000000000 --- a/demos/multi_turn_mm.py +++ /dev/null @@ -1,300 +0,0 @@ -import sys -import os -sys.path.append(os.path.abspath(__file__).rsplit('/', 2)[0]) - -import argparse -import multiprocessing as mp -import numpy as np -from typing import List, Optional - -import torch -import torch.distributed as dist - -from fairscale.nn.model_parallel import initialize as fs_init - -import gradio as gr -from util.misc import setup_for_distributed -from util.misc import default_tensor_type -from model.meta import MetaModel -from data.conversation_lib import conv_templates, SeparatorStyle -from PIL import Image -import torchvision.transforms as transforms -from data.fintune_dataset import make_audio_features -from data import video_utils - - -T_random_resized_crop = transforms.Compose([ - transforms.RandomResizedCrop(size=(224, 224), scale=(0.9, 1.0), ratio=(0.75, 1.3333), interpolation=3, - antialias=None), # 3 is bicubic - transforms.ToTensor(), - transforms.Normalize(mean=[0.48145466, 
0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])]) - - -def load_audio(audio_path): - fbank = make_audio_features(audio_path, mel_bins=128) - fbank = fbank.transpose(0, 1)[None] #[1, 128, 1024] - return fbank - -def load_video(video_path): - video_feats = video_utils.load_and_transform_video_data(video_path, video_path, clip_duration=1, clips_per_video=5) - return video_feats[:, :, 0] - - -def model_worker( - rank: int, args: argparse.Namespace, barrier: mp.Barrier, - request_queue: mp.Queue, response_queue: Optional[mp.Queue] = None, -) -> None: - """ - The worker function that manipulates the GPU to run the inference. - Exact n_gpu workers are started, with each one operating on a separate GPU. - - Args: - rank (int): Distributed rank of the worker. - args (argparse.Namespace): All command line arguments. - barrier (multiprocessing.Barrier): A barrier used to delay the start - of Web UI to be after the start of the model. - """ - - world_size = len(args.gpu_ids) - gpu_id = args.gpu_ids[rank] - dist.init_process_group( - backend="nccl", rank=rank, world_size=world_size, - init_method=f"tcp://{args.master_addr}:{args.master_port}", - ) - print(f"| distributed init on worker {rank}/{world_size}. " - f"using gpu: {gpu_id}") - fs_init.initialize_model_parallel(world_size) - torch.cuda.set_device(gpu_id) - - torch.manual_seed(1) - np.random.seed(1) - - # set the print behavior. - setup_for_distributed(rank == 0) - - target_dtype = { - "bf16": torch.bfloat16, - "fp16": torch.float16 - }[args.dtype] - with default_tensor_type(dtype=target_dtype, device="cuda"): - model = MetaModel(args.llama_type, args.llama_config, tokenizer_path=args.tokenizer_path) - print("Loading pretrained weights ...") - checkpoint = torch.load(args.pretrained_path, map_location='cpu') - msg = model.load_state_dict(checkpoint, strict=False) - print("load result:\n", msg) - model.cuda() - model.eval() - print(f"Model = {str(model)}") - - barrier.wait() - - while True: - img_path, audio_path, video_path, chatbot, max_gen_len, temperature, top_p, modality = request_queue.get() - if 'image' in modality and img_path is not None: - image = Image.open(img_path).convert('RGB') - inputs = T_random_resized_crop(image) - elif 'video' in modality and video_path is not None: - inputs = load_video(video_path) - elif 'audio' in modality and audio_path is not None: - inputs = load_audio(audio_path) - else: - inputs = None - - if inputs is not None: - inputs = inputs[None].cuda().to(target_dtype) - - conv = conv_templates["v1"].copy() - for user, bot in chatbot: - conv.append_message(conv.roles[0], user) - conv.append_message(conv.roles[1], bot) - - with torch.cuda.amp.autocast(dtype=target_dtype): - print(conv.get_prompt()) - for stream_response in model.stream_generate( - conv.get_prompt(), inputs, - max_gen_len=max_gen_len, temperature=temperature, top_p=top_p, - modal = modality - ): - conv_sep = ( - conv.sep - if conv.sep_style == SeparatorStyle.SINGLE - else conv.sep2 - ) - end_pos = stream_response["text"].find(conv_sep) - if end_pos != -1: - stream_response["text"] = ( - stream_response['text'][:end_pos].rstrip() + "\n" - ) - stream_response["end_of_content"] = True - - # keep a few characters if not end_of_content to avoid sending - # part of conv_sep before all of it is generated. 
- if not stream_response["end_of_content"]: - if len(stream_response["text"]) < len(conv_sep): - continue - stream_response["text"] = ( - stream_response["text"][:-len(conv_sep)] - ) - - if response_queue is not None: - response_queue.put(stream_response) - - if stream_response["end_of_content"]: - break - - -def gradio_worker( - request_queues: List[mp.Queue], response_queue: mp.Queue, - args: argparse.Namespace, barrier: mp.Barrier, -) -> None: - """ - The gradio worker is responsible for displaying the WebUI and relay the - requests to model workers. It should be launched only once. - - Args: - request_queues (List[mp.Queue]): A list of request queues (one for - each model worker). - args (argparse.Namespace): All command line arguments. - barrier (multiprocessing.Barrier): A barrier used to delay the start - of Web UI to be after the start of the model. - """ - - def show_user_input(msg, chatbot): - return "", chatbot + [[msg, None]] - - def stream_model_output(img_path, audio_path, video_path, chatbot, max_gen_len, gen_t, top_p, modality): - for queue in request_queues: - queue.put((img_path, audio_path, video_path, chatbot, max_gen_len, gen_t, top_p, modality)) - while True: - content_piece = response_queue.get() - chatbot[-1][1] = content_piece["text"] - yield chatbot - if content_piece["end_of_content"]: - break - - def undo(chatbot): - if len(chatbot) > 0: - chatbot = chatbot[:-1] - return chatbot - - def clear(): - chatbot = [] - msg = "" - return chatbot, msg - - CSS =""" - .contain { display: flex; flex-direction: column; } - #component-0 { height: 100%; } - #chatbot { flex-grow: 1; overflow: auto;} - """ - with gr.Blocks(css=CSS) as demo: - gr.Markdown("## OneLLM: One Framework to Align All Modalities with Language") - with gr.Row(equal_height=True): - with gr.Column(scale=1): - img_path = gr.Image(label='Image Input', type='filepath') - video_path = gr.Video(label='Video Input') - audio_path = gr.Audio(label='Audio Input', type='filepath', sources=['upload']) - modality = gr.Radio(choices=['image', 'audio', 'video'], value='image', interactive=True, label='Input Modalities') - - with gr.Column(scale=2): - chatbot = gr.Chatbot(elem_id="chatbot") - msg = gr.Textbox() - - with gr.Row(): - submit_button = gr.Button("Submit", variant="primary") - undo_button = gr.Button("Undo") - clear_button = gr.ClearButton([chatbot, msg, img_path, audio_path, video_path, modality]) - with gr.Row(): - max_gen_len = gr.Slider( - minimum=1, maximum=args.model_max_seq_len // 2, - value=args.model_max_seq_len // 2, interactive=True, - label="Single-turn max response length", - ) - gen_t = gr.Slider( - minimum=0, maximum=1, value=0.1, interactive=True, - label="Temperature", - ) - top_p = gr.Slider( - minimum=0, maximum=1, value=0.75, interactive=True, - label="Top-p", - ) - msg.submit( - show_user_input, [msg, chatbot], [msg, chatbot], - ).then( - stream_model_output, [img_path, audio_path, video_path, chatbot, max_gen_len, gen_t, top_p, modality], chatbot, - ) - submit_button.click( - show_user_input, [msg, chatbot], [msg, chatbot], - ).then( - stream_model_output, [img_path, audio_path, video_path, chatbot, max_gen_len, gen_t, top_p, modality], chatbot, - ) - undo_button.click(undo, chatbot, chatbot) - # img_path.change(clear, [], [chatbot, msg]) - barrier.wait() - demo.queue(api_open=True).launch(share=True, max_threads=1) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser("Chat Demo") - group = parser.add_mutually_exclusive_group() - group.add_argument( - "--gpu_ids", type=int, 
nargs="+", - help="A list of space-separated gpu ids to run the model on. " - "The model will span across GPUs in tensor-parallel mode." - ) - parser.add_argument( - "--tokenizer_path", type=str, - help="Path to the tokenizer.model file provided along with the LLaMA " - "model." - ) - parser.add_argument( - "--llama_type", default="onellm", type=str, metavar="MODEL", - help="LLaMA model type." - ) - parser.add_argument( - "--llama_config", type=str, required=True, - help="Path to the llama model config json." - ) - parser.add_argument( - "--model_max_seq_len", type=int, default=2048, - help="Max sequence length accepted by the pretrained model." - ) - parser.add_argument( - "--pretrained_path", type=str, required=True, - help="Path to the llama model checkpoints. A list of checkpoints is " - "supported and will be merged from left to right.") - parser.add_argument( - "--master_port", type=int, default=23862, - help="A port used by the PyTorch distributed module to initialize." - ) - parser.add_argument( - "--master_addr", type=str, default="127.0.0.1", - help="An address used by the PyTorch distributed module to initialize." - ) - parser.add_argument( - "--dtype", type=str, choices=["fp16", "bf16"], default="fp16", - help="The dtype used for model weights and inference." - ) - args = parser.parse_args() - - # using the default "fork" method messes up some imported libs (e.g., - # pandas) - mp.set_start_method("spawn") - - # setup the queues and start the model workers - request_queues = [] - response_queue = mp.Queue() - worker_processes = [] - barrier = mp.Barrier(len(args.gpu_ids) + 1) - for rank, gpu_id in enumerate(args.gpu_ids): - request_queue = mp.Queue() - rank_response_queue = response_queue if rank == 0 else None - process = mp.Process( - target=model_worker, - args=(rank, args, barrier, request_queue, rank_response_queue), - ) - process.start() - worker_processes.append(process) - request_queues.append(request_queue) - - gradio_worker(request_queues, response_queue, args, barrier) diff --git a/examples/bell_ring.wav b/examples/bell_ring.wav deleted file mode 100644 index b4b7b26431c6b3e4a2266758702a5bcc95ab697f..0000000000000000000000000000000000000000 --- a/examples/bell_ring.wav +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:415b5406175cafa874396f89811121fb306c17084db8ec20753ab5666b4fdcca -size 3630044 diff --git a/examples/bird_audio.wav b/examples/bird_audio.wav deleted file mode 100644 index a98fc72b0df440fd10b3e54c87dfe0ffae0fa12e..0000000000000000000000000000000000000000 Binary files a/examples/bird_audio.wav and /dev/null differ diff --git a/examples/depth_normal/depth/0084.png b/examples/depth_normal/depth/0084.png deleted file mode 100644 index d88b2345463e7bb7dc56fc58810b30ba20e0dd28..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth/0084.png and /dev/null differ diff --git a/examples/depth_normal/depth/0131.png b/examples/depth_normal/depth/0131.png deleted file mode 100644 index a47a267d6e213e2b8d325d850e4df826379964ed..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth/0131.png and /dev/null differ diff --git a/examples/depth_normal/depth/0297.png b/examples/depth_normal/depth/0297.png deleted file mode 100644 index 8660e2ea0efa481a6fa788d5a9f06c0692de4490..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth/0297.png and /dev/null differ diff --git a/examples/depth_normal/depth/0331.png b/examples/depth_normal/depth/0331.png 
deleted file mode 100644 index 2a01e160c6d1c72807e130ccebf9affe37094b1d..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth/0331.png and /dev/null differ diff --git a/examples/depth_normal/depth/0432.png b/examples/depth_normal/depth/0432.png deleted file mode 100644 index a7e2f96f53d7959ee437e0eb903974ec12fe49af..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth/0432.png and /dev/null differ diff --git a/examples/depth_normal/depth/0633.png b/examples/depth_normal/depth/0633.png deleted file mode 100644 index ea82c9ad56635dc7ed4710adfa3384f63212e503..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth/0633.png and /dev/null differ diff --git a/examples/depth_normal/depth/0663.png b/examples/depth_normal/depth/0663.png deleted file mode 100644 index 86b26a0fd891dc6c424da5924b8543b37780b4d2..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth/0663.png and /dev/null differ diff --git a/examples/depth_normal/depth/0771.png b/examples/depth_normal/depth/0771.png deleted file mode 100644 index 3ff506e7d528df55a5668a22acfe5dcf337be5da..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth/0771.png and /dev/null differ diff --git a/examples/depth_normal/depth/0782.png b/examples/depth_normal/depth/0782.png deleted file mode 100644 index 88c348cf1155d310d97a57d98d76dc322f6d6fc2..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth/0782.png and /dev/null differ diff --git a/examples/depth_normal/depth/1001.png b/examples/depth_normal/depth/1001.png deleted file mode 100644 index 316fa21f5667d0ece933cede761e2a31b1a6cc1f..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth/1001.png and /dev/null differ diff --git a/examples/depth_normal/depth/1051.png b/examples/depth_normal/depth/1051.png deleted file mode 100644 index 796b10bdf87b7b63afdc730a2650799bf539a28a..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth/1051.png and /dev/null differ diff --git a/examples/depth_normal/depth/1129.png b/examples/depth_normal/depth/1129.png deleted file mode 100644 index 605757f6f360aa27f67de05f4c5a3101ab4efa84..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth/1129.png and /dev/null differ diff --git a/examples/depth_normal/depth/1205.png b/examples/depth_normal/depth/1205.png deleted file mode 100644 index 73fed9e0e02c30833749f536f1426952aa0173ab..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth/1205.png and /dev/null differ diff --git a/examples/depth_normal/depth/1336.png b/examples/depth_normal/depth/1336.png deleted file mode 100644 index b3e6af9faa0a7e3fe0c1e74f2d68ec2945a2fbe9..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth/1336.png and /dev/null differ diff --git a/examples/depth_normal/depth/1383.png b/examples/depth_normal/depth/1383.png deleted file mode 100644 index 8014f219637e0c0a27443bd7adce0a79fc797b97..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth/1383.png and /dev/null differ diff --git a/examples/depth_normal/depth/1386.png b/examples/depth_normal/depth/1386.png deleted file mode 100644 index 5b79992450c741744080d7797b84d92b17fe4d76..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth/1386.png and /dev/null differ diff --git a/examples/depth_normal/depth/1393.png 
b/examples/depth_normal/depth/1393.png deleted file mode 100644 index 8d46e64bbe737e5e270b25c1e8dc6506cb49a5d0..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth/1393.png and /dev/null differ diff --git a/examples/depth_normal/depth/1447.png b/examples/depth_normal/depth/1447.png deleted file mode 100644 index 8aee58fc6f6f67a1968642048972f735f2b986cb..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth/1447.png and /dev/null differ diff --git a/examples/depth_normal/depth_scaled/0084.png b/examples/depth_normal/depth_scaled/0084.png deleted file mode 100644 index 0078f30f722bfad08878882fe942bef002cf5706..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth_scaled/0084.png and /dev/null differ diff --git a/examples/depth_normal/depth_scaled/0131.png b/examples/depth_normal/depth_scaled/0131.png deleted file mode 100644 index 557b87cddf9b265913879def70f022140ecb0a7d..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth_scaled/0131.png and /dev/null differ diff --git a/examples/depth_normal/depth_scaled/0297.png b/examples/depth_normal/depth_scaled/0297.png deleted file mode 100644 index f2639189ac534622b99992473c85f9aedab99e70..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth_scaled/0297.png and /dev/null differ diff --git a/examples/depth_normal/depth_scaled/0331.png b/examples/depth_normal/depth_scaled/0331.png deleted file mode 100644 index 6349276a82a7f75a2b513a43054d5ce002c588fc..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth_scaled/0331.png and /dev/null differ diff --git a/examples/depth_normal/depth_scaled/0432.png b/examples/depth_normal/depth_scaled/0432.png deleted file mode 100644 index 0c0143ff41024d287e60248cf4e74db992563e2d..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth_scaled/0432.png and /dev/null differ diff --git a/examples/depth_normal/depth_scaled/0633.png b/examples/depth_normal/depth_scaled/0633.png deleted file mode 100644 index a3718e391168acbd2832259751bb3e60cd449dd3..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth_scaled/0633.png and /dev/null differ diff --git a/examples/depth_normal/depth_scaled/0663.png b/examples/depth_normal/depth_scaled/0663.png deleted file mode 100644 index bdbaf21cd1dc8bcb030891669bfac003570e648a..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth_scaled/0663.png and /dev/null differ diff --git a/examples/depth_normal/depth_scaled/0771.png b/examples/depth_normal/depth_scaled/0771.png deleted file mode 100644 index 8ef66839c8713a68e7085560b0408b9404fb81e3..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth_scaled/0771.png and /dev/null differ diff --git a/examples/depth_normal/depth_scaled/0782.png b/examples/depth_normal/depth_scaled/0782.png deleted file mode 100644 index 6fcb9ce17fd711ca42898b643a9267d4a37f80a8..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth_scaled/0782.png and /dev/null differ diff --git a/examples/depth_normal/depth_scaled/1001.png b/examples/depth_normal/depth_scaled/1001.png deleted file mode 100644 index 9e86a9e6e3ccad1edb67dd056d8247f7389ff30e..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth_scaled/1001.png and /dev/null differ diff --git a/examples/depth_normal/depth_scaled/1051.png 
b/examples/depth_normal/depth_scaled/1051.png deleted file mode 100644 index f35ab717fc8d483c9e55903dfc29ff91b98025ae..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth_scaled/1051.png and /dev/null differ diff --git a/examples/depth_normal/depth_scaled/1129.png b/examples/depth_normal/depth_scaled/1129.png deleted file mode 100644 index 68bf3c0197a06bd593bdba6622ced92dd1683e5f..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth_scaled/1129.png and /dev/null differ diff --git a/examples/depth_normal/depth_scaled/1205.png b/examples/depth_normal/depth_scaled/1205.png deleted file mode 100644 index 2c51a94b28688112da5bdc7f95c02f8cf75d6b21..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth_scaled/1205.png and /dev/null differ diff --git a/examples/depth_normal/depth_scaled/1336.png b/examples/depth_normal/depth_scaled/1336.png deleted file mode 100644 index 2510620135749e715d247b064a9bd66b7692de13..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth_scaled/1336.png and /dev/null differ diff --git a/examples/depth_normal/depth_scaled/1383.png b/examples/depth_normal/depth_scaled/1383.png deleted file mode 100644 index 6e86bb6ded2552ae6ed4a61b75a5273a595bc385..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth_scaled/1383.png and /dev/null differ diff --git a/examples/depth_normal/depth_scaled/1386.png b/examples/depth_normal/depth_scaled/1386.png deleted file mode 100644 index e0db1f0a9d602679a56e44e677b8f2f205fb552c..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth_scaled/1386.png and /dev/null differ diff --git a/examples/depth_normal/depth_scaled/1393.png b/examples/depth_normal/depth_scaled/1393.png deleted file mode 100644 index 68d8c8ab31f88bfacb9dad9fa16cffcc3b690c57..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth_scaled/1393.png and /dev/null differ diff --git a/examples/depth_normal/depth_scaled/1447.png b/examples/depth_normal/depth_scaled/1447.png deleted file mode 100644 index fb365a1f1b59f85016e826b03fed4e3c31060a20..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/depth_scaled/1447.png and /dev/null differ diff --git a/examples/depth_normal/depth_scaled/scale_gray_image.py b/examples/depth_normal/depth_scaled/scale_gray_image.py deleted file mode 100644 index 70a284f733dedf30b3d984f01c2ef655a799e920..0000000000000000000000000000000000000000 --- a/examples/depth_normal/depth_scaled/scale_gray_image.py +++ /dev/null @@ -1,15 +0,0 @@ -import os -from PIL import Image -import glob -import numpy as np - -files = glob.glob('examples/depth_normal/depth/*') - -for file in files: - img = Image.open(file) - img = np.array(img) - img = (img - img.min()) / (img.max() - img.min()) * 255 - # img = img / img.max() * 255 - img = np.array(img, dtype=np.uint8) - img = Image.fromarray(img) - img.save(file.replace('depth_normal/depth', 'depth_normal/depth_scaled')) \ No newline at end of file diff --git a/examples/depth_normal/normal/0084.png b/examples/depth_normal/normal/0084.png deleted file mode 100644 index cd7b66ee73d17c5468a6792d2e59f21adaef64a3..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/normal/0084.png and /dev/null differ diff --git a/examples/depth_normal/normal/0131.png b/examples/depth_normal/normal/0131.png deleted file mode 100644 index 
93c1e14c63941a65e58333a12cefe4534bd452eb..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/normal/0131.png and /dev/null differ diff --git a/examples/depth_normal/normal/0297.png b/examples/depth_normal/normal/0297.png deleted file mode 100644 index b2600315b7bdcfc19ea4e44e442d20188b51755a..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/normal/0297.png and /dev/null differ diff --git a/examples/depth_normal/normal/0331.png b/examples/depth_normal/normal/0331.png deleted file mode 100644 index e580c91a4a7cda7572ff03b68402404607e87b2a..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/normal/0331.png and /dev/null differ diff --git a/examples/depth_normal/normal/0432.png b/examples/depth_normal/normal/0432.png deleted file mode 100644 index ddd82d55fa1f540b6669148a9a7a50e07f15b10d..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/normal/0432.png and /dev/null differ diff --git a/examples/depth_normal/normal/0633.png b/examples/depth_normal/normal/0633.png deleted file mode 100644 index 705949a9f77f4903223df0259b042faafeb5f283..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/normal/0633.png and /dev/null differ diff --git a/examples/depth_normal/normal/0663.png b/examples/depth_normal/normal/0663.png deleted file mode 100644 index ab435fbefbe4aeba36343e3946de864a56bd87d4..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/normal/0663.png and /dev/null differ diff --git a/examples/depth_normal/normal/0771.png b/examples/depth_normal/normal/0771.png deleted file mode 100644 index 8cd556e72ac632e228b5a339b8230ba6bff57515..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/normal/0771.png and /dev/null differ diff --git a/examples/depth_normal/normal/0782.png b/examples/depth_normal/normal/0782.png deleted file mode 100644 index ec3191a8f9e52843b41b1227014c5a41908312c1..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/normal/0782.png and /dev/null differ diff --git a/examples/depth_normal/normal/1001.png b/examples/depth_normal/normal/1001.png deleted file mode 100644 index b4b3f076eb774afe30686c06dafd6a778b96acce..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/normal/1001.png and /dev/null differ diff --git a/examples/depth_normal/normal/1051.png b/examples/depth_normal/normal/1051.png deleted file mode 100644 index 0dfe42491ce8d988943db5f536c289345fcb5a49..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/normal/1051.png and /dev/null differ diff --git a/examples/depth_normal/normal/1129.png b/examples/depth_normal/normal/1129.png deleted file mode 100644 index 540bc5a9418dd03f5b1b06da728ea8bbc62831c3..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/normal/1129.png and /dev/null differ diff --git a/examples/depth_normal/normal/1205.png b/examples/depth_normal/normal/1205.png deleted file mode 100644 index c386a47fb8ad77da99c712dafa9923d259e3a43d..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/normal/1205.png and /dev/null differ diff --git a/examples/depth_normal/normal/1336.png b/examples/depth_normal/normal/1336.png deleted file mode 100644 index 7fa2d85ac16269f908d8bc0e698e22f5d71cb625..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/normal/1336.png and /dev/null differ diff --git 
a/examples/depth_normal/normal/1383.png b/examples/depth_normal/normal/1383.png deleted file mode 100644 index 9f4943a9994fd45e0fe429aeb5714380d0f59d61..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/normal/1383.png and /dev/null differ diff --git a/examples/depth_normal/normal/1386.png b/examples/depth_normal/normal/1386.png deleted file mode 100644 index 192c0e48b933ff7cd901b64c9274e5b5a72220bf..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/normal/1386.png and /dev/null differ diff --git a/examples/depth_normal/normal/1393.png b/examples/depth_normal/normal/1393.png deleted file mode 100644 index 5d392f026386812570c7621c55e4799e5927d643..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/normal/1393.png and /dev/null differ diff --git a/examples/depth_normal/normal/1447.png b/examples/depth_normal/normal/1447.png deleted file mode 100644 index 3fd9069c2bd46b300e552162253a56f90da9cefb..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/normal/1447.png and /dev/null differ diff --git a/examples/depth_normal/rgb/0084.png b/examples/depth_normal/rgb/0084.png deleted file mode 100644 index 9df82f1ba383fd86af1a610fda4cde8e24fd7082..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/rgb/0084.png and /dev/null differ diff --git a/examples/depth_normal/rgb/0131.png b/examples/depth_normal/rgb/0131.png deleted file mode 100644 index 6c52c4a153d172c1cc9b035bea839526937fbff6..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/rgb/0131.png and /dev/null differ diff --git a/examples/depth_normal/rgb/0297.png b/examples/depth_normal/rgb/0297.png deleted file mode 100644 index db315980900bd1fc47eaee3669a0a953852b8aa8..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/rgb/0297.png and /dev/null differ diff --git a/examples/depth_normal/rgb/0331.png b/examples/depth_normal/rgb/0331.png deleted file mode 100644 index c2a241e9b980034d8b8c011f90b66106ff948677..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/rgb/0331.png and /dev/null differ diff --git a/examples/depth_normal/rgb/0432.png b/examples/depth_normal/rgb/0432.png deleted file mode 100644 index ac1acdfa00657394733f14def2b20f825fea5a54..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/rgb/0432.png and /dev/null differ diff --git a/examples/depth_normal/rgb/0633.png b/examples/depth_normal/rgb/0633.png deleted file mode 100644 index 533dd3f26e846cad5ab968a553e66a3ff72b5b15..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/rgb/0633.png and /dev/null differ diff --git a/examples/depth_normal/rgb/0663.png b/examples/depth_normal/rgb/0663.png deleted file mode 100644 index 1c203adfab262d6a76ecb43998650af1d39f7792..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/rgb/0663.png and /dev/null differ diff --git a/examples/depth_normal/rgb/0771.png b/examples/depth_normal/rgb/0771.png deleted file mode 100644 index 9e8468e89c9ee7521a2bf978386b9ac8caa8898d..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/rgb/0771.png and /dev/null differ diff --git a/examples/depth_normal/rgb/0782.png b/examples/depth_normal/rgb/0782.png deleted file mode 100644 index 32f6886e0711c130e1f87d06d479fd5b0d739db4..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/rgb/0782.png and /dev/null differ diff --git 
a/examples/depth_normal/rgb/1001.png b/examples/depth_normal/rgb/1001.png deleted file mode 100644 index 81ed4d32ad01c6664285382c77158cf81e75515a..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/rgb/1001.png and /dev/null differ diff --git a/examples/depth_normal/rgb/1051.png b/examples/depth_normal/rgb/1051.png deleted file mode 100644 index e5ae68a29809275502fe3e9573a72b5239a9bbb2..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/rgb/1051.png and /dev/null differ diff --git a/examples/depth_normal/rgb/1129.png b/examples/depth_normal/rgb/1129.png deleted file mode 100644 index 18863c260682f8f246cd0bea31c8769e571b08cb..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/rgb/1129.png and /dev/null differ diff --git a/examples/depth_normal/rgb/1205.png b/examples/depth_normal/rgb/1205.png deleted file mode 100644 index eab3bcc589b47e303d6384c81b1bba80c6f7ef08..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/rgb/1205.png and /dev/null differ diff --git a/examples/depth_normal/rgb/1336.png b/examples/depth_normal/rgb/1336.png deleted file mode 100644 index 186be4ce3e9a3e14a606cd4cb1c31af441273213..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/rgb/1336.png and /dev/null differ diff --git a/examples/depth_normal/rgb/1383.png b/examples/depth_normal/rgb/1383.png deleted file mode 100644 index a67899ab32d3f55d57dd91259c449563941b077b..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/rgb/1383.png and /dev/null differ diff --git a/examples/depth_normal/rgb/1386.png b/examples/depth_normal/rgb/1386.png deleted file mode 100644 index 94825b52ddf0fbf46a135858d5a6ffc0bee516fb..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/rgb/1386.png and /dev/null differ diff --git a/examples/depth_normal/rgb/1393.png b/examples/depth_normal/rgb/1393.png deleted file mode 100644 index 06136cf0eaf014f6afbc7cdb13d09807766a258e..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/rgb/1393.png and /dev/null differ diff --git a/examples/depth_normal/rgb/1447.png b/examples/depth_normal/rgb/1447.png deleted file mode 100644 index 81c627e153832b70cfb0bca136548c9298d4d7ed..0000000000000000000000000000000000000000 Binary files a/examples/depth_normal/rgb/1447.png and /dev/null differ diff --git a/examples/flower.mp4 b/examples/flower.mp4 deleted file mode 100644 index e56947e749ed98f50625fa30e4323e2699f5e22b..0000000000000000000000000000000000000000 --- a/examples/flower.mp4 +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:04f27dd2ae6f1a5cb836030983c9b423e182e6bf7de3e2138c97bf698c6f29ba -size 3139496 diff --git a/examples/fmri/sample000000498.jpg b/examples/fmri/sample000000498.jpg deleted file mode 100644 index 04638377ab900f6215333e579c2645b1358db07e..0000000000000000000000000000000000000000 Binary files a/examples/fmri/sample000000498.jpg and /dev/null differ diff --git a/examples/fmri/sample000000498.npy b/examples/fmri/sample000000498.npy deleted file mode 100644 index 47fb8079be709e296992bbd1b065c61c684b1342..0000000000000000000000000000000000000000 --- a/examples/fmri/sample000000498.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:db81853dbaf9ed1869162d9f7218e6b53edf6ea8fac391ef181f59304c3918d4 -size 94472 diff --git a/examples/fmri/sample000000789.jpg b/examples/fmri/sample000000789.jpg deleted file mode 100644 
index a264d1d70249cae4ab44348cb9e472ed0cd4bfc6..0000000000000000000000000000000000000000 Binary files a/examples/fmri/sample000000789.jpg and /dev/null differ diff --git a/examples/fmri/sample000000789.npy b/examples/fmri/sample000000789.npy deleted file mode 100644 index 8788bf494144b9f0b6b17df98860f9054c16a276..0000000000000000000000000000000000000000 --- a/examples/fmri/sample000000789.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e659eefe264f472847e496b4990459300fbb73587f0f99f69f630b3bdcd86b53 -size 94472 diff --git a/examples/fmri/sample000001037.jpg b/examples/fmri/sample000001037.jpg deleted file mode 100644 index 57ff3ba732878894e104c3e93622c165b3f752ad..0000000000000000000000000000000000000000 Binary files a/examples/fmri/sample000001037.jpg and /dev/null differ diff --git a/examples/fmri/sample000001037.npy b/examples/fmri/sample000001037.npy deleted file mode 100644 index 3a5a2cf33676e168f3efa6eab2e141c4a85db3e0..0000000000000000000000000000000000000000 --- a/examples/fmri/sample000001037.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e62dcb5ad478c8d50e3b60d558e4df56a03c1f3999883da2719c43464db48dc4 -size 94472 diff --git a/examples/fmri/sample000001554.jpg b/examples/fmri/sample000001554.jpg deleted file mode 100644 index 0c44473e9eb8d947756355405f0143ce4b537f98..0000000000000000000000000000000000000000 Binary files a/examples/fmri/sample000001554.jpg and /dev/null differ diff --git a/examples/fmri/sample000001554.npy b/examples/fmri/sample000001554.npy deleted file mode 100644 index cfe2f7905324d1dfeb39860d5eda5dee9ec252d8..0000000000000000000000000000000000000000 --- a/examples/fmri/sample000001554.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:54e647f3823c878647b2ef18966e033b6f9d865016afc30be2cd06e09b3511f8 -size 94472 diff --git a/examples/fmri/sample000002995.jpg b/examples/fmri/sample000002995.jpg deleted file mode 100644 index 74ec7d71f26f658ae20488541e667e2a77d98d9b..0000000000000000000000000000000000000000 Binary files a/examples/fmri/sample000002995.jpg and /dev/null differ diff --git a/examples/fmri/sample000002995.npy b/examples/fmri/sample000002995.npy deleted file mode 100644 index 4990a7d7123117c4c55766549c71c7e4531e923e..0000000000000000000000000000000000000000 --- a/examples/fmri/sample000002995.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a9d9187274787d4f1d07292c9c25ea117f7810902b75cd9434aade4a7d659f9d -size 94472 diff --git a/examples/fmri/sample000004364.jpg b/examples/fmri/sample000004364.jpg deleted file mode 100644 index fb86fd6d449a0165d220f0d2c414617a9e31e00d..0000000000000000000000000000000000000000 Binary files a/examples/fmri/sample000004364.jpg and /dev/null differ diff --git a/examples/fmri/sample000004364.npy b/examples/fmri/sample000004364.npy deleted file mode 100644 index 75a7ff22e36a07bd28a1d86a02d1afbec9f524c5..0000000000000000000000000000000000000000 --- a/examples/fmri/sample000004364.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1e4800075aa129fd3bbdd27c63c7597b375cac3552555ac1173db11542275125 -size 94472 diff --git a/examples/fmri/sample000004550.jpg b/examples/fmri/sample000004550.jpg deleted file mode 100644 index c47e80b1d921ee68877bd7d2d21adff3ddcf8573..0000000000000000000000000000000000000000 Binary files a/examples/fmri/sample000004550.jpg and /dev/null differ diff --git a/examples/fmri/sample000004550.npy 
b/examples/fmri/sample000004550.npy deleted file mode 100644 index cb2ee3c627fed44a56967b732cb9bb7dc5fb2c28..0000000000000000000000000000000000000000 --- a/examples/fmri/sample000004550.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:64f6c75ff93ddb5988fc07f861fe4ff053996dcd7d72a96807fce77efbb82cd7 -size 94472 diff --git a/examples/fmri/sample000005047.jpg b/examples/fmri/sample000005047.jpg deleted file mode 100644 index fc14277ec9c83a145951dee3b1d3ddd94813bf79..0000000000000000000000000000000000000000 Binary files a/examples/fmri/sample000005047.jpg and /dev/null differ diff --git a/examples/fmri/sample000005047.npy b/examples/fmri/sample000005047.npy deleted file mode 100644 index 961b9bc63816f8774251cc637e93ef51866e1e14..0000000000000000000000000000000000000000 --- a/examples/fmri/sample000005047.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8e44a51e7f0875a3a5d9b132ccc9f2f0ce2af13a566956272773727f80f2afa3 -size 94472 diff --git a/examples/fmri/sample000005139.jpg b/examples/fmri/sample000005139.jpg deleted file mode 100644 index 7204d5e7d384d24ec53b83f2dc07ace3774ce9a2..0000000000000000000000000000000000000000 Binary files a/examples/fmri/sample000005139.jpg and /dev/null differ diff --git a/examples/fmri/sample000005139.npy b/examples/fmri/sample000005139.npy deleted file mode 100644 index 8558f84729ef4400b3d7ed508d5e6ce499535f85..0000000000000000000000000000000000000000 --- a/examples/fmri/sample000005139.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7d08f633cc921f084de2512717b08f14631581c74eacfaf7e2e025b59adc377b -size 94472 diff --git a/examples/food_menu.png b/examples/food_menu.png deleted file mode 100644 index 511436d7957fbf86bff5ea70d13d3f034a3ab20f..0000000000000000000000000000000000000000 --- a/examples/food_menu.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0ba055ca9485f77c577433701193d03267dce9e6d08916c44d7f3435287dca96 -size 1075142 diff --git a/examples/new_york.jpg b/examples/new_york.jpg deleted file mode 100644 index f6cc93d46a6603c58e3020198aef737dada90f3e..0000000000000000000000000000000000000000 Binary files a/examples/new_york.jpg and /dev/null differ diff --git a/examples/point/0031ba19d3e042c4bcf79eba40ccc812_8192.npy b/examples/point/0031ba19d3e042c4bcf79eba40ccc812_8192.npy deleted file mode 100644 index c6d580cdc86c4b1ad50065a5acbc643a67c8ca37..0000000000000000000000000000000000000000 --- a/examples/point/0031ba19d3e042c4bcf79eba40ccc812_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d3d3c8cd5e953e30bc86a706e1c69d629bde6711820e3ba4d106ccd1befe7686 -size 196736 diff --git a/examples/point/1a282b70b3f14fd79d739891e4327df5_8192.npy b/examples/point/1a282b70b3f14fd79d739891e4327df5_8192.npy deleted file mode 100644 index b70e5bbbd4f4e7038f6203894f02a1b43498e9f6..0000000000000000000000000000000000000000 --- a/examples/point/1a282b70b3f14fd79d739891e4327df5_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:874bc35c0b23a5bc6063e71b420ae51b82190862b87b4c37e8295f5ca4be2e05 -size 196736 diff --git a/examples/point/1ee7fc4081ac47ccaae8112b6e925c51_8192.npy b/examples/point/1ee7fc4081ac47ccaae8112b6e925c51_8192.npy deleted file mode 100644 index b093b25d638c023d9d1e0bf7aa4a19b486668c0e..0000000000000000000000000000000000000000 --- a/examples/point/1ee7fc4081ac47ccaae8112b6e925c51_8192.npy +++ /dev/null @@ -1,3 
+0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:31d011e25b3745bcd9f3266b9d9f4b4cd03d680f007b6fc46de7b621dbc2bbb4 -size 196736 diff --git a/examples/point/2221aec5809d43d88aedf82738432318_8192.npy b/examples/point/2221aec5809d43d88aedf82738432318_8192.npy deleted file mode 100644 index a9e1868fdc9b2ce87d243a0a0ee0cee7b924b98f..0000000000000000000000000000000000000000 --- a/examples/point/2221aec5809d43d88aedf82738432318_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d7569c06b11aaea351477d244d8e76152501190f16e28227de2ebf7ba5354c0b -size 196736 diff --git a/examples/point/3061e9a9236d4b98b74c95ff189d28a0_8192.npy b/examples/point/3061e9a9236d4b98b74c95ff189d28a0_8192.npy deleted file mode 100644 index 4d246837ff09291d6c4216a7e029685117951540..0000000000000000000000000000000000000000 --- a/examples/point/3061e9a9236d4b98b74c95ff189d28a0_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3202eed35e0259d8ff00a49c35b2f0f61324fd90da78ea832f5c356fccd9ea35 -size 196736 diff --git a/examples/point/39930c4d685f4a9c8f7bd63c1a2a7dee_8192.npy b/examples/point/39930c4d685f4a9c8f7bd63c1a2a7dee_8192.npy deleted file mode 100644 index 6a4c24f848b82cc62f7c3bf87f8873ddc0d083e6..0000000000000000000000000000000000000000 --- a/examples/point/39930c4d685f4a9c8f7bd63c1a2a7dee_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4eafe3bed10c54cf0f862982a36bf4edf5cb7fccb8e0af7c0188a5a900d61286 -size 196736 diff --git a/examples/point/3a61b9d2451c4df2a1c6a6550e1875df_8192.npy b/examples/point/3a61b9d2451c4df2a1c6a6550e1875df_8192.npy deleted file mode 100644 index cd3e149bf44a0b7419f95e2f4e07298001fcf563..0000000000000000000000000000000000000000 --- a/examples/point/3a61b9d2451c4df2a1c6a6550e1875df_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6f9be8648887b54273884df915f1f351877499fb6092f2e85b6cfaf67b03476e -size 196736 diff --git a/examples/point/442e0532a7634ad1ba22d28534267e2f_8192.npy b/examples/point/442e0532a7634ad1ba22d28534267e2f_8192.npy deleted file mode 100644 index 831b55f10815d7028011c868fd95531722eef031..0000000000000000000000000000000000000000 --- a/examples/point/442e0532a7634ad1ba22d28534267e2f_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cf7dfbf43f9794c8a5c90033eb458010d13defa6dc04a1d61b274675e248bfa7 -size 196736 diff --git a/examples/point/463fe91df66e44a681ebbcd7f7ba59e4_8192.npy b/examples/point/463fe91df66e44a681ebbcd7f7ba59e4_8192.npy deleted file mode 100644 index ab82b31f4088e351abbf3b0a79fdb23b5f3bf643..0000000000000000000000000000000000000000 --- a/examples/point/463fe91df66e44a681ebbcd7f7ba59e4_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:21033208a5418f923e1c281d5bdd81ac75b09c6d047030350a718bd50d8f5185 -size 196736 diff --git a/examples/point/494854c6ffd540339a1bc6bd91c0349b_8192.npy b/examples/point/494854c6ffd540339a1bc6bd91c0349b_8192.npy deleted file mode 100644 index 80408012c7e5b02fbce43e7d01252ad23223df0a..0000000000000000000000000000000000000000 --- a/examples/point/494854c6ffd540339a1bc6bd91c0349b_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c5a87443e5b14abcf04738ae9ceb01c62e87c3b5108e1db147880b7bdb700160 -size 196736 diff --git a/examples/point/4adbacc99de44b5597ce3e1f62026fc1_8192.npy 
b/examples/point/4adbacc99de44b5597ce3e1f62026fc1_8192.npy deleted file mode 100644 index 8fd1e3f91e6fc8bf88ea6daed190f571eb0556b9..0000000000000000000000000000000000000000 --- a/examples/point/4adbacc99de44b5597ce3e1f62026fc1_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:81d79afe15fa2c011dd82c7cfbbe17bb062b364824abcb37f7b3da68a763a4c3 -size 196736 diff --git a/examples/point/4d6f0a6aab8e45f4af131ac517c74363_8192.npy b/examples/point/4d6f0a6aab8e45f4af131ac517c74363_8192.npy deleted file mode 100644 index 95b22997a5acc93e2ea0b03ce46b57d2691dc61d..0000000000000000000000000000000000000000 --- a/examples/point/4d6f0a6aab8e45f4af131ac517c74363_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2ad1e243593b19e7e58a77f0512a8f892d663142853efac1e56c5a83db78cc50 -size 196736 diff --git a/examples/point/541747a461f84142a113e2f591a7eb9b_8192.npy b/examples/point/541747a461f84142a113e2f591a7eb9b_8192.npy deleted file mode 100644 index 4a5462ff31a8089082d316f5b6c346e5dbadefc5..0000000000000000000000000000000000000000 --- a/examples/point/541747a461f84142a113e2f591a7eb9b_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d00c235c92766c9dde832209df18f036e68c74b11e477f40d283cde1af8b1ea2 -size 196736 diff --git a/examples/point/6796698b4d564377838c06319ec96f90_8192.npy b/examples/point/6796698b4d564377838c06319ec96f90_8192.npy deleted file mode 100644 index 35de7ae372bec975e106c297ae9da201c0e08a8e..0000000000000000000000000000000000000000 --- a/examples/point/6796698b4d564377838c06319ec96f90_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:da91660e20b77a3f9f8a8ab173cc6c0998063282d387049906252bbf661f0b5f -size 196736 diff --git a/examples/point/69865c89fc7344be8ed5c1a54dbddc20_8192.npy b/examples/point/69865c89fc7344be8ed5c1a54dbddc20_8192.npy deleted file mode 100644 index f6acb82a325911acc69d8a56684ee4b2240831a7..0000000000000000000000000000000000000000 --- a/examples/point/69865c89fc7344be8ed5c1a54dbddc20_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:99a8363e432756f90425b6a5192cf5b8a547a5bd009577e72203fd42fd2319cf -size 196736 diff --git a/examples/point/7610a160fbaf41d681c7d7b9627df85d_8192.npy b/examples/point/7610a160fbaf41d681c7d7b9627df85d_8192.npy deleted file mode 100644 index 511fb7bc10aec9e71fcf5ccde7a2322bdc4b57db..0000000000000000000000000000000000000000 --- a/examples/point/7610a160fbaf41d681c7d7b9627df85d_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ca43630b2f139a5b0a90633f1d13d3f61e4432bf9976de5c233791ab57553dae -size 196736 diff --git a/examples/point/7f309e755a0c4b69b64e667977268087_8192.npy b/examples/point/7f309e755a0c4b69b64e667977268087_8192.npy deleted file mode 100644 index d2018b84489b367622b1f5462b3b2bca4195282a..0000000000000000000000000000000000000000 --- a/examples/point/7f309e755a0c4b69b64e667977268087_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0add1f7bb1ad3f4b70c383431e7a2ac372c8e4377d2987c4fc08f6bd6776d5af -size 196736 diff --git a/examples/point/844518dcccf44c86ad1ead660ab103fb_8192.npy b/examples/point/844518dcccf44c86ad1ead660ab103fb_8192.npy deleted file mode 100644 index 1add0562ccfe7db199118d5a96e99bb9bb4b73bb..0000000000000000000000000000000000000000 --- a/examples/point/844518dcccf44c86ad1ead660ab103fb_8192.npy +++ /dev/null @@ -1,3 +0,0 
@@ -version https://git-lfs.github.com/spec/v1 -oid sha256:60c2a1284188bf60e405e27f2008030247657ea256e9e3f95940afde294a8080 -size 196736 diff --git a/examples/point/88867268d228490da73b99d3126dc25c_8192.npy b/examples/point/88867268d228490da73b99d3126dc25c_8192.npy deleted file mode 100644 index 0c5d9d52ada786f07cd4ffb2d8504ee9d6fc9f4a..0000000000000000000000000000000000000000 --- a/examples/point/88867268d228490da73b99d3126dc25c_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:53bca2ad577b7eaf2c531379f69d9d904ad632a57205763ff5e17f2d2bfa7716 -size 196736 diff --git a/examples/point/8c833864d0bb40f4b3f3a3fe1e99e702_8192.npy b/examples/point/8c833864d0bb40f4b3f3a3fe1e99e702_8192.npy deleted file mode 100644 index 71d5da5b41e311f4d5835a382588cded070e0f6b..0000000000000000000000000000000000000000 --- a/examples/point/8c833864d0bb40f4b3f3a3fe1e99e702_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:89dff97c8370e615e5674b17a7ef527b24af2bfbfe764294a5efa103064ed887 -size 196736 diff --git a/examples/point/983fa8b23a084f5dacd157e6c9ceba97_8192.npy b/examples/point/983fa8b23a084f5dacd157e6c9ceba97_8192.npy deleted file mode 100644 index ef9998819af5e8486a59bf282aec68de8223939f..0000000000000000000000000000000000000000 --- a/examples/point/983fa8b23a084f5dacd157e6c9ceba97_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b7a67c55f4bc9a29edb9eaf458bfdcc67cb93fad504da69f6afe0e10cf1c1e1e -size 196736 diff --git a/examples/point/9e9fa9bb1761412bb0e7ca9913f64e91_8192.npy b/examples/point/9e9fa9bb1761412bb0e7ca9913f64e91_8192.npy deleted file mode 100644 index d5e46cf24109459ff29a370b7eb51c8914b2f61d..0000000000000000000000000000000000000000 --- a/examples/point/9e9fa9bb1761412bb0e7ca9913f64e91_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:afcffb80b428ee8728a9d30b0c7643655a1419a41872f4313082ad6768ee1eb2 -size 196736 diff --git a/examples/point/a4a0a21eedae4d82b0deac43b61dc4a3_8192.npy b/examples/point/a4a0a21eedae4d82b0deac43b61dc4a3_8192.npy deleted file mode 100644 index 8f63fe8598f7b5f2306cad3e7ed24290d49a8c4e..0000000000000000000000000000000000000000 --- a/examples/point/a4a0a21eedae4d82b0deac43b61dc4a3_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:895b30c7ac4e89e53b4beaac9199c33d1bd4fa8a39f15fa52fe237365bad9ed4 -size 196736 diff --git a/examples/point/abc413b5e2d14a028dab615b49777167_8192.npy b/examples/point/abc413b5e2d14a028dab615b49777167_8192.npy deleted file mode 100644 index 4ff8fa0f780b4fe80b4ad87ada1f129af14645ad..0000000000000000000000000000000000000000 --- a/examples/point/abc413b5e2d14a028dab615b49777167_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:17dd6a38a6a0ad22e1ac17265d49b50083f196cbd8bfeb1f67cdfbc8df5eacc8 -size 196736 diff --git a/examples/point/b65b0ad074664b4aa624583e52fa685e_8192.npy b/examples/point/b65b0ad074664b4aa624583e52fa685e_8192.npy deleted file mode 100644 index f0ab2d7f6e9305c87e4aa3a5fee497d7ed8cbd44..0000000000000000000000000000000000000000 --- a/examples/point/b65b0ad074664b4aa624583e52fa685e_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:06e1b52c90abd98dc01adfa51048dc16806abbd4609c8cb1ab12bb99e52f65d3 -size 196736 diff --git a/examples/point/c20eb3a5a93e4cddb06c2f98626b1830_8192.npy 
b/examples/point/c20eb3a5a93e4cddb06c2f98626b1830_8192.npy deleted file mode 100644 index b93a3b131994b9f286d913e5edaa1db69cecc079..0000000000000000000000000000000000000000 --- a/examples/point/c20eb3a5a93e4cddb06c2f98626b1830_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3793341c5f418f53b09b13e50a969b9bbb6ac865c4c59c273bbf41a06f1bc238 -size 196736 diff --git a/examples/point/d43136ef2b1a46998b7214df1a15b72c_8192.npy b/examples/point/d43136ef2b1a46998b7214df1a15b72c_8192.npy deleted file mode 100644 index 4ef4d4bea6fa77c01db9cdac6be0714ab9108463..0000000000000000000000000000000000000000 --- a/examples/point/d43136ef2b1a46998b7214df1a15b72c_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:32cd2cd274d12c93873f41653a3d94599a8ab1108df029365f20cfd474f6bf38 -size 196736 diff --git a/examples/point/d4aa4126fce64397935d128ed54c3def_8192.npy b/examples/point/d4aa4126fce64397935d128ed54c3def_8192.npy deleted file mode 100644 index 21f3f99a8fbcf123b00334081a9d1993769b70d2..0000000000000000000000000000000000000000 --- a/examples/point/d4aa4126fce64397935d128ed54c3def_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2cc8f49dcfeb8aca490d2ac7f08e6790ca09ed464f4be8b744710d365c23a487 -size 196736 diff --git a/examples/point/dd491278516040998667f9dd998fad4f_8192.npy b/examples/point/dd491278516040998667f9dd998fad4f_8192.npy deleted file mode 100644 index b214b850f8c2d920c384deb2dd1747ea4124c990..0000000000000000000000000000000000000000 --- a/examples/point/dd491278516040998667f9dd998fad4f_8192.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1e4d509ddcd27c36f7c413cc31b5372cc4510cf8d73df0521107b404279c4698 -size 196736 diff --git a/examples/star_kun.mp4 b/examples/star_kun.mp4 deleted file mode 100644 index 57b6f00d490e30d72bb7184c73a100604436190e..0000000000000000000000000000000000000000 --- a/examples/star_kun.mp4 +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5762ad0ae954da8fbc33d524e90969b9fc56745f2ce513724e2ff40da3f1ac16 -size 14008046 diff --git a/model/LLM/__init__.py b/model/LLM/__init__.py deleted file mode 100644 index 9e8eb9e9325f1906f28a9d60d967ff76963ff1a8..0000000000000000000000000000000000000000 --- a/model/LLM/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from . 
import onellm \ No newline at end of file diff --git a/model/LLM/__pycache__/__init__.cpython-310.pyc b/model/LLM/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index e70f6416d504770062ceb50661a6094181c47ea2..0000000000000000000000000000000000000000 Binary files a/model/LLM/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/model/LLM/__pycache__/__init__.cpython-39.pyc b/model/LLM/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index be815601063517a817e23e64c9a6208e4e66d833..0000000000000000000000000000000000000000 Binary files a/model/LLM/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/model/LLM/__pycache__/onellm.cpython-310.pyc b/model/LLM/__pycache__/onellm.cpython-310.pyc deleted file mode 100644 index ccf829243e41031a865186ba965b5d98d44174f1..0000000000000000000000000000000000000000 Binary files a/model/LLM/__pycache__/onellm.cpython-310.pyc and /dev/null differ diff --git a/model/LLM/__pycache__/onellm.cpython-39.pyc b/model/LLM/__pycache__/onellm.cpython-39.pyc deleted file mode 100644 index c82979b64a1ece8c7e506cf5b6df9a77d81ffe6f..0000000000000000000000000000000000000000 Binary files a/model/LLM/__pycache__/onellm.cpython-39.pyc and /dev/null differ diff --git a/model/LLM/onellm.py b/model/LLM/onellm.py deleted file mode 100644 index f363317224c40f96578931f6f1e2ac3f9133612e..0000000000000000000000000000000000000000 --- a/model/LLM/onellm.py +++ /dev/null @@ -1,494 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed according to the terms of the GNU General Public License version 3. - -from typing import Optional, Tuple -from dataclasses import dataclass -import math -import functools -import copy - -import torch -from torch import nn -import torch.nn.functional as F - -import fairscale.nn.model_parallel.initialize as fs_init -from fairscale.nn.model_parallel.layers import ( - ParallelEmbedding, - RowParallelLinear, - ColumnParallelLinear, -) -from ..components import RMSNorm -from flash_attn import flash_attn_func - -import open_clip - - -default_linear_init = nn.init.xavier_uniform_ - - -@dataclass -class ModelArgs: - dim: int = 512 - n_layers: int = 8 - n_heads: int = 8 - vocab_size: int = -1 # defined later by tokenizer - multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 - norm_eps: float = 1e-5 - - max_batch_size: int = 32 - max_seq_len: int = 2048 - - -def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): - freqs = 1.0 / (theta ** (torch.arange(0, dim, 2) - [: (dim // 2)].float() / dim)) - t = torch.arange(end, device=freqs.device) # type: ignore - freqs = torch.outer(t, freqs).float() # type: ignore - freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 - return freqs_cis - - -def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): - ndim = x.ndim - assert 0 <= 1 < ndim - assert freqs_cis.shape == (x.shape[1], x.shape[-1]) - shape = [d if i == 1 or i == ndim - - 1 else 1 for i, d in enumerate(x.shape)] - return freqs_cis.view(*shape) - - -def apply_rotary_emb( - xq: torch.Tensor, - xk: torch.Tensor, - freqs_cis: torch.Tensor, -) -> Tuple[torch.Tensor, torch.Tensor]: - xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) - xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) - freqs_cis = reshape_for_broadcast(freqs_cis, xq_) - xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) - xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) - 
return xq_out.type_as(xq), xk_out.type_as(xk) - - -class Attention(nn.Module): - def __init__(self, args: ModelArgs): - super().__init__() - - self.n_local_heads = args.n_heads // fs_init.get_model_parallel_world_size() - self.head_dim = args.dim // args.n_heads - - self.wq = ColumnParallelLinear( - args.dim, - args.n_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=default_linear_init, - ) - self.wk = ColumnParallelLinear( - args.dim, - args.n_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=default_linear_init, - ) - self.wv = ColumnParallelLinear( - args.dim, - args.n_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=default_linear_init, - ) - self.wo = RowParallelLinear( - args.n_heads * self.head_dim, - args.dim, - bias=False, - input_is_parallel=True, - init_method=default_linear_init, - ) - - self.flash = True - self.k_cache, self.v_cache = None, None - - def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor], prompt=None): - bsz, seqlen, _ = x.shape - xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) - - xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) - xk = xk.view(bsz, seqlen, self.n_local_heads, self.head_dim) - xv = xv.view(bsz, seqlen, self.n_local_heads, self.head_dim) - - if freqs_cis is not None: - xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis) - - if self.k_cache is None or self.v_cache is None: - keys, values = xk, xv - else: - self.k_cache = self.k_cache.to(xk) - self.v_cache = self.v_cache.to(xv) - self.k_cache[:bsz, start_pos: start_pos + seqlen, :, :] = xk - self.v_cache[:bsz, start_pos: start_pos + seqlen, :, :] = xv - keys = self.k_cache[:bsz, :start_pos + seqlen] - values = self.v_cache[:bsz, :start_pos + seqlen] - - output = flash_attn_func( - xq, keys, values, dropout_p=0.0, causal=mask is not None) - output = output.contiguous().view(bsz, seqlen, -1) - - return self.wo(output) - - def allocate_kv_cache(self, max_batch_size: int, max_seq_len: int) -> None: - kv_cache_shape = (max_batch_size, max_seq_len, - self.n_local_heads, self.head_dim) - if self.k_cache is None or self.k_cache.size() != kv_cache_shape: - self.k_cache = torch.empty(kv_cache_shape) - if self.v_cache is None or self.v_cache.size() != kv_cache_shape: - self.v_cache = torch.empty(kv_cache_shape) - - def destroy_kv_cache(self) -> None: - self.k_cache, self.v_cache = None, None - - -class FeedForward(nn.Module): - def __init__( - self, - dim: int, - hidden_dim: int, - multiple_of: int, - ): - super().__init__() - hidden_dim = int(2 * hidden_dim / 3) - hidden_dim = multiple_of * \ - ((hidden_dim + multiple_of - 1) // multiple_of) - - self.w1 = ColumnParallelLinear( - dim, hidden_dim, bias=False, gather_output=False, init_method=default_linear_init, - ) - self.w2 = RowParallelLinear( - hidden_dim, dim, bias=False, input_is_parallel=True, init_method=default_linear_init - ) - self.w3 = ColumnParallelLinear( - dim, hidden_dim, bias=False, gather_output=False, init_method=default_linear_init - ) - - def _silu_gating(self, x, y): - return F.silu(x) * y - - def forward(self, x): - return self.w2(self._silu_gating(self.w1(x), self.w3(x))) - - -class TransformerBlock(nn.Module): - def __init__(self, layer_id: int, args: ModelArgs): - super().__init__() - self.n_heads = args.n_heads - self.dim = args.dim - self.head_dim = args.dim // args.n_heads - self.attention = Attention(args) - self.feed_forward = FeedForward( - dim=args.dim, hidden_dim=4 * args.dim, 
multiple_of=args.multiple_of - ) - self.layer_id = layer_id - self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps) - self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps) - - def _forward_ffn(self, h): - return h + self.feed_forward(self.ffn_norm(h)) - - def _forward_attention(self, x, start_pos, freqs_cis, mask, prompt): - return x + self.attention.forward(self.attention_norm(x), start_pos, freqs_cis, mask, prompt) - - def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor], prompt=None): - h = self._forward_attention(x, start_pos, freqs_cis, mask, prompt) - out = self._forward_ffn(h) - return out - - -class Mlp(nn.Module): - """ MLP as used in Vision Transformer, MLP-Mixer and related networks - """ - - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.fc2(x) - return x - - -class Transformer(nn.Module): - def __init__(self, params: ModelArgs): - super().__init__() - self.params = params - self.vocab_size = params.vocab_size - self.n_layers = params.n_layers - self.tok_embeddings = ParallelEmbedding( - params.vocab_size, params.dim, init_method=nn.init.normal_, - ) - - self.layers = torch.nn.ModuleList() - for layer_id in range(params.n_layers): - self.layers.append(TransformerBlock(layer_id, params)) - - self.norm = RMSNorm(params.dim, eps=params.norm_eps) - self.output = ColumnParallelLinear( - params.dim, params.vocab_size, bias=False, init_method=default_linear_init, - ) - - self.freqs_cis = precompute_freqs_cis( - self.params.dim // self.params.n_heads, self.params.max_seq_len * 2 - ) - - # load clip - self.clip, _, _ = open_clip.create_model_and_transforms( - 'ViT-L-14', pretrained='openai') - for param in self.clip.parameters(): - param.requires_grad = False - param.data = param.data.half() - self.clip.transformer = None - - self.image_words = 30 - self.cache_image_words = 0 # for inference - - clip_width = self.clip.visual.conv1.out_channels - # create modal shared modules - self.resample_layers = nn.ModuleDict() - self.num_experts = 3 - self.num_resample_layers = 8 - for expert in range(self.num_experts): - expert = str(expert) - self.resample_layers[expert] = nn.ModuleList() - resampler_params = copy.deepcopy(params) - resampler_params.n_heads = 16 - resampler_params.dim = clip_width - for layer_id in range(self.num_resample_layers): - self.resample_layers[expert].append( - TransformerBlock(layer_id, resampler_params)) - - self.conv1 = nn.ModuleDict() - self.positional_embedding = nn.ParameterDict() - self.resample_tokens = nn.ParameterDict() - self.clip_proj1 = nn.ModuleDict() - self.clip_proj2 = nn.ModuleDict() - self.routers = nn.ModuleDict() - self.start_tag = nn.ParameterDict() - self.end_tag = nn.ParameterDict() - self.modals = ['image', 'audio', 'point', 'video', 'rgbd', 'rgbn', 'fmri', 'imu'] - for modal in self.modals: - if modal in ['image', 'video', 'rgbn', 'rgbn']: - modal_tokens = 256 + 1 - pass - elif modal == 'audio': - self.conv1[modal] = nn.Conv2d( - 1, clip_width, kernel_size=(16, 16), stride=(10, 10)) - modal_tokens = 1212 + 1 - self.positional_embedding[modal] = nn.Parameter( - torch.empty([modal_tokens, clip_width])) - 
nn.init.normal_(self.positional_embedding[modal], std=0.02) - elif modal == 'point': - from model.lib.point_utils import PointPatchEmbed - self.conv1[modal] = PointPatchEmbed( - in_channels=6, channels=clip_width) - modal_tokens = 1024 + 1 - self.positional_embedding[modal] = nn.Parameter( - torch.empty([modal_tokens, clip_width])) - nn.init.normal_(self.positional_embedding[modal], std=0.02) - elif modal == 'fmri': - self.conv1[modal] = nn.Linear(15724, 8192) - self.positional_embedding[modal] = nn.Parameter( - torch.empty([8+1, clip_width])) - nn.init.normal_(self.positional_embedding[modal], std=0.02) - elif modal == 'imu': - self.conv1[modal] = nn.Conv1d( - in_channels=6, out_channels=clip_width, kernel_size=10, bias=False) - self.positional_embedding[modal] = nn.Parameter( - torch.empty([391+1, clip_width])) - nn.init.normal_(self.positional_embedding[modal], std=0.02) - - self.routers[modal] = Mlp( - clip_width, clip_width * 4, self.num_experts) - - self.resample_tokens[modal] = nn.Parameter( - torch.empty([1, 30, resampler_params.dim])) - nn.init.normal_(self.resample_tokens[modal], std=0.02) - - self.clip_proj1[modal] = nn.Sequential( - nn.Linear(clip_width, resampler_params.dim), - nn.LayerNorm(resampler_params.dim)) - - self.clip_proj2[modal] = nn.Sequential( - nn.Linear(resampler_params.dim, params.dim), - nn.LayerNorm(params.dim)) - - self.start_tag[modal] = nn.Parameter(torch.rand(1, 1, params.dim)) - self.end_tag[modal] = nn.Parameter(torch.rand(1, 1, params.dim)) - - # @torch.no_grad() - - def clip_encode_image(self, x, modal='image'): - # shape = [*, width, grid ** 2] - x = x.reshape(x.shape[0], x.shape[1], -1) - x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] - - x = torch.cat([self.clip.visual.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, - x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width] - - # use pretrained pos embeding for rest modalities - pos_embedding = self.clip.visual.positional_embedding - if modal in ['audio', 'point', 'fmri', 'imu']: - pos_embedding = self.positional_embedding[modal] - - x = x + pos_embedding.to(x.dtype) - x = self.clip.visual.ln_pre(x) - - x = x.permute(1, 0, 2) # NLD -> LND - x = self.clip.visual.transformer(x) - x = x.permute(1, 0, 2) # LND -> NLD - - # preserve all spatial tokens - x = self.clip.visual.ln_post(x[:, :, :]) - - # if self.clip.visual.proj is not None: - # x = x @ self.clip.visual.proj - - return x - - def encode_image(self, x, modal='image'): - bsz = x.size(0) - T = 1 - if modal in ['image']: - # modified from CLIP - x = self.clip.visual.conv1(x) # shape = [*, width, grid, grid] - elif modal in ['audio', 'imu']: - x = self.conv1[modal](x) - elif modal == 'point': - # [B, 16384, 6] -> [B, 1024, 1024, 1] - x = self.conv1[modal](x.float()).to(x.dtype) - elif modal in ['video', 'rgbd', 'rgbn']: - # [B, 15, 3, 224, 224] - B, T = x.shape[:2] - bsz = B * T - x = x.reshape(bsz, *x.shape[2:]) - x = self.clip.visual.conv1(x) - elif modal == 'fmri': - x = self.conv1[modal](x) - # [B, 1, 8196] -> [B, 1024, 8] - x = x.reshape(x.size(0), self.clip.visual.conv1.out_channels, -1) - - image_feats = self.clip_encode_image(x, modal=modal) - # take mean on time dimension - # all inputs are reduced to [B, L, D] - bsz = int(bsz / T) - image_feats = image_feats.reshape( - bsz, T, *image_feats.shape[1:]).mean(dim=1) - - image_feats = self.clip_proj1[modal](image_feats) - image_feats = torch.cat( - [self.resample_tokens[modal].repeat(bsz, 1, 1), image_feats], dim=1) - - # routing modalites 
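# What follows in the deleted code is a soft mixture-of-experts step: the per-modality
# router scores each expert resampler stack, the sigmoid scores are normalized to sum
# to one, and the expert outputs are combined as a weighted sum. A minimal,
# self-contained sketch of that idea (illustrative only; toy sizes, and plain
# nn.Linear layers stand in for the resampler blocks used here):
import torch
from torch import nn

B, L, D, E = 2, 31, 64, 3                     # batch, tokens, width, experts (assumed toy values)
feats = torch.randn(B, L, D)
router = nn.Linear(D, E)                      # stand-in for the Mlp router above
experts = nn.ModuleList(nn.Linear(D, D) for _ in range(E))

weights = router(feats).sigmoid()                      # [B, L, E] routing scores
weights = weights / weights.sum(dim=-1, keepdim=True)  # normalize so the E weights sum to 1
mixed = sum(experts[e](feats) * weights[..., e:e + 1] for e in range(E))
# mixed: [B, L, D], a per-token weighted blend of the expert outputs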
- # [B, L, D]->[B, L, N] - routing_weights = self.routers[modal](image_feats).sigmoid() - routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True) - - image_feats_experts = [] - for expert_id in range(self.num_experts): - image_feats_expert = image_feats - for layer in self.resample_layers[str(expert_id)]: - image_feats_expert = layer(image_feats_expert, 0, None, None) - - image_feats_expert = image_feats_expert[:, :self.resample_tokens[modal].size(1)] - routing_weight = routing_weights[:, :self.resample_tokens[modal].size( - 1), expert_id] - # [B, L, D] * [B, L, 1] - image_feats_expert = image_feats_expert * routing_weight[:, :, None] - - image_feats_experts.append(image_feats_expert) - - image_feats = sum(image_feats_experts) - image_feats = self.clip_proj2[modal](image_feats) - - return image_feats - - def forward(self, examples, image=None, modal='image'): - self._destroy_kv_cache() # training always disables kv cache - modal = modal[0] - _bsz, seqlen = examples.shape - h = self.tok_embeddings(examples) - self.freqs_cis = self.freqs_cis.to(h.device) - - start_pos = 0 - prefix_len = 0 - if image is not None: - h_bos, h_caption = h[:, :1], h[:, 1:] - image_tokens = self.encode_image(image, modal) - h = torch.cat((h_bos, self.start_tag[modal].expand( - _bsz, -1, -1), image_tokens, self.end_tag[modal].expand(_bsz, -1, -1), h_caption), dim=1) - # bos + image token + start_tag[modal], end_tag[modal] is used for caption generation - prefix_len = image_tokens.shape[1] + 1 + 1 - seqlen = h.shape[1] - - freqs_cis = self.freqs_cis[start_pos:start_pos + seqlen] - mask = torch.full((1, 1, seqlen, seqlen), float("-inf"), device=h.device) - mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h) - for layer in self.layers: - h = layer(h, start_pos, freqs_cis, mask) - h = self.norm(h) - output = self.output(h[:, prefix_len:, :]) - return output - - @torch.inference_mode() - def forward_inference(self, tokens: torch.Tensor, start_pos: int, image=None, modal='image'): - modal = modal[0] if isinstance(modal, list) else modal - _bsz, seqlen = tokens.shape - if start_pos == 0: - # kv cache will not re-allocate if size is unchanged - self._allocate_kv_cache(_bsz) - h = self.tok_embeddings(tokens) - self.freqs_cis = self.freqs_cis.to(h.device) - - if image is not None: - h_bos, h_caption = h[:, :1], h[:, 1:] - image_tokens = self.encode_image(image, modal) - self.cache_image_words = image_tokens.shape[1] - h = torch.cat((h_bos, self.start_tag[modal].repeat(_bsz, 1, 1), image_tokens, self.end_tag[modal].repeat(_bsz, 1, 1), h_caption), dim=1) - seqlen = h.shape[1] - freqs_cis = self.freqs_cis[0: seqlen] - else: - if start_pos == 0: - self.cache_image_words = 0 - freqs_cis = self.freqs_cis[0: seqlen] - else: - # if image was not None when start_pos=0, - # the offset should be added to start_pos within later forward_inference calls - start_pos = start_pos + self.cache_image_words - freqs_cis = self.freqs_cis[start_pos: start_pos + seqlen] - - # freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen] - - mask = None - if seqlen > 1: - mask = torch.full((1, 1, seqlen, seqlen), float("-inf"), device=tokens.device) - mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h) - - for layer in self.layers: - h = layer(h, start_pos, freqs_cis, mask) - h = self.norm(h) - output = self.output(h[:, -1, :]) # only compute last logits - return output.float() - - def _allocate_kv_cache(self, max_batch_size: int) -> None: - for layer in self.layers: - layer.attention.allocate_kv_cache( - 
max_batch_size, self.params.max_seq_len) - - def _destroy_kv_cache(self) -> None: - for layer in self.layers: - layer.attention.destroy_kv_cache() diff --git a/model/__init__.py b/model/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/model/__pycache__/__init__.cpython-310.pyc b/model/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index ab67f64cfe739a7a1c51327e5e7a0ea2afc50cd9..0000000000000000000000000000000000000000 Binary files a/model/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/model/__pycache__/__init__.cpython-39.pyc b/model/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index bea7e8cd12224f18eb3eefbc92abf61852979fab..0000000000000000000000000000000000000000 Binary files a/model/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/model/__pycache__/components.cpython-39.pyc b/model/__pycache__/components.cpython-39.pyc deleted file mode 100644 index dfbf25224cf34dab4fa2f85fff462d2dbef6b4d6..0000000000000000000000000000000000000000 Binary files a/model/__pycache__/components.cpython-39.pyc and /dev/null differ diff --git a/model/__pycache__/meta.cpython-310.pyc b/model/__pycache__/meta.cpython-310.pyc deleted file mode 100644 index 3fbd547ed81e5dc062ca75c125fe8c8a668b5ead..0000000000000000000000000000000000000000 Binary files a/model/__pycache__/meta.cpython-310.pyc and /dev/null differ diff --git a/model/__pycache__/meta.cpython-39.pyc b/model/__pycache__/meta.cpython-39.pyc deleted file mode 100644 index b69c01b1098d55637fb39f5be3aed62ddaf7cf43..0000000000000000000000000000000000000000 Binary files a/model/__pycache__/meta.cpython-39.pyc and /dev/null differ diff --git a/model/__pycache__/tokenizer.cpython-310.pyc b/model/__pycache__/tokenizer.cpython-310.pyc deleted file mode 100644 index 9a4452629f6f6edcb5522834a8e5bbdfc825b48e..0000000000000000000000000000000000000000 Binary files a/model/__pycache__/tokenizer.cpython-310.pyc and /dev/null differ diff --git a/model/__pycache__/tokenizer.cpython-39.pyc b/model/__pycache__/tokenizer.cpython-39.pyc deleted file mode 100644 index c1a8d58048c6364146decfe9c883d63dc197e359..0000000000000000000000000000000000000000 Binary files a/model/__pycache__/tokenizer.cpython-39.pyc and /dev/null differ diff --git a/model/components.py b/model/components.py deleted file mode 100644 index 2c8bc4e88484950988aaad4faf6d34ec1a4ec8bf..0000000000000000000000000000000000000000 --- a/model/components.py +++ /dev/null @@ -1,57 +0,0 @@ -import warnings -import torch -import torch.nn as nn - -try: - from apex.normalization import FusedRMSNorm as RMSNorm -except ImportError: - warnings.warn("Cannot import apex RMSNorm, switch to vanilla implementation") - - class RMSNorm(torch.nn.Module): - def __init__(self, dim: int, eps: float = 1e-6): - """ - Initialize the RMSNorm normalization layer. - - Args: - dim (int): The dimension of the input tensor. - eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6. - - Attributes: - eps (float): A small value added to the denominator for numerical stability. - weight (nn.Parameter): Learnable scaling parameter. - - """ - super().__init__() - self.eps = eps - self.weight = nn.Parameter(torch.ones(dim)) - - def _norm(self, x): - """ - Apply the RMSNorm normalization to the input tensor. - - Args: - x (torch.Tensor): The input tensor. - - Returns: - torch.Tensor: The normalized tensor. 
- - """ - return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) - - def forward(self, x): - """ - Forward pass through the RMSNorm layer. - - Args: - x (torch.Tensor): The input tensor. - - Returns: - torch.Tensor: The output tensor after applying RMSNorm. - - """ - output = self._norm(x.float()).type_as(x) - return output * self.weight - - - - diff --git a/model/lib/__init__.py b/model/lib/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/model/lib/__pycache__/__init__.cpython-39.pyc b/model/lib/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 5743997c6ae5ac42aa205caa87915ed0d774540d..0000000000000000000000000000000000000000 Binary files a/model/lib/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/model/lib/__pycache__/point_utils.cpython-310.pyc b/model/lib/__pycache__/point_utils.cpython-310.pyc deleted file mode 100644 index b52bf4169d4d84233f3178c745896d1fa395824f..0000000000000000000000000000000000000000 Binary files a/model/lib/__pycache__/point_utils.cpython-310.pyc and /dev/null differ diff --git a/model/lib/__pycache__/point_utils.cpython-39.pyc b/model/lib/__pycache__/point_utils.cpython-39.pyc deleted file mode 100644 index 98967937dbd55e3cf06bfdea9285b96b80993dd5..0000000000000000000000000000000000000000 Binary files a/model/lib/__pycache__/point_utils.cpython-39.pyc and /dev/null differ diff --git a/model/lib/point_utils.py b/model/lib/point_utils.py deleted file mode 100644 index 8928dc641099b889d5ad63c7f153681a3d2e4c5c..0000000000000000000000000000000000000000 --- a/model/lib/point_utils.py +++ /dev/null @@ -1,191 +0,0 @@ -import torch -import torch.nn as nn -from torch.autograd import Function -from . import pointnet2_cuda - -class KNN(nn.Module): - def __init__(self, neighbors, transpose_mode=True): - super(KNN, self).__init__() - self.neighbors = neighbors - - @torch.no_grad() - def forward(self, support, query): - """ - Args: - support ([tensor]): [B, N, C] - query ([tensor]): [B, M, C] - Returns: - [int]: neighbor idx. 
[B, M, K] - """ - dist = torch.cdist(support, query) - k_dist = dist.topk(k=self.neighbors, dim=1, largest=False) - return k_dist.values, k_dist.indices.transpose(1, 2).contiguous().int() - - -class GroupingOperation(Function): - - @staticmethod - @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32) - def forward(ctx, features: torch.Tensor, idx: torch.Tensor) -> torch.Tensor: - """ - :param ctx: - :param features: (B, C, N) tensor of features to group - :param idx: (B, npoint, nsample) tensor containing the indicies of features to group with - :return: - output: (B, C, npoint, nsample) tensor - """ - assert features.is_contiguous() - assert idx.is_contiguous() - - B, nfeatures, nsample = idx.size() - _, C, N = features.size() - output = torch.cuda.FloatTensor(B, C, nfeatures, nsample, device=features.device) - - pointnet2_cuda.group_points_wrapper(B, C, N, nfeatures, nsample, features, idx, output) - - ctx.for_backwards = (idx, N) - return output - - @staticmethod - def backward(ctx, grad_out: torch.Tensor): - """ - :param ctx: - :param grad_out: (B, C, npoint, nsample) tensor of the gradients of the output from forward - :return: - grad_features: (B, C, N) gradient of the features - """ - idx, N = ctx.for_backwards - - B, C, npoint, nsample = grad_out.size() - grad_features = torch.zeros([B, C, N], dtype=torch.float, device=grad_out.device, requires_grad=True) - grad_out_data = grad_out.data.contiguous() - pointnet2_cuda.group_points_grad_wrapper(B, C, N, npoint, nsample, grad_out_data, idx, grad_features.data) - return grad_features, None - -grouping_operation = GroupingOperation.apply - - -class KNNGroup(nn.Module): - def __init__(self, nsample: int, - relative_xyz=True, - normalize_dp=False, - return_only_idx=False, - **kwargs - ): - """[summary] - - Args: - nsample (int): maximum number of features to gather in the ball - use_xyz (bool, optional): concate xyz. Defaults to True. - ret_grouped_xyz (bool, optional): [description]. Defaults to False. - normalize_dp (bool, optional): [description]. Defaults to False. 
- """ - super().__init__() - self.nsample = nsample - self.knn = KNN(nsample, transpose_mode=True) - self.relative_xyz = relative_xyz - self.normalize_dp = normalize_dp - self.return_only_idx = return_only_idx - - def forward(self, query_xyz: torch.Tensor, support_xyz: torch.Tensor, features: torch.Tensor = None): - """ - :param query_xyz: (B, N, 3) xyz coordinates of the features - :param support_xyz: (B, npoint, 3) centroids - :param features: (B, C, N) descriptors of the features - :return: - new_features: (B, 3 + C, npoint, nsample) - """ - _, idx = self.knn(support_xyz, query_xyz) - if self.return_only_idx: - return idx - idx = idx.int() - xyz_trans = support_xyz.transpose(1, 2).contiguous() - grouped_xyz = grouping_operation(xyz_trans, idx) # (B, 3, npoint, nsample) - if self.relative_xyz: - grouped_xyz -= query_xyz.transpose(1, 2).unsqueeze(-1) # relative position - if self.normalize_dp: - grouped_xyz /= torch.amax(torch.sqrt(torch.sum(grouped_xyz**2, dim=1)), dim=(1, 2)).view(-1, 1, 1, 1) - if features is not None: - grouped_features = grouping_operation(features, idx) - return grouped_xyz, grouped_features - else: - return grouped_xyz, None - - -class FurthestPointSampling(Function): - @staticmethod - def forward(ctx, xyz: torch.Tensor, npoint: int) -> torch.Tensor: - """ - Uses iterative furthest point sampling to select a set of npoint features that have the largest - minimum distance - :param ctx: - :param xyz: (B, N, 3) where N > npoint - :param npoint: int, number of features in the sampled set - :return: - output: (B, npoint) tensor containing the set (idx) - """ - assert xyz.is_contiguous() - - B, N, _ = xyz.size() - # output = torch.cuda.IntTensor(B, npoint, device=xyz.device) - # temp = torch.cuda.FloatTensor(B, N, device=xyz.device).fill_(1e10) - output = torch.cuda.IntTensor(B, npoint) - temp = torch.cuda.FloatTensor(B, N).fill_(1e10) - - pointnet2_cuda.furthest_point_sampling_wrapper( - B, N, npoint, xyz, temp, output) - return output - - @staticmethod - def backward(xyz, a=None): - return None, None - -furthest_point_sample = FurthestPointSampling.apply - - -class PointPatchEmbed(nn.Module): - - def __init__(self, - sample_ratio=0.0625, - sample_number=1024, - group_size=32, - in_channels=6, - channels=1024, - kernel_size=1, - stride=1, - normalize_dp=False, - relative_xyz=True, - ): - super().__init__() - self.sample_ratio = sample_ratio - self.sample_number = sample_number - self.group_size = group_size - - self.sample_fn = furthest_point_sample - self.grouper = KNNGroup(self.group_size, relative_xyz=relative_xyz, normalize_dp=normalize_dp) - - self.conv1 = nn.Conv2d(in_channels, channels, kernel_size=kernel_size, stride=stride) - - - def forward(self, x): - # coordinates - p = x[:, :, 3:].contiguous() - - B, N, _ = p.shape[:3] - # idx = self.sample_fn(p, int(N * self.sample_ratio)).long() - idx = self.sample_fn(p, self.sample_number).long() - center_p = torch.gather(p, 1, idx.unsqueeze(-1).expand(-1, -1, 3)) - # query neighbors. 
- _, fj = self.grouper(center_p, p, x.permute(0, 2, 1).contiguous()) # [B, N, 6] -> [B, 6, N] -> [B, 6, 1024, 32] - - # [B, 6, 1024] -> [B, channels, 1024, 1] - fj = self.conv1(fj).max(dim=-1, keepdim=True)[0] - - return fj - - -if __name__ == '__main__': - model = PointPatchEmbed(channels=256).cuda() - input = torch.rand(4, 16384, 6).cuda() - ou = model(input) - import pdb;pdb.set_trace() \ No newline at end of file diff --git a/model/lib/pointnet2/.gitignore b/model/lib/pointnet2/.gitignore deleted file mode 100644 index 867434b8dc0adec1ae8e8ba1c757e8c9d1a179cb..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -build -dist -*.egg-info \ No newline at end of file diff --git a/model/lib/pointnet2/build/lib.linux-x86_64-cpython-39/pointnet2_cuda.cpython-39-x86_64-linux-gnu.so b/model/lib/pointnet2/build/lib.linux-x86_64-cpython-39/pointnet2_cuda.cpython-39-x86_64-linux-gnu.so deleted file mode 100644 index 560e153c8189128d5cd3034c8ffdd218592d169d..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/build/lib.linux-x86_64-cpython-39/pointnet2_cuda.cpython-39-x86_64-linux-gnu.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4fe149eddea375d371bc331b11073003e0586254a25c6fe3769b2f6febf8bc54 -size 19727240 diff --git a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/.ninja_deps b/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/.ninja_deps deleted file mode 100644 index 23bf9cfd32dc02f2f8d7b93e9f1a700f1757e16a..0000000000000000000000000000000000000000 Binary files a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/.ninja_deps and /dev/null differ diff --git a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/.ninja_log b/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/.ninja_log deleted file mode 100644 index 0a3b9c5eb002e1ee2f92daf19926d37227fc1b14..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/.ninja_log +++ /dev/null @@ -1,10 +0,0 @@ -# ninja log v5 -3 13814 1701773085497639679 /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/sampling.o df8f8e58c0087c9c -1 13838 1701773085525639600 /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/group_points.o 6b17362810207ae9 -2 13925 1701773085613639348 /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/interpolate.o 79c442400845cc89 -1 14511 1701773086197637684 /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/ball_query.o 79a41849bd99bac5 -1 27453 1701773099133600702 /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/ball_query_gpu.o c5cdc62e1b68c75a -2 27764 1701773099449599796 /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/group_points_gpu.o 8692792652d15ef5 -2 27829 1701773099513599612 /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/interpolate_gpu.o 18b2a84b93ded4ae -3 28276 1701773099961598327 /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/sampling_gpu.o f8c8e1e67e2b7e7f -2 30587 1701773102261591724 /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/pointnet2_api.o 
d21e0eacc4130d8e diff --git a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/build.ninja b/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/build.ninja deleted file mode 100644 index 33b0fa2dae420b284a0a3d17f1230d2383b3d1a0..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/build.ninja +++ /dev/null @@ -1,41 +0,0 @@ -ninja_required_version = 1.3 -cxx = c++ -nvcc = /usr/local/cuda-11.7/bin/nvcc - -cflags = -pthread -B /usr/local/anaconda3/envs/onellm/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /usr/local/anaconda3/envs/onellm/include -I/usr/local/anaconda3/envs/onellm/include -fPIC -O2 -isystem /usr/local/anaconda3/envs/onellm/include -fPIC -I/usr/local/anaconda3/envs/onellm/lib/python3.9/site-packages/torch/include -I/usr/local/anaconda3/envs/onellm/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/usr/local/anaconda3/envs/onellm/lib/python3.9/site-packages/torch/include/TH -I/usr/local/anaconda3/envs/onellm/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda-11.7/include -I/usr/local/anaconda3/envs/onellm/include/python3.9 -c -post_cflags = -g -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=pointnet2_cuda -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++17 -cuda_cflags = -I/usr/local/anaconda3/envs/onellm/lib/python3.9/site-packages/torch/include -I/usr/local/anaconda3/envs/onellm/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/usr/local/anaconda3/envs/onellm/lib/python3.9/site-packages/torch/include/TH -I/usr/local/anaconda3/envs/onellm/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda-11.7/include -I/usr/local/anaconda3/envs/onellm/include/python3.9 -c -cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -O2 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=pointnet2_cuda -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 -std=c++17 -cuda_dlink_post_cflags = -ldflags = - -rule compile - command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags - depfile = $out.d - deps = gcc - -rule cuda_compile - depfile = $out.d - deps = gcc - command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags - - - - - -build /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/ball_query.o: compile /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/src/ball_query.cpp -build /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/ball_query_gpu.o: cuda_compile /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/src/ball_query_gpu.cu -build /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/group_points.o: compile /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/src/group_points.cpp -build /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/group_points_gpu.o: cuda_compile /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/src/group_points_gpu.cu -build 
/data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/interpolate.o: compile /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/src/interpolate.cpp -build /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/interpolate_gpu.o: cuda_compile /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/src/interpolate_gpu.cu -build /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/pointnet2_api.o: compile /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/src/pointnet2_api.cpp -build /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/sampling.o: compile /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/src/sampling.cpp -build /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/sampling_gpu.o: cuda_compile /data1/jiaming/OneLLM-Inference-huggingface/model/lib/pointnet2/src/sampling_gpu.cu - - - - - - - diff --git a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/ball_query.o b/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/ball_query.o deleted file mode 100644 index 98cc431b50967470beeb958b3a91fbe5f8fb7a28..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/ball_query.o +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec9b31781d1db333cff0098b77791f56891c7cae5d87648f2612f11a1f1a9864 -size 7337552 diff --git a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/ball_query_gpu.o b/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/ball_query_gpu.o deleted file mode 100644 index fd27102cd859b508785cec38e5e59517f69fed82..0000000000000000000000000000000000000000 Binary files a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/ball_query_gpu.o and /dev/null differ diff --git a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/group_points.o b/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/group_points.o deleted file mode 100644 index bbdc1e811d967ad589d8f0cea8405a423966fbd2..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/group_points.o +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5d9e48d335d2d36d29b3c375db543c52aaa9bd5bb7fdfce0bd4297ba4d5fe6fb -size 6912904 diff --git a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/group_points_gpu.o b/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/group_points_gpu.o deleted file mode 100644 index 3fc7304cdc83ed830dcf262e0e965355929549ed..0000000000000000000000000000000000000000 Binary files a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/group_points_gpu.o and /dev/null differ diff --git a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/interpolate.o b/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/interpolate.o deleted file mode 100644 index 072673655e1b4aa98f4685ea7ccc6e442f65c2b5..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/interpolate.o +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8b4d725783120b312d13f4b06a66707c565a10ac6974f77936047bd10cd48e4c -size 6921176 diff --git a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/interpolate_gpu.o 
b/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/interpolate_gpu.o deleted file mode 100644 index 64a4e42fff8b532e4a081b42767e8fda2a17adb8..0000000000000000000000000000000000000000 Binary files a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/interpolate_gpu.o and /dev/null differ diff --git a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/pointnet2_api.o b/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/pointnet2_api.o deleted file mode 100644 index 35ca46a8561e5f18039129326b13e3242c61422f..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/pointnet2_api.o +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:40c56336ad9fab872c6bdb28a197a37b2fb07881688b6ab496f9f6871ae11d12 -size 20280040 diff --git a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/sampling.o b/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/sampling.o deleted file mode 100644 index 7c8e9bcf45651724c9db6da97bdb820d422ddff4..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/sampling.o +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f531d5ca53aebfeb1aba74c3dd71cecc90a0d87c0d63f5358cfd5c5f8ebbe567 -size 6917856 diff --git a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/sampling_gpu.o b/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/sampling_gpu.o deleted file mode 100644 index f01f732437cecdf37161a4b28e5e28fd5baec30d..0000000000000000000000000000000000000000 Binary files a/model/lib/pointnet2/build/temp.linux-x86_64-cpython-39/src/sampling_gpu.o and /dev/null differ diff --git a/model/lib/pointnet2/dist/pointnet2-0.0.0-py3.9-linux-x86_64.egg b/model/lib/pointnet2/dist/pointnet2-0.0.0-py3.9-linux-x86_64.egg deleted file mode 100644 index 0e42557616254366f9321f471960e2ce59a9fea4..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/dist/pointnet2-0.0.0-py3.9-linux-x86_64.egg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8a192c711c480c616d7f5f51f8202c3cba529a3135792d83f5b3d353ca59d383 -size 6166015 diff --git a/model/lib/pointnet2/pointnet2.egg-info/PKG-INFO b/model/lib/pointnet2/pointnet2.egg-info/PKG-INFO deleted file mode 100644 index c44293899fb386f21d7a5c12d93d5cfbfb733e65..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/pointnet2.egg-info/PKG-INFO +++ /dev/null @@ -1,3 +0,0 @@ -Metadata-Version: 2.1 -Name: pointnet2 -Version: 0.0.0 diff --git a/model/lib/pointnet2/pointnet2.egg-info/SOURCES.txt b/model/lib/pointnet2/pointnet2.egg-info/SOURCES.txt deleted file mode 100644 index 21253f64ce882771ed3a8dd2feb25cd37cc26259..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/pointnet2.egg-info/SOURCES.txt +++ /dev/null @@ -1,14 +0,0 @@ -setup.py -pointnet2.egg-info/PKG-INFO -pointnet2.egg-info/SOURCES.txt -pointnet2.egg-info/dependency_links.txt -pointnet2.egg-info/top_level.txt -src/ball_query.cpp -src/ball_query_gpu.cu -src/group_points.cpp -src/group_points_gpu.cu -src/interpolate.cpp -src/interpolate_gpu.cu -src/pointnet2_api.cpp -src/sampling.cpp -src/sampling_gpu.cu \ No newline at end of file diff --git a/model/lib/pointnet2/pointnet2.egg-info/dependency_links.txt b/model/lib/pointnet2/pointnet2.egg-info/dependency_links.txt deleted file mode 100644 index 8b137891791fe96927ad78e64b0aad7bded08bdc..0000000000000000000000000000000000000000 --- 
a/model/lib/pointnet2/pointnet2.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/model/lib/pointnet2/pointnet2.egg-info/top_level.txt b/model/lib/pointnet2/pointnet2.egg-info/top_level.txt deleted file mode 100644 index 2d59a59591ec1d9290fd49300f0b42015b991a16..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/pointnet2.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -pointnet2_cuda diff --git a/model/lib/pointnet2/pointnet2_modules.py b/model/lib/pointnet2/pointnet2_modules.py deleted file mode 100644 index 5f125ce5075c738897e5f6a78c71123d0e3e44a2..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/pointnet2_modules.py +++ /dev/null @@ -1,160 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - -from . import pointnet2_utils -from . import pytorch_utils as pt_utils -from typing import List - - -class _PointnetSAModuleBase(nn.Module): - - def __init__(self): - super().__init__() - self.npoint = None - self.groupers = None - self.mlps = None - self.pool_method = 'max_pool' - - def forward(self, xyz: torch.Tensor, features: torch.Tensor = None, new_xyz=None) -> (torch.Tensor, torch.Tensor): - """ - :param xyz: (B, N, 3) tensor of the xyz coordinates of the features - :param features: (B, N, C) tensor of the descriptors of the the features - :param new_xyz: - :return: - new_xyz: (B, npoint, 3) tensor of the new features' xyz - new_features: (B, npoint, \sum_k(mlps[k][-1])) tensor of the new_features descriptors - """ - new_features_list = [] - - xyz_flipped = xyz.transpose(1, 2).contiguous() - if new_xyz is None: - new_xyz = pointnet2_utils.gather_operation( - xyz_flipped, - pointnet2_utils.furthest_point_sample(xyz, self.npoint) - ).transpose(1, 2).contiguous() if self.npoint is not None else None - - for i in range(len(self.groupers)): - new_features = self.groupers[i](xyz, new_xyz, features) # (B, C, npoint, nsample) - - new_features = self.mlps[i](new_features) # (B, mlp[-1], npoint, nsample) - if self.pool_method == 'max_pool': - new_features = F.max_pool2d( - new_features, kernel_size=[1, new_features.size(3)] - ) # (B, mlp[-1], npoint, 1) - elif self.pool_method == 'avg_pool': - new_features = F.avg_pool2d( - new_features, kernel_size=[1, new_features.size(3)] - ) # (B, mlp[-1], npoint, 1) - else: - raise NotImplementedError - - new_features = new_features.squeeze(-1) # (B, mlp[-1], npoint) - new_features_list.append(new_features) - - return new_xyz, torch.cat(new_features_list, dim=1) - - -class PointnetSAModuleMSG(_PointnetSAModuleBase): - """Pointnet set abstraction layer with multiscale grouping""" - - def __init__(self, *, npoint: int, radii: List[float], nsamples: List[int], mlps: List[List[int]], bn: bool = True, - use_xyz: bool = True, pool_method='max_pool', instance_norm=False): - """ - :param npoint: int - :param radii: list of float, list of radii to group with - :param nsamples: list of int, number of samples in each ball query - :param mlps: list of list of int, spec of the pointnet before the global pooling for each scale - :param bn: whether to use batchnorm - :param use_xyz: - :param pool_method: max_pool / avg_pool - :param instance_norm: whether to use instance_norm - """ - super().__init__() - - assert len(radii) == len(nsamples) == len(mlps) - - self.npoint = npoint - self.groupers = nn.ModuleList() - self.mlps = nn.ModuleList() - for i in range(len(radii)): - radius = radii[i] - nsample = nsamples[i] - self.groupers.append( - pointnet2_utils.QueryAndGroup(radius, nsample, 
use_xyz=use_xyz) - if npoint is not None else pointnet2_utils.GroupAll(use_xyz) - ) - mlp_spec = mlps[i] - if use_xyz: - mlp_spec[0] += 3 - - self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn, instance_norm=instance_norm)) - self.pool_method = pool_method - - -class PointnetSAModule(PointnetSAModuleMSG): - """Pointnet set abstraction layer""" - - def __init__(self, *, mlp: List[int], npoint: int = None, radius: float = None, nsample: int = None, - bn: bool = True, use_xyz: bool = True, pool_method='max_pool', instance_norm=False): - """ - :param mlp: list of int, spec of the pointnet before the global max_pool - :param npoint: int, number of features - :param radius: float, radius of ball - :param nsample: int, number of samples in the ball query - :param bn: whether to use batchnorm - :param use_xyz: - :param pool_method: max_pool / avg_pool - :param instance_norm: whether to use instance_norm - """ - super().__init__( - mlps=[mlp], npoint=npoint, radii=[radius], nsamples=[nsample], bn=bn, use_xyz=use_xyz, - pool_method=pool_method, instance_norm=instance_norm - ) - - -class PointnetFPModule(nn.Module): - r"""Propigates the features of one set to another""" - - def __init__(self, *, mlp: List[int], bn: bool = True): - """ - :param mlp: list of int - :param bn: whether to use batchnorm - """ - super().__init__() - self.mlp = pt_utils.SharedMLP(mlp, bn=bn) - - def forward( - self, unknown: torch.Tensor, known: torch.Tensor, unknow_feats: torch.Tensor, known_feats: torch.Tensor - ) -> torch.Tensor: - """ - :param unknown: (B, n, 3) tensor of the xyz positions of the unknown features - :param known: (B, m, 3) tensor of the xyz positions of the known features - :param unknow_feats: (B, C1, n) tensor of the features to be propigated to - :param known_feats: (B, C2, m) tensor of features to be propigated - :return: - new_features: (B, mlp[-1], n) tensor of the features of the unknown features - """ - if known is not None: - dist, idx = pointnet2_utils.three_nn(unknown, known) - dist_recip = 1.0 / (dist + 1e-8) - norm = torch.sum(dist_recip, dim=2, keepdim=True) - weight = dist_recip / norm - - interpolated_feats = pointnet2_utils.three_interpolate(known_feats, idx, weight) - else: - interpolated_feats = known_feats.expand(*known_feats.size()[0:2], unknown.size(1)) - - if unknow_feats is not None: - new_features = torch.cat([interpolated_feats, unknow_feats], dim=1) # (B, C2 + C1, n) - else: - new_features = interpolated_feats - - new_features = new_features.unsqueeze(-1) - new_features = self.mlp(new_features) - - return new_features.squeeze(-1) - - -if __name__ == "__main__": - pass diff --git a/model/lib/pointnet2/pointnet2_utils.py b/model/lib/pointnet2/pointnet2_utils.py deleted file mode 100644 index e814102d8feb5e443e64a736e7733818e0a24685..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/pointnet2_utils.py +++ /dev/null @@ -1,290 +0,0 @@ -import torch -from torch.autograd import Variable -from torch.autograd import Function -import torch.nn as nn -from typing import Tuple - -import pointnet2_cuda as pointnet2 - - -class FurthestPointSampling(Function): - @staticmethod - def forward(ctx, xyz: torch.Tensor, npoint: int) -> torch.Tensor: - """ - Uses iterative furthest point sampling to select a set of npoint features that have the largest - minimum distance - :param ctx: - :param xyz: (B, N, 3) where N > npoint - :param npoint: int, number of features in the sampled set - :return: - output: (B, npoint) tensor containing the set - """ - assert xyz.is_contiguous() - - B, 
N, _ = xyz.size() - output = torch.cuda.IntTensor(B, npoint) - temp = torch.cuda.FloatTensor(B, N).fill_(1e10) - - pointnet2.furthest_point_sampling_wrapper(B, N, npoint, xyz, temp, output) - return output - - @staticmethod - def backward(xyz, a=None): - return None, None - - -furthest_point_sample = FurthestPointSampling.apply - - -class GatherOperation(Function): - - @staticmethod - def forward(ctx, features: torch.Tensor, idx: torch.Tensor) -> torch.Tensor: - """ - :param ctx: - :param features: (B, C, N) - :param idx: (B, npoint) index tensor of the features to gather - :return: - output: (B, C, npoint) - """ - assert features.is_contiguous() - assert idx.is_contiguous() - - B, npoint = idx.size() - _, C, N = features.size() - output = torch.cuda.FloatTensor(B, C, npoint) - - pointnet2.gather_points_wrapper(B, C, N, npoint, features, idx, output) - - ctx.for_backwards = (idx, C, N) - return output - - @staticmethod - def backward(ctx, grad_out): - idx, C, N = ctx.for_backwards - B, npoint = idx.size() - - grad_features = Variable(torch.cuda.FloatTensor(B, C, N).zero_()) - grad_out_data = grad_out.data.contiguous() - pointnet2.gather_points_grad_wrapper(B, C, N, npoint, grad_out_data, idx, grad_features.data) - return grad_features, None - - -gather_operation = GatherOperation.apply - - -class ThreeNN(Function): - - @staticmethod - def forward(ctx, unknown: torch.Tensor, known: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Find the three nearest neighbors of unknown in known - :param ctx: - :param unknown: (B, N, 3) - :param known: (B, M, 3) - :return: - dist: (B, N, 3) l2 distance to the three nearest neighbors - idx: (B, N, 3) index of 3 nearest neighbors - """ - assert unknown.is_contiguous() - assert known.is_contiguous() - - B, N, _ = unknown.size() - m = known.size(1) - dist2 = torch.cuda.FloatTensor(B, N, 3) - idx = torch.cuda.IntTensor(B, N, 3) - - pointnet2.three_nn_wrapper(B, N, m, unknown, known, dist2, idx) - return torch.sqrt(dist2), idx - - @staticmethod - def backward(ctx, a=None, b=None): - return None, None - - -three_nn = ThreeNN.apply - - -class ThreeInterpolate(Function): - - @staticmethod - def forward(ctx, features: torch.Tensor, idx: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: - """ - Performs weight linear interpolation on 3 features - :param ctx: - :param features: (B, C, M) Features descriptors to be interpolated from - :param idx: (B, n, 3) three nearest neighbors of the target features in features - :param weight: (B, n, 3) weights - :return: - output: (B, C, N) tensor of the interpolated features - """ - assert features.is_contiguous() - assert idx.is_contiguous() - assert weight.is_contiguous() - - B, c, m = features.size() - n = idx.size(1) - ctx.three_interpolate_for_backward = (idx, weight, m) - output = torch.cuda.FloatTensor(B, c, n) - - pointnet2.three_interpolate_wrapper(B, c, m, n, features, idx, weight, output) - return output - - @staticmethod - def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ - :param ctx: - :param grad_out: (B, C, N) tensor with gradients of outputs - :return: - grad_features: (B, C, M) tensor with gradients of features - None: - None: - """ - idx, weight, m = ctx.three_interpolate_for_backward - B, c, n = grad_out.size() - - grad_features = Variable(torch.cuda.FloatTensor(B, c, m).zero_()) - grad_out_data = grad_out.data.contiguous() - - pointnet2.three_interpolate_grad_wrapper(B, c, n, m, grad_out_data, idx, weight, grad_features.data) - return 
grad_features, None, None - - -three_interpolate = ThreeInterpolate.apply - - -class GroupingOperation(Function): - - @staticmethod - def forward(ctx, features: torch.Tensor, idx: torch.Tensor) -> torch.Tensor: - """ - :param ctx: - :param features: (B, C, N) tensor of features to group - :param idx: (B, npoint, nsample) tensor containing the indicies of features to group with - :return: - output: (B, C, npoint, nsample) tensor - """ - assert features.is_contiguous() - assert idx.is_contiguous() - - B, nfeatures, nsample = idx.size() - _, C, N = features.size() - output = torch.cuda.FloatTensor(B, C, nfeatures, nsample) - - pointnet2.group_points_wrapper(B, C, N, nfeatures, nsample, features, idx, output) - - ctx.for_backwards = (idx, N) - return output - - @staticmethod - def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """ - :param ctx: - :param grad_out: (B, C, npoint, nsample) tensor of the gradients of the output from forward - :return: - grad_features: (B, C, N) gradient of the features - """ - idx, N = ctx.for_backwards - - B, C, npoint, nsample = grad_out.size() - grad_features = Variable(torch.cuda.FloatTensor(B, C, N).zero_()) - - grad_out_data = grad_out.data.contiguous() - pointnet2.group_points_grad_wrapper(B, C, N, npoint, nsample, grad_out_data, idx, grad_features.data) - return grad_features, None - - -grouping_operation = GroupingOperation.apply - - -class BallQuery(Function): - - @staticmethod - def forward(ctx, radius: float, nsample: int, xyz: torch.Tensor, new_xyz: torch.Tensor) -> torch.Tensor: - """ - :param ctx: - :param radius: float, radius of the balls - :param nsample: int, maximum number of features in the balls - :param xyz: (B, N, 3) xyz coordinates of the features - :param new_xyz: (B, npoint, 3) centers of the ball query - :return: - idx: (B, npoint, nsample) tensor with the indicies of the features that form the query balls - """ - assert new_xyz.is_contiguous() - assert xyz.is_contiguous() - - B, N, _ = xyz.size() - npoint = new_xyz.size(1) - idx = torch.cuda.IntTensor(B, npoint, nsample).zero_() - - pointnet2.ball_query_wrapper(B, N, npoint, radius, nsample, new_xyz, xyz, idx) - return idx - - @staticmethod - def backward(ctx, a=None): - return None, None, None, None - - -ball_query = BallQuery.apply - - -class QueryAndGroup(nn.Module): - def __init__(self, radius: float, nsample: int, use_xyz: bool = True): - """ - :param radius: float, radius of ball - :param nsample: int, maximum number of features to gather in the ball - :param use_xyz: - """ - super().__init__() - self.radius, self.nsample, self.use_xyz = radius, nsample, use_xyz - - def forward(self, xyz: torch.Tensor, new_xyz: torch.Tensor, features: torch.Tensor = None) -> Tuple[torch.Tensor]: - """ - :param xyz: (B, N, 3) xyz coordinates of the features - :param new_xyz: (B, npoint, 3) centroids - :param features: (B, C, N) descriptors of the features - :return: - new_features: (B, 3 + C, npoint, nsample) - """ - idx = ball_query(self.radius, self.nsample, xyz, new_xyz) - xyz_trans = xyz.transpose(1, 2).contiguous() - grouped_xyz = grouping_operation(xyz_trans, idx) # (B, 3, npoint, nsample) - grouped_xyz -= new_xyz.transpose(1, 2).unsqueeze(-1) - - if features is not None: - grouped_features = grouping_operation(features, idx) - if self.use_xyz: - new_features = torch.cat([grouped_xyz, grouped_features], dim=1) # (B, C + 3, npoint, nsample) - else: - new_features = grouped_features - else: - assert self.use_xyz, "Cannot have not features and not use xyz as a 
feature!" - new_features = grouped_xyz - - return new_features - - -class GroupAll(nn.Module): - def __init__(self, use_xyz: bool = True): - super().__init__() - self.use_xyz = use_xyz - - def forward(self, xyz: torch.Tensor, new_xyz: torch.Tensor, features: torch.Tensor = None): - """ - :param xyz: (B, N, 3) xyz coordinates of the features - :param new_xyz: ignored - :param features: (B, C, N) descriptors of the features - :return: - new_features: (B, C + 3, 1, N) - """ - grouped_xyz = xyz.transpose(1, 2).unsqueeze(2) - if features is not None: - grouped_features = features.unsqueeze(2) - if self.use_xyz: - new_features = torch.cat([grouped_xyz, grouped_features], dim=1) # (B, 3 + C, 1, N) - else: - new_features = grouped_features - else: - new_features = grouped_xyz - - return new_features diff --git a/model/lib/pointnet2/pytorch_utils.py b/model/lib/pointnet2/pytorch_utils.py deleted file mode 100644 index 09cb7bc76d88dde5757ac70b6e05e1e0c768cc1b..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/pytorch_utils.py +++ /dev/null @@ -1,236 +0,0 @@ -import torch.nn as nn -from typing import List, Tuple - - -class SharedMLP(nn.Sequential): - - def __init__( - self, - args: List[int], - *, - bn: bool = False, - activation=nn.ReLU(inplace=True), - preact: bool = False, - first: bool = False, - name: str = "", - instance_norm: bool = False, - ): - super().__init__() - - for i in range(len(args) - 1): - self.add_module( - name + 'layer{}'.format(i), - Conv2d( - args[i], - args[i + 1], - bn=(not first or not preact or (i != 0)) and bn, - activation=activation - if (not first or not preact or (i != 0)) else None, - preact=preact, - instance_norm=instance_norm - ) - ) - - -class _ConvBase(nn.Sequential): - - def __init__( - self, - in_size, - out_size, - kernel_size, - stride, - padding, - activation, - bn, - init, - conv=None, - batch_norm=None, - bias=True, - preact=False, - name="", - instance_norm=False, - instance_norm_func=None - ): - super().__init__() - - bias = bias and (not bn) - conv_unit = conv( - in_size, - out_size, - kernel_size=kernel_size, - stride=stride, - padding=padding, - bias=bias - ) - init(conv_unit.weight) - if bias: - nn.init.constant_(conv_unit.bias, 0) - - if bn: - if not preact: - bn_unit = batch_norm(out_size) - else: - bn_unit = batch_norm(in_size) - if instance_norm: - if not preact: - in_unit = instance_norm_func(out_size, affine=False, track_running_stats=False) - else: - in_unit = instance_norm_func(in_size, affine=False, track_running_stats=False) - - if preact: - if bn: - self.add_module(name + 'bn', bn_unit) - - if activation is not None: - self.add_module(name + 'activation', activation) - - if not bn and instance_norm: - self.add_module(name + 'in', in_unit) - - self.add_module(name + 'conv', conv_unit) - - if not preact: - if bn: - self.add_module(name + 'bn', bn_unit) - - if activation is not None: - self.add_module(name + 'activation', activation) - - if not bn and instance_norm: - self.add_module(name + 'in', in_unit) - - -class _BNBase(nn.Sequential): - - def __init__(self, in_size, batch_norm=None, name=""): - super().__init__() - self.add_module(name + "bn", batch_norm(in_size)) - - nn.init.constant_(self[0].weight, 1.0) - nn.init.constant_(self[0].bias, 0) - - -class BatchNorm1d(_BNBase): - - def __init__(self, in_size: int, *, name: str = ""): - super().__init__(in_size, batch_norm=nn.BatchNorm1d, name=name) - - -class BatchNorm2d(_BNBase): - - def __init__(self, in_size: int, name: str = ""): - super().__init__(in_size, 
batch_norm=nn.BatchNorm2d, name=name) - - -class Conv1d(_ConvBase): - - def __init__( - self, - in_size: int, - out_size: int, - *, - kernel_size: int = 1, - stride: int = 1, - padding: int = 0, - activation=nn.ReLU(inplace=True), - bn: bool = False, - init=nn.init.kaiming_normal_, - bias: bool = True, - preact: bool = False, - name: str = "", - instance_norm=False - ): - super().__init__( - in_size, - out_size, - kernel_size, - stride, - padding, - activation, - bn, - init, - conv=nn.Conv1d, - batch_norm=BatchNorm1d, - bias=bias, - preact=preact, - name=name, - instance_norm=instance_norm, - instance_norm_func=nn.InstanceNorm1d - ) - - -class Conv2d(_ConvBase): - - def __init__( - self, - in_size: int, - out_size: int, - *, - kernel_size: Tuple[int, int] = (1, 1), - stride: Tuple[int, int] = (1, 1), - padding: Tuple[int, int] = (0, 0), - activation=nn.ReLU(inplace=True), - bn: bool = False, - init=nn.init.kaiming_normal_, - bias: bool = True, - preact: bool = False, - name: str = "", - instance_norm=False - ): - super().__init__( - in_size, - out_size, - kernel_size, - stride, - padding, - activation, - bn, - init, - conv=nn.Conv2d, - batch_norm=BatchNorm2d, - bias=bias, - preact=preact, - name=name, - instance_norm=instance_norm, - instance_norm_func=nn.InstanceNorm2d - ) - - -class FC(nn.Sequential): - - def __init__( - self, - in_size: int, - out_size: int, - *, - activation=nn.ReLU(inplace=True), - bn: bool = False, - init=None, - preact: bool = False, - name: str = "" - ): - super().__init__() - - fc = nn.Linear(in_size, out_size, bias=not bn) - if init is not None: - init(fc.weight) - if not bn: - nn.init.constant(fc.bias, 0) - - if preact: - if bn: - self.add_module(name + 'bn', BatchNorm1d(in_size)) - - if activation is not None: - self.add_module(name + 'activation', activation) - - self.add_module(name + 'fc', fc) - - if not preact: - if bn: - self.add_module(name + 'bn', BatchNorm1d(out_size)) - - if activation is not None: - self.add_module(name + 'activation', activation) - diff --git a/model/lib/pointnet2/setup.py b/model/lib/pointnet2/setup.py deleted file mode 100644 index 99e59e37b90517cc38c35d100f7f9cee0e309368..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/setup.py +++ /dev/null @@ -1,23 +0,0 @@ -from setuptools import setup -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - -setup( - name='pointnet2', - ext_modules=[ - CUDAExtension('pointnet2_cuda', [ - 'src/pointnet2_api.cpp', - - 'src/ball_query.cpp', - 'src/ball_query_gpu.cu', - 'src/group_points.cpp', - 'src/group_points_gpu.cu', - 'src/interpolate.cpp', - 'src/interpolate_gpu.cu', - 'src/sampling.cpp', - 'src/sampling_gpu.cu', - ], - extra_compile_args={'cxx': ['-g'], - 'nvcc': ['-O2']}) - ], - cmdclass={'build_ext': BuildExtension} -) diff --git a/model/lib/pointnet2/src/ball_query.cpp b/model/lib/pointnet2/src/ball_query.cpp deleted file mode 100644 index c9b176e5da5dd89a3378652f0b806925e8ee8996..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/src/ball_query.cpp +++ /dev/null @@ -1,24 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "ball_query_gpu.h" - -#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") -#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") -#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x) - -int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample, - at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor 
idx_tensor) { - CHECK_INPUT(new_xyz_tensor); - CHECK_INPUT(xyz_tensor); - const float *new_xyz = new_xyz_tensor.data(); - const float *xyz = xyz_tensor.data(); - int *idx = idx_tensor.data(); - - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - ball_query_kernel_launcher_fast(b, n, m, radius, nsample, new_xyz, xyz, idx, stream); - return 1; -} diff --git a/model/lib/pointnet2/src/ball_query_gpu.cu b/model/lib/pointnet2/src/ball_query_gpu.cu deleted file mode 100644 index f8840aa6650693cea17d337008a15fef13ec1ebc..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/src/ball_query_gpu.cu +++ /dev/null @@ -1,67 +0,0 @@ -#include -#include -#include - -#include "ball_query_gpu.h" -#include "cuda_utils.h" - - -__global__ void ball_query_kernel_fast(int b, int n, int m, float radius, int nsample, - const float *__restrict__ new_xyz, const float *__restrict__ xyz, int *__restrict__ idx) { - // new_xyz: (B, M, 3) - // xyz: (B, N, 3) - // output: - // idx: (B, M, nsample) - int bs_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (bs_idx >= b || pt_idx >= m) return; - - new_xyz += bs_idx * m * 3 + pt_idx * 3; - xyz += bs_idx * n * 3; - idx += bs_idx * m * nsample + pt_idx * nsample; - - float radius2 = radius * radius; - float new_x = new_xyz[0]; - float new_y = new_xyz[1]; - float new_z = new_xyz[2]; - - int cnt = 0; - for (int k = 0; k < n; ++k) { - float x = xyz[k * 3 + 0]; - float y = xyz[k * 3 + 1]; - float z = xyz[k * 3 + 2]; - float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); - if (d2 < radius2){ - if (cnt == 0){ - for (int l = 0; l < nsample; ++l) { - idx[l] = k; - } - } - idx[cnt] = k; - ++cnt; - if (cnt >= nsample) break; - } - } -} - - -void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample, \ - const float *new_xyz, const float *xyz, int *idx, cudaStream_t stream) { - // new_xyz: (B, M, 3) - // xyz: (B, N, 3) - // output: - // idx: (B, M, nsample) - - cudaError_t err; - - dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) - dim3 threads(THREADS_PER_BLOCK); - - ball_query_kernel_fast<<>>(b, n, m, radius, nsample, new_xyz, xyz, idx); - // cudaDeviceSynchronize(); // for using printf in kernel function - err = cudaGetLastError(); - if (cudaSuccess != err) { - fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); - exit(-1); - } -} \ No newline at end of file diff --git a/model/lib/pointnet2/src/ball_query_gpu.h b/model/lib/pointnet2/src/ball_query_gpu.h deleted file mode 100644 index ffc831a8b700f46b50e0b90d49c538aa0fedca50..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/src/ball_query_gpu.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _BALL_QUERY_GPU_H -#define _BALL_QUERY_GPU_H - -#include -#include -#include -#include - -int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample, - at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor); - -void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample, - const float *xyz, const float *new_xyz, int *idx, cudaStream_t stream); - -#endif diff --git a/model/lib/pointnet2/src/cuda_utils.h b/model/lib/pointnet2/src/cuda_utils.h deleted file mode 100644 index 7fe27969179c976a88199bbe962ca4f8d97263a4..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/src/cuda_utils.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _CUDA_UTILS_H -#define _CUDA_UTILS_H - -#include - -#define TOTAL_THREADS 1024 
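For readers without the compiled pointnet2 extension, here is a minimal pure-PyTorch sketch of the radius neighborhood search that the deleted ball_query kernel above performs. Shapes follow the docstrings in pointnet2_utils.py; the function name and the use of torch.cdist / torch.sort are illustrative choices, not code from this repository.

import torch

def ball_query_reference(radius: float, nsample: int, xyz: torch.Tensor, new_xyz: torch.Tensor) -> torch.Tensor:
    # xyz: (B, N, 3) point cloud, new_xyz: (B, npoint, 3) query centers -> idx: (B, npoint, nsample)
    dist2 = torch.cdist(new_xyz, xyz).pow(2)                      # squared distances, (B, npoint, N)
    in_ball = dist2 < radius ** 2                                 # points inside each query ball
    # stable sort puts in-radius indices first, in ascending point order, like the CUDA loop over k
    order = torch.sort((~in_ball).int(), dim=-1, stable=True).indices
    idx = order[..., :nsample].clone()                            # keep at most nsample neighbors
    # pad slots beyond the hit count with the first hit, mirroring the kernel's fill-then-overwrite scheme
    slots = torch.arange(nsample, device=xyz.device)
    empty = slots >= in_ball.sum(dim=-1, keepdim=True)
    idx[empty] = idx[..., :1].expand_as(idx)[empty]
    return idx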
-#define THREADS_PER_BLOCK 256 -#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) - -inline int opt_n_threads(int work_size) { - const int pow_2 = std::log(static_cast(work_size)) / std::log(2.0); - - return max(min(1 << pow_2, TOTAL_THREADS), 1); -} -#endif diff --git a/model/lib/pointnet2/src/group_points.cpp b/model/lib/pointnet2/src/group_points.cpp deleted file mode 100644 index fa80f0e318acc57dabf76ec0a8b1d9dff482ab89..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/src/group_points.cpp +++ /dev/null @@ -1,34 +0,0 @@ -#include -#include -#include -#include -#include "group_points_gpu.h" -#include -#include - - - -int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample, - at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) { - - float *grad_points = grad_points_tensor.data(); - const int *idx = idx_tensor.data(); - const float *grad_out = grad_out_tensor.data(); - - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - group_points_grad_kernel_launcher_fast(b, c, n, npoints, nsample, grad_out, idx, grad_points, stream); - return 1; -} - - -int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample, - at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor) { - - const float *points = points_tensor.data(); - const int *idx = idx_tensor.data(); - float *out = out_tensor.data(); - - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - group_points_kernel_launcher_fast(b, c, n, npoints, nsample, points, idx, out, stream); - return 1; -} diff --git a/model/lib/pointnet2/src/group_points_gpu.cu b/model/lib/pointnet2/src/group_points_gpu.cu deleted file mode 100644 index c015a8125e38aafa1f960000044978463b7853b1..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/src/group_points_gpu.cu +++ /dev/null @@ -1,86 +0,0 @@ -#include -#include - -#include "cuda_utils.h" -#include "group_points_gpu.h" - - -__global__ void group_points_grad_kernel_fast(int b, int c, int n, int npoints, int nsample, - const float *__restrict__ grad_out, const int *__restrict__ idx, float *__restrict__ grad_points) { - // grad_out: (B, C, npoints, nsample) - // idx: (B, npoints, nsample) - // output: - // grad_points: (B, C, N) - int bs_idx = blockIdx.z; - int c_idx = blockIdx.y; - int index = blockIdx.x * blockDim.x + threadIdx.x; - int pt_idx = index / nsample; - if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return; - - int sample_idx = index % nsample; - grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx; - idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; - - atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0] , grad_out[0]); -} - -void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample, - const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) { - // grad_out: (B, C, npoints, nsample) - // idx: (B, npoints, nsample) - // output: - // grad_points: (B, C, N) - cudaError_t err; - dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) - dim3 threads(THREADS_PER_BLOCK); - - group_points_grad_kernel_fast<<>>(b, c, n, npoints, nsample, grad_out, idx, grad_points); - - err = cudaGetLastError(); - if (cudaSuccess != err) { - fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); - exit(-1); - } -} - - -__global__ void group_points_kernel_fast(int b, int c, int n, int npoints, int 
nsample, - const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) { - // points: (B, C, N) - // idx: (B, npoints, nsample) - // output: - // out: (B, C, npoints, nsample) - int bs_idx = blockIdx.z; - int c_idx = blockIdx.y; - int index = blockIdx.x * blockDim.x + threadIdx.x; - int pt_idx = index / nsample; - if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return; - - int sample_idx = index % nsample; - - idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; - int in_idx = bs_idx * c * n + c_idx * n + idx[0]; - int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx; - - out[out_idx] = points[in_idx]; -} - - -void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample, - const float *points, const int *idx, float *out, cudaStream_t stream) { - // points: (B, C, N) - // idx: (B, npoints, nsample) - // output: - // out: (B, C, npoints, nsample) - cudaError_t err; - dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) - dim3 threads(THREADS_PER_BLOCK); - - group_points_kernel_fast<<>>(b, c, n, npoints, nsample, points, idx, out); - // cudaDeviceSynchronize(); // for using printf in kernel function - err = cudaGetLastError(); - if (cudaSuccess != err) { - fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); - exit(-1); - } -} diff --git a/model/lib/pointnet2/src/group_points_gpu.h b/model/lib/pointnet2/src/group_points_gpu.h deleted file mode 100644 index 76c73ca2600ef75c192b06d28f79a168f1ba368b..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/src/group_points_gpu.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef _GROUP_POINTS_GPU_H -#define _GROUP_POINTS_GPU_H - -#include -#include -#include -#include - - -int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample, - at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor); - -void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample, - const float *points, const int *idx, float *out, cudaStream_t stream); - -int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample, - at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor); - -void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample, - const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream); - -#endif diff --git a/model/lib/pointnet2/src/interpolate.cpp b/model/lib/pointnet2/src/interpolate.cpp deleted file mode 100644 index 88d837f966f52696308b7d85ec1756b2395bb986..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/src/interpolate.cpp +++ /dev/null @@ -1,53 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include "interpolate_gpu.h" -#include -#include - - -void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor, - at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) { - const float *unknown = unknown_tensor.data(); - const float *known = known_tensor.data(); - float *dist2 = dist2_tensor.data(); - int *idx = idx_tensor.data(); - - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - three_nn_kernel_launcher_fast(b, n, m, unknown, known, dist2, idx, stream); -} - - -void three_interpolate_wrapper_fast(int b, int c, int m, int n, - at::Tensor points_tensor, - at::Tensor idx_tensor, - at::Tensor weight_tensor, - at::Tensor out_tensor) { 
- - const float *points = points_tensor.data(); - const float *weight = weight_tensor.data(); - float *out = out_tensor.data(); - const int *idx = idx_tensor.data(); - - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - three_interpolate_kernel_launcher_fast(b, c, m, n, points, idx, weight, out, stream); -} - -void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m, - at::Tensor grad_out_tensor, - at::Tensor idx_tensor, - at::Tensor weight_tensor, - at::Tensor grad_points_tensor) { - - const float *grad_out = grad_out_tensor.data(); - const float *weight = weight_tensor.data(); - float *grad_points = grad_points_tensor.data(); - const int *idx = idx_tensor.data(); - - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - three_interpolate_grad_kernel_launcher_fast(b, c, n, m, grad_out, idx, weight, grad_points, stream); -} diff --git a/model/lib/pointnet2/src/interpolate_gpu.cu b/model/lib/pointnet2/src/interpolate_gpu.cu deleted file mode 100644 index a123dd8d8d4f5ed23cc4a340abb1141d140fca3c..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/src/interpolate_gpu.cu +++ /dev/null @@ -1,161 +0,0 @@ -#include -#include -#include - -#include "cuda_utils.h" -#include "interpolate_gpu.h" - - -__global__ void three_nn_kernel_fast(int b, int n, int m, const float *__restrict__ unknown, - const float *__restrict__ known, float *__restrict__ dist2, int *__restrict__ idx) { - // unknown: (B, N, 3) - // known: (B, M, 3) - // output: - // dist2: (B, N, 3) - // idx: (B, N, 3) - - int bs_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (bs_idx >= b || pt_idx >= n) return; - - unknown += bs_idx * n * 3 + pt_idx * 3; - known += bs_idx * m * 3; - dist2 += bs_idx * n * 3 + pt_idx * 3; - idx += bs_idx * n * 3 + pt_idx * 3; - - float ux = unknown[0]; - float uy = unknown[1]; - float uz = unknown[2]; - - double best1 = 1e40, best2 = 1e40, best3 = 1e40; - int besti1 = 0, besti2 = 0, besti3 = 0; - for (int k = 0; k < m; ++k) { - float x = known[k * 3 + 0]; - float y = known[k * 3 + 1]; - float z = known[k * 3 + 2]; - float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); - if (d < best1) { - best3 = best2; besti3 = besti2; - best2 = best1; besti2 = besti1; - best1 = d; besti1 = k; - } - else if (d < best2) { - best3 = best2; besti3 = besti2; - best2 = d; besti2 = k; - } - else if (d < best3) { - best3 = d; besti3 = k; - } - } - dist2[0] = best1; dist2[1] = best2; dist2[2] = best3; - idx[0] = besti1; idx[1] = besti2; idx[2] = besti3; -} - - -void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown, - const float *known, float *dist2, int *idx, cudaStream_t stream) { - // unknown: (B, N, 3) - // known: (B, M, 3) - // output: - // dist2: (B, N, 3) - // idx: (B, N, 3) - - cudaError_t err; - dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) - dim3 threads(THREADS_PER_BLOCK); - - three_nn_kernel_fast<<>>(b, n, m, unknown, known, dist2, idx); - - err = cudaGetLastError(); - if (cudaSuccess != err) { - fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); - exit(-1); - } -} - - -__global__ void three_interpolate_kernel_fast(int b, int c, int m, int n, const float *__restrict__ points, - const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ out) { - // points: (B, C, M) - // idx: (B, N, 3) - // weight: (B, N, 3) - // output: - // out: (B, C, N) - - int bs_idx = blockIdx.z; - int c_idx = blockIdx.y; - int pt_idx = 
blockIdx.x * blockDim.x + threadIdx.x; - - if (bs_idx >= b || c_idx >= c || pt_idx >= n) return; - - weight += bs_idx * n * 3 + pt_idx * 3; - points += bs_idx * c * m + c_idx * m; - idx += bs_idx * n * 3 + pt_idx * 3; - out += bs_idx * c * n + c_idx * n; - - out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + weight[2] * points[idx[2]]; -} - -void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n, - const float *points, const int *idx, const float *weight, float *out, cudaStream_t stream) { - // points: (B, C, M) - // idx: (B, N, 3) - // weight: (B, N, 3) - // output: - // out: (B, C, N) - - cudaError_t err; - dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) - dim3 threads(THREADS_PER_BLOCK); - three_interpolate_kernel_fast<<>>(b, c, m, n, points, idx, weight, out); - - err = cudaGetLastError(); - if (cudaSuccess != err) { - fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); - exit(-1); - } -} - - -__global__ void three_interpolate_grad_kernel_fast(int b, int c, int n, int m, const float *__restrict__ grad_out, - const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ grad_points) { - // grad_out: (B, C, N) - // weight: (B, N, 3) - // output: - // grad_points: (B, C, M) - - int bs_idx = blockIdx.z; - int c_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - - if (bs_idx >= b || c_idx >= c || pt_idx >= n) return; - - grad_out += bs_idx * c * n + c_idx * n + pt_idx; - weight += bs_idx * n * 3 + pt_idx * 3; - grad_points += bs_idx * c * m + c_idx * m; - idx += bs_idx * n * 3 + pt_idx * 3; - - - atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]); - atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]); - atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]); -} - -void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out, - const int *idx, const float *weight, float *grad_points, cudaStream_t stream) { - // grad_out: (B, C, N) - // weight: (B, N, 3) - // output: - // grad_points: (B, C, M) - - cudaError_t err; - dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) - dim3 threads(THREADS_PER_BLOCK); - three_interpolate_grad_kernel_fast<<>>(b, c, n, m, grad_out, idx, weight, grad_points); - - err = cudaGetLastError(); - if (cudaSuccess != err) { - fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); - exit(-1); - } -} \ No newline at end of file diff --git a/model/lib/pointnet2/src/interpolate_gpu.h b/model/lib/pointnet2/src/interpolate_gpu.h deleted file mode 100644 index f1771087c5e4146e3c5775d3b929ebffffd11ccb..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/src/interpolate_gpu.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _INTERPOLATE_GPU_H -#define _INTERPOLATE_GPU_H - -#include -#include -#include -#include - - -void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor, - at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor); - -void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown, - const float *known, float *dist2, int *idx, cudaStream_t stream); - - -void three_interpolate_wrapper_fast(int b, int c, int m, int n, at::Tensor points_tensor, - at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor); - -void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n, - const float *points, const int *idx, const float *weight, float *out, cudaStream_t 
stream); - - -void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m, at::Tensor grad_out_tensor, - at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor); - -void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out, - const int *idx, const float *weight, float *grad_points, cudaStream_t stream); - -#endif diff --git a/model/lib/pointnet2/src/pointnet2_api.cpp b/model/lib/pointnet2/src/pointnet2_api.cpp deleted file mode 100644 index d91f0f2176a6080624f071e5535fe509a0ac83c4..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/src/pointnet2_api.cpp +++ /dev/null @@ -1,24 +0,0 @@ -#include -#include - -#include "ball_query_gpu.h" -#include "group_points_gpu.h" -#include "sampling_gpu.h" -#include "interpolate_gpu.h" - - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("ball_query_wrapper", &ball_query_wrapper_fast, "ball_query_wrapper_fast"); - - m.def("group_points_wrapper", &group_points_wrapper_fast, "group_points_wrapper_fast"); - m.def("group_points_grad_wrapper", &group_points_grad_wrapper_fast, "group_points_grad_wrapper_fast"); - - m.def("gather_points_wrapper", &gather_points_wrapper_fast, "gather_points_wrapper_fast"); - m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper_fast, "gather_points_grad_wrapper_fast"); - - m.def("furthest_point_sampling_wrapper", &furthest_point_sampling_wrapper, "furthest_point_sampling_wrapper"); - - m.def("three_nn_wrapper", &three_nn_wrapper_fast, "three_nn_wrapper_fast"); - m.def("three_interpolate_wrapper", &three_interpolate_wrapper_fast, "three_interpolate_wrapper_fast"); - m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper_fast, "three_interpolate_grad_wrapper_fast"); -} diff --git a/model/lib/pointnet2/src/sampling.cpp b/model/lib/pointnet2/src/sampling.cpp deleted file mode 100644 index 5f54daa763ed66240c17ba6254ee9d5a39b6dfc0..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/src/sampling.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include -#include -#include -#include -#include -#include "sampling_gpu.h" - - - -int gather_points_wrapper_fast(int b, int c, int n, int npoints, - at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor){ - const float *points = points_tensor.data(); - const int *idx = idx_tensor.data(); - float *out = out_tensor.data(); - - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - gather_points_kernel_launcher_fast(b, c, n, npoints, points, idx, out, stream); - return 1; -} - - -int gather_points_grad_wrapper_fast(int b, int c, int n, int npoints, - at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) { - - const float *grad_out = grad_out_tensor.data(); - const int *idx = idx_tensor.data(); - float *grad_points = grad_points_tensor.data(); - - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - gather_points_grad_kernel_launcher_fast(b, c, n, npoints, grad_out, idx, grad_points, stream); - return 1; -} - - -int furthest_point_sampling_wrapper(int b, int n, int m, - at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor) { - - const float *points = points_tensor.data(); - float *temp = temp_tensor.data(); - int *idx = idx_tensor.data(); - - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - furthest_point_sampling_kernel_launcher(b, n, m, points, temp, idx, stream); - return 1; -} diff --git a/model/lib/pointnet2/src/sampling_gpu.cu b/model/lib/pointnet2/src/sampling_gpu.cu 
deleted file mode 100644 index 9e49a60dd6a80449be4c6c0d0d710be7b5fe9cd5..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/src/sampling_gpu.cu +++ /dev/null @@ -1,253 +0,0 @@ -#include -#include - -#include "cuda_utils.h" -#include "sampling_gpu.h" - - -__global__ void gather_points_kernel_fast(int b, int c, int n, int m, - const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) { - // points: (B, C, N) - // idx: (B, M) - // output: - // out: (B, C, M) - - int bs_idx = blockIdx.z; - int c_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (bs_idx >= b || c_idx >= c || pt_idx >= m) return; - - out += bs_idx * c * m + c_idx * m + pt_idx; - idx += bs_idx * m + pt_idx; - points += bs_idx * c * n + c_idx * n; - out[0] = points[idx[0]]; -} - -void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints, - const float *points, const int *idx, float *out, cudaStream_t stream) { - // points: (B, C, N) - // idx: (B, npoints) - // output: - // out: (B, C, npoints) - - cudaError_t err; - dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) - dim3 threads(THREADS_PER_BLOCK); - - gather_points_kernel_fast<<>>(b, c, n, npoints, points, idx, out); - - err = cudaGetLastError(); - if (cudaSuccess != err) { - fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); - exit(-1); - } -} - -__global__ void gather_points_grad_kernel_fast(int b, int c, int n, int m, const float *__restrict__ grad_out, - const int *__restrict__ idx, float *__restrict__ grad_points) { - // grad_out: (B, C, M) - // idx: (B, M) - // output: - // grad_points: (B, C, N) - - int bs_idx = blockIdx.z; - int c_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (bs_idx >= b || c_idx >= c || pt_idx >= m) return; - - grad_out += bs_idx * c * m + c_idx * m + pt_idx; - idx += bs_idx * m + pt_idx; - grad_points += bs_idx * c * n + c_idx * n; - - atomicAdd(grad_points + idx[0], grad_out[0]); -} - -void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, - const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) { - // grad_out: (B, C, npoints) - // idx: (B, npoints) - // output: - // grad_points: (B, C, N) - - cudaError_t err; - dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) - dim3 threads(THREADS_PER_BLOCK); - - gather_points_grad_kernel_fast<<>>(b, c, n, npoints, grad_out, idx, grad_points); - - err = cudaGetLastError(); - if (cudaSuccess != err) { - fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); - exit(-1); - } -} - - -__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i, int idx1, int idx2){ - const float v1 = dists[idx1], v2 = dists[idx2]; - const int i1 = dists_i[idx1], i2 = dists_i[idx2]; - dists[idx1] = max(v1, v2); - dists_i[idx1] = v2 > v1 ? 
i2 : i1; -} - -template -__global__ void furthest_point_sampling_kernel(int b, int n, int m, - const float *__restrict__ dataset, float *__restrict__ temp, int *__restrict__ idxs) { - // dataset: (B, N, 3) - // tmp: (B, N) - // output: - // idx: (B, M) - - if (m <= 0) return; - __shared__ float dists[block_size]; - __shared__ int dists_i[block_size]; - - int batch_index = blockIdx.x; - dataset += batch_index * n * 3; - temp += batch_index * n; - idxs += batch_index * m; - - int tid = threadIdx.x; - const int stride = block_size; - - int old = 0; - if (threadIdx.x == 0) - idxs[0] = old; - - __syncthreads(); - for (int j = 1; j < m; j++) { - int besti = 0; - float best = -1; - float x1 = dataset[old * 3 + 0]; - float y1 = dataset[old * 3 + 1]; - float z1 = dataset[old * 3 + 2]; - for (int k = tid; k < n; k += stride) { - float x2, y2, z2; - x2 = dataset[k * 3 + 0]; - y2 = dataset[k * 3 + 1]; - z2 = dataset[k * 3 + 2]; - // float mag = (x2 * x2) + (y2 * y2) + (z2 * z2); - // if (mag <= 1e-3) - // continue; - - float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1); - float d2 = min(d, temp[k]); - temp[k] = d2; - besti = d2 > best ? k : besti; - best = d2 > best ? d2 : best; - } - dists[tid] = best; - dists_i[tid] = besti; - __syncthreads(); - - if (block_size >= 1024) { - if (tid < 512) { - __update(dists, dists_i, tid, tid + 512); - } - __syncthreads(); - } - - if (block_size >= 512) { - if (tid < 256) { - __update(dists, dists_i, tid, tid + 256); - } - __syncthreads(); - } - if (block_size >= 256) { - if (tid < 128) { - __update(dists, dists_i, tid, tid + 128); - } - __syncthreads(); - } - if (block_size >= 128) { - if (tid < 64) { - __update(dists, dists_i, tid, tid + 64); - } - __syncthreads(); - } - if (block_size >= 64) { - if (tid < 32) { - __update(dists, dists_i, tid, tid + 32); - } - __syncthreads(); - } - if (block_size >= 32) { - if (tid < 16) { - __update(dists, dists_i, tid, tid + 16); - } - __syncthreads(); - } - if (block_size >= 16) { - if (tid < 8) { - __update(dists, dists_i, tid, tid + 8); - } - __syncthreads(); - } - if (block_size >= 8) { - if (tid < 4) { - __update(dists, dists_i, tid, tid + 4); - } - __syncthreads(); - } - if (block_size >= 4) { - if (tid < 2) { - __update(dists, dists_i, tid, tid + 2); - } - __syncthreads(); - } - if (block_size >= 2) { - if (tid < 1) { - __update(dists, dists_i, tid, tid + 1); - } - __syncthreads(); - } - - old = dists_i[0]; - if (tid == 0) - idxs[j] = old; - } -} - -void furthest_point_sampling_kernel_launcher(int b, int n, int m, - const float *dataset, float *temp, int *idxs, cudaStream_t stream) { - // dataset: (B, N, 3) - // tmp: (B, N) - // output: - // idx: (B, M) - - cudaError_t err; - unsigned int n_threads = opt_n_threads(n); - - switch (n_threads) { - case 1024: - furthest_point_sampling_kernel<1024><<>>(b, n, m, dataset, temp, idxs); break; - case 512: - furthest_point_sampling_kernel<512><<>>(b, n, m, dataset, temp, idxs); break; - case 256: - furthest_point_sampling_kernel<256><<>>(b, n, m, dataset, temp, idxs); break; - case 128: - furthest_point_sampling_kernel<128><<>>(b, n, m, dataset, temp, idxs); break; - case 64: - furthest_point_sampling_kernel<64><<>>(b, n, m, dataset, temp, idxs); break; - case 32: - furthest_point_sampling_kernel<32><<>>(b, n, m, dataset, temp, idxs); break; - case 16: - furthest_point_sampling_kernel<16><<>>(b, n, m, dataset, temp, idxs); break; - case 8: - furthest_point_sampling_kernel<8><<>>(b, n, m, dataset, temp, idxs); break; - case 4: - 
furthest_point_sampling_kernel<4><<>>(b, n, m, dataset, temp, idxs); break; - case 2: - furthest_point_sampling_kernel<2><<>>(b, n, m, dataset, temp, idxs); break; - case 1: - furthest_point_sampling_kernel<1><<>>(b, n, m, dataset, temp, idxs); break; - default: - furthest_point_sampling_kernel<512><<>>(b, n, m, dataset, temp, idxs); - } - - err = cudaGetLastError(); - if (cudaSuccess != err) { - fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); - exit(-1); - } -} diff --git a/model/lib/pointnet2/src/sampling_gpu.h b/model/lib/pointnet2/src/sampling_gpu.h deleted file mode 100644 index 6200c5914e434ecd2fc3b36313985805f6dbe0cc..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2/src/sampling_gpu.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef _SAMPLING_GPU_H -#define _SAMPLING_GPU_H - -#include -#include -#include - - -int gather_points_wrapper_fast(int b, int c, int n, int npoints, - at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor); - -void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints, - const float *points, const int *idx, float *out, cudaStream_t stream); - - -int gather_points_grad_wrapper_fast(int b, int c, int n, int npoints, - at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor); - -void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, - const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream); - - -int furthest_point_sampling_wrapper(int b, int n, int m, - at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor); - -void furthest_point_sampling_kernel_launcher(int b, int n, int m, - const float *dataset, float *temp, int *idxs, cudaStream_t stream); - -#endif diff --git a/model/lib/pointnet2_cuda.cpython-39-x86_64-linux-gnu.so b/model/lib/pointnet2_cuda.cpython-39-x86_64-linux-gnu.so deleted file mode 100644 index 560e153c8189128d5cd3034c8ffdd218592d169d..0000000000000000000000000000000000000000 --- a/model/lib/pointnet2_cuda.cpython-39-x86_64-linux-gnu.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4fe149eddea375d371bc331b11073003e0586254a25c6fe3769b2f6febf8bc54 -size 19727240 diff --git a/model/meta.py b/model/meta.py deleted file mode 100644 index a0ab6daaa14337633f9d3261d78248683d04c930..0000000000000000000000000000000000000000 --- a/model/meta.py +++ /dev/null @@ -1,175 +0,0 @@ -from typing import List -import torch -import torch.nn as nn -import json -import os -from .tokenizer import Tokenizer -from . 
import LLM - -from fairscale.nn.model_parallel import initialize as fs_init - - -class MetaModel(nn.Module): - - def __init__(self, llama_type, llama_config, llama_ckpt_dir=None, tokenizer_path=None): - super().__init__() - - self.criterion = torch.nn.CrossEntropyLoss(ignore_index=0) - - ModelArgs = LLM.__dict__[llama_type].ModelArgs - Transformer = LLM.__dict__[llama_type].Transformer - - with open(llama_config, "r") as f: - params = json.loads(f.read()) - model_args: ModelArgs = ModelArgs( - max_seq_len=2048, max_batch_size=32, **params - ) - self.tokenizer = Tokenizer(model_path=tokenizer_path) - model_args.vocab_size = self.tokenizer.n_words - - model = Transformer(model_args) - mp_rank = fs_init.get_model_parallel_rank() - if llama_ckpt_dir is not None: - ckpt_path = os.path.join(llama_ckpt_dir, f"consolidated.{mp_rank:02d}.pth") - if os.path.exists(ckpt_path): - checkpoint = torch.load(ckpt_path, map_location="cpu") - msg = model.load_state_dict(checkpoint, strict=False) - print(msg) - else: - print(f'Checkpoint not found at {ckpt_path}') - self.llma = model - for name, param in self.named_parameters(): - if param.requires_grad: - print(f"Trainable param: {name}, {param.shape}, {param.dtype}") - count = sum(p.numel() for p in self.parameters() if p.requires_grad) - print(f"Parameter count : {count}") - - def forward(self, examples, labels, image=None, modal='image'): - output = self.llma(examples, image=image, modal=modal) - output = output[:, :-1, :] - labels = labels[:, 1:] - - if labels.sum() == 0: - c_loss = output.mean() * 0 - else: - c_loss = self.criterion(output.reshape(-1, 32000), labels.flatten()) - - return c_loss - - def generate( - self, - prompts: List[str], - images, - max_gen_len: int, - temperature: float = 0.8, - top_p: float = 0.95, - modal = ['image'], - ) -> List[str]: - bsz = len(prompts) - params = self.llma.params - assert bsz <= params.max_batch_size, (bsz, params.max_batch_size) - - prompt_tokens = [self.tokenizer.encode( - x, bos=True, eos=False) for x in prompts] - - min_prompt_size = min([len(t) for t in prompt_tokens]) - max_prompt_size = max([len(t) for t in prompt_tokens]) - - total_len = min(params.max_seq_len, max_gen_len + max_prompt_size) - - tokens = torch.full( - (bsz, total_len), self.tokenizer.pad_id).cuda().long() - for k, t in enumerate(prompt_tokens): - tokens[k, : len(t)] = torch.tensor(t).long() - input_text_mask = tokens != self.tokenizer.pad_id - start_pos = min_prompt_size - prev_pos = 0 - for cur_pos in range(start_pos, total_len): - logits = self.llma.forward_inference(tokens[:, prev_pos:cur_pos], prev_pos, images if prev_pos == 0 else None, modal=modal) - if temperature > 0: - probs = torch.softmax(logits / temperature, dim=-1) - next_token = self.sample_top_p(probs, top_p) - else: - next_token = torch.argmax(logits, dim=-1) - next_token = next_token.reshape(-1) - # only replace token if prompt has already been generated - next_token = torch.where( - input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token - ) - tokens[:, cur_pos] = next_token - prev_pos = cur_pos - - decoded = [] - for i, t in enumerate(tokens.tolist()): - # cut to max gen len - t = t[: len(prompt_tokens[i]) + max_gen_len] - # cut to eos tok if any - try: - t = t[: t.index(self.tokenizer.eos_id)] - except ValueError: - pass - decoded.append(self.tokenizer.decode(t)) - return decoded - - @torch.inference_mode() - def stream_generate( - self, - prompt: str, - images, - max_gen_len: int, - temperature: float = 0.8, - top_p: float = 0.95, - modal = ['image'], - ): - 
params = self.llma.params - - prompt_tokens = self.tokenizer.encode(prompt, bos=True, eos=False) - # truncate from the left. leave some space for generation. - max_seq_len = params.max_seq_len - if images is not None: - max_seq_len -= self.llma.image_words - - max_prompt_size = max_seq_len - max_gen_len - prompt_tokens = prompt_tokens[-max_prompt_size:] - - prompt_size = len(prompt_tokens) - - total_len = min(max_seq_len, max_gen_len + prompt_size) - - tokens = torch.full([total_len], 0).cuda().long() - - tokens[:len(prompt_tokens)] = torch.tensor(prompt_tokens).long() - start_pos = prompt_size - prev_pos = 0 - generate_until = start_pos - for cur_pos in range(start_pos, total_len): - logits = self.llma.forward_inference(tokens[None, prev_pos:cur_pos], prev_pos, images if prev_pos == 0 else None, modal = modal) - if temperature > 0: - probs = torch.softmax(logits / temperature, dim=-1) - next_token = self.sample_top_p(probs, top_p) - else: - next_token = torch.argmax(logits, dim=-1) - next_token = next_token.item() - - if next_token == self.tokenizer.eos_id: - break - - tokens[cur_pos] = next_token - prev_pos = cur_pos - generate_until = cur_pos + 1 - yield {"text": self.tokenizer.decode(tokens[start_pos:generate_until].tolist()), "end_of_content": False} - - yield {"text": self.tokenizer.decode(tokens[start_pos:generate_until].tolist()), "end_of_content": True} - - def sample_top_p(self, probs, p): - probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True) - probs_sum = torch.cumsum(probs_sort, dim=-1) - mask = probs_sum - probs_sort > p - probs_sort[mask] = 0.0 - probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True)) - next_token = torch.multinomial(probs_sort, num_samples=1) - next_token = torch.gather(probs_idx, -1, next_token) - return next_token - - def get_image_words(self): - return self.llma.image_words \ No newline at end of file diff --git a/model/tokenizer.py b/model/tokenizer.py deleted file mode 100644 index e4315856eea5c4318499c8909898252902252f30..0000000000000000000000000000000000000000 --- a/model/tokenizer.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed according to the terms of the GNU General Public License version 3. 
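A small worked example of the nucleus (top-p) filter implemented by sample_top_p in the deleted meta.py hunk above; the toy probabilities are made up for illustration only.

import torch

probs = torch.tensor([[0.60, 0.20, 0.10, 0.06, 0.04]])       # toy next-token distribution
top_p = 0.7
probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
mask = torch.cumsum(probs_sort, dim=-1) - probs_sort > top_p  # mass strictly before a token exceeds top_p
probs_sort[mask] = 0.0                                        # drops 0.10, 0.06, 0.04; keeps mass 0.80
probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))         # renormalizes to roughly [0.75, 0.25, 0, 0, 0]
next_token = torch.gather(probs_idx, -1, torch.multinomial(probs_sort, num_samples=1))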
- -from sentencepiece import SentencePieceProcessor -from logging import getLogger -from typing import List -import os - - -logger = getLogger() - - -class Tokenizer: - def __init__(self, model_path: str): - # reload tokenizer - assert os.path.isfile(model_path), model_path - self.sp_model = SentencePieceProcessor(model_file=model_path) - logger.info(f"Reloaded SentencePiece model from {model_path}") - - # BOS / EOS token IDs - self.n_words: int = self.sp_model.vocab_size() - self.bos_id: int = self.sp_model.bos_id() - self.eos_id: int = self.sp_model.eos_id() - self.pad_id: int = self.sp_model.pad_id() - logger.info( - f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" - ) - assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() - - def encode(self, s: str, bos: bool, eos: bool) -> List[int]: - assert type(s) is str - t = self.sp_model.encode(s) - if bos: - t = [self.bos_id] + t - if eos: - t = t + [self.eos_id] - return t - - def decode(self, t: List[int]) -> str: - return self.sp_model.decode(t) diff --git a/pre-requirements.txt b/pre-requirements.txt deleted file mode 100644 index 70117929355028ace1966672bfc23d54583ffb30..0000000000000000000000000000000000000000 --- a/pre-requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ ---extra-index-url https://download.pytorch.org/whl/cu117 -torch==2.0.0+cu117 -packaging \ No newline at end of file diff --git a/requirements.txt b/requirements.txt old mode 100755 new mode 100644 index 20d78a4e7327b08c5afa6c0495efec20ac50dd44..7e57012a09b6351c24ce70efb47cd764f34b9f36 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,17 @@ ---extra-index-url https://download.pytorch.org/whl/cu117 -torch==2.0.0+cu117 -packaging -fairscale -sentencepiece -Pillow -huggingface_hub -open_clip_torch -decord -pytorchvideo==0.1.5 -torchaudio -matplotlib -flash-attn -gradio -plotly \ No newline at end of file +--extra-index-url https://download.pytorch.org/whl/cu121 +accelerate==0.28.0 +datasets==2.16.1 +deepspeed==0.14.4 +einops==0.8.1 +gradio==5.34.0 +huggingface_hub==0.29.1 +numpy==1.26.1 +Pillow==11.2.1 +pyarrow==17.0.0 +PyYAML==6.0.2 +torch==2.1.2 +torchvision==0.16.2 +tqdm==4.66.5 +transformers==4.50.0 +wandb +easydict diff --git a/t2i_inference.py b/t2i_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..36e2a48ed545ddc1835f32871924ae670eb26abe --- /dev/null +++ b/t2i_inference.py @@ -0,0 +1,99 @@ +import re +from dataclasses import dataclass + +import torch +from huggingface_hub import hf_hub_download +from PIL import Image +from transformers import AutoTokenizer, Qwen2ForCausalLM + +from tok.mm_autoencoder import MMAutoEncoder + + +@dataclass +class T2IConfig: + model_path: str = "csuhan/Tar-1.5B" + # visual tokenizer config + ar_path: str = 'ar_dtok_lp_250px.pth' + encoder_path: str = 'ta_tok.pth' + decoder_path: str = 'vq_ds16_t2i.pt' + + device: str = "cuda:0" + # generation parameters + scale: int = 0 # choose from [0, 1, 2] + seq_len: int = 729 # choose from [729, 169, 81] + temperature: float = 1.0 + top_p: float = 0.95 + top_k: int = 1200 + cfg_scale: float = 4.0 + +class TextToImageInference: + def __init__(self, config: T2IConfig): + self.config = config + self.device = torch.device(config.device) + self._load_models() + + def _load_models(self): + self.model = Qwen2ForCausalLM.from_pretrained(self.config.model_path).to(self.device) + self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_path) + + # Initialize visual tokenizer + config = dict( + ar_path=self.config.ar_path, + 
encoder_path=self.config.encoder_path, + decoder_path=self.config.decoder_path, + encoder_args={'input_type': 'rec'}, + decoder_args={}, + ) + self.visual_tokenizer = MMAutoEncoder(**config).eval().to(self.device) + self.visual_tokenizer.ar_model.cls_token_num = self.config.seq_len + self.visual_tokenizer.encoder.pool_scale = self.config.scale + 1 + + def generate_image(self, prompt: str) -> Image.Image: + # Prepare prompt + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": prompt} + ] + + input_text = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True) + input_text += f"" + + # Generate tokens + inputs = self.tokenizer(input_text, return_tensors="pt") + gen_ids = self.model.generate( + inputs.input_ids.to(self.device), + max_new_tokens=self.config.seq_len, + do_sample=True, + temperature=self.config.temperature, + top_p=self.config.top_p, + top_k=self.config.top_k) + + # Process generated tokens + gen_text = self.tokenizer.batch_decode(gen_ids)[0] + gen_code = [int(x) for x in re.findall(r'', gen_text)] + gen_code = gen_code[:self.config.seq_len] + [0] * max(0, self.config.seq_len - len(gen_code)) + gen_code = torch.tensor(gen_code).unsqueeze(0).to(self.device) + + gen_tensor = self.visual_tokenizer.decode_from_encoder_indices( + gen_code, + {'cfg_scale': self.config.cfg_scale} + ) + gen_image = Image.fromarray(gen_tensor[0].numpy()) + return gen_image + +def main(): + config = T2IConfig() + config.ar_path = hf_hub_download("csuhan/TA-Tok", "ar_dtok_lp_1024px.pth") + config.encoder_path = hf_hub_download("csuhan/TA-Tok", "ta_tok.pth") + config.decoder_path = hf_hub_download("peizesun/llamagen_t2i", "vq_ds16_t2i.pt") + inference = TextToImageInference(config) + + prompt = "A photo of a macaw" + image = inference.generate_image(prompt) + image.save("generated_image.png") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tok/__init__.py b/tok/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1638f02c6ef62e4f0c65ee71e8243e4c632b3511 --- /dev/null +++ b/tok/__init__.py @@ -0,0 +1 @@ +from .ar_dtok import * \ No newline at end of file diff --git a/tok/ar_dtok/__init__.py b/tok/ar_dtok/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b011a5071ac65b99ac5f98d5f761f85f6983b1bd --- /dev/null +++ b/tok/ar_dtok/__init__.py @@ -0,0 +1,2 @@ +from .bottleneck import Bottleneck, SimVectorQuantizer +from .vqvae import VQVAE \ No newline at end of file diff --git a/tok/ar_dtok/ar_model.py b/tok/ar_dtok/ar_model.py new file mode 100644 index 0000000000000000000000000000000000000000..92a0384204f63b896c1efedfca8fbdc44b00da45 --- /dev/null +++ b/tok/ar_dtok/ar_model.py @@ -0,0 +1,533 @@ +import os +from contextlib import contextmanager +from dataclasses import dataclass +from typing import Optional + +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +from .. 
import models +from .generate import generate as ar_generate + + +def find_multiple(n: int, k: int): + if n % k == 0: + return n + return n + k - (n % k) + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos, scale_factor=10000): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + scale_factor: the base for the scaling factor, default is 10000 + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float64) + omega /= embed_dim / 2. + omega = 1. / scale_factor**omega # Parameterized scaling factor (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +@dataclass +class ModelArgs: + dim: int = 4096 + n_layer: int = 32 + n_head: int = 32 + + n_kv_head: Optional[int] = None + multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 + ffn_dim_multiplier: Optional[float] = None + rope_base: float = 10000 + norm_eps: float = 1e-5 + initializer_range: float = 0.02 + + token_dropout_p: float = 0.1 + attn_dropout_p: float = 0.0 + resid_dropout_p: float = 0.1 + ffn_dropout_p: float = 0.1 + drop_path_rate: float = 0.0 + + num_classes: int = 1000 + class_dropout_prob: float = 0.1 + model_type: str = 'class_cond' # clip_cond, indice_cond + cond_dim: int = 1152 + cond_vocab_size: int = 8192 + + vocab_size: int = 8192 + cls_token_num: int = 1 + + max_batch_size: int = 32 + max_seq_len: int = 2048 + + use_fixed_pe: bool = False + + frame_prediction: bool = False + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim: int, eps: float = 1e-5): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + @torch.autocast(device_type='cuda', enabled=False) + def _norm(self, x): + return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()).type_as(x) + return output * self.weight + + +class MLP(nn.Module): + def __init__(self, in_features, hidden_features, out_features): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features, bias=False) + self.act = nn.GELU(approximate='tanh') + self.fc2 = nn.Linear(hidden_features, out_features, bias=False) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.fc2(x) + return x + + +################################################################################# +# Drop Path Implementation # +################################################################################# + +def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + """ + if drop_prob == 0. 
or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0 and scale_by_keep: + random_tensor.div_(keep_prob) + return x * random_tensor + + +class DropPath(torch.nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) + + def extra_repr(self): + return f'drop_prob={round(self.drop_prob,3):0.3f}' + + +################################################################################# +# AR Model # +################################################################################# + +class FeedForward(nn.Module): + def __init__(self, config: ModelArgs): + super().__init__() + hidden_dim = 4 * config.dim + hidden_dim = int(2 * hidden_dim / 3) + # custom dim factor multiplier + if config.ffn_dim_multiplier is not None: + hidden_dim = int(config.ffn_dim_multiplier * hidden_dim) + hidden_dim = find_multiple(hidden_dim, config.multiple_of) + + self.w1 = nn.Linear(config.dim, hidden_dim, bias=False) + self.w3 = nn.Linear(config.dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, config.dim, bias=False) + self.ffn_dropout = nn.Dropout(config.ffn_dropout_p) + + def forward(self, x): + return self.ffn_dropout(self.w2(F.silu(self.w1(x)) * self.w3(x))) + + +class KVCache(nn.Module): + def __init__(self, max_batch_size, max_seq_length, n_head, head_dim, dtype): + super().__init__() + cache_shape = (max_batch_size, n_head, max_seq_length, head_dim) + self.register_buffer('k_cache', torch.zeros(cache_shape, dtype=dtype)) + self.register_buffer('v_cache', torch.zeros(cache_shape, dtype=dtype)) + + def update(self, input_pos, k_val, v_val): + # input_pos: [S], k_val: [B, H, S, D] + assert input_pos.shape[0] == k_val.shape[2], f"{input_pos.shape[0]} != {k_val.shape[2]}" + k_out = self.k_cache + v_out = self.v_cache + k_out[:, :, input_pos] = k_val.to(k_out.dtype) + v_out[:, :, input_pos] = v_val.to(v_out.dtype) + + return k_out, v_out + + +class Attention(nn.Module): + def __init__(self, config: ModelArgs): + super().__init__() + assert config.dim % config.n_head == 0 + self.dim = config.dim + self.head_dim = config.dim // config.n_head + self.n_head = config.n_head + self.n_kv_head = config.n_kv_head if config.n_kv_head is not None else config.n_head + total_kv_dim = (self.n_head + 2 * self.n_kv_head) * self.head_dim + + # key, query, value projections for all heads, but in a batch + self.wqkv = nn.Linear(config.dim, total_kv_dim, bias=False) + self.wo = nn.Linear(config.dim, config.dim, bias=False) + self.kv_cache = None + + # regularization + self.attn_dropout_p = config.attn_dropout_p + self.resid_dropout = nn.Dropout(config.resid_dropout_p) + + def forward( + self, x: torch.Tensor, + input_pos: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None + ): + bsz, seqlen, _ = x.shape + kv_size = self.n_kv_head * self.head_dim + xq, xk, xv = self.wqkv(x).split([self.dim, kv_size, kv_size], dim=-1) + + xq = xq.view(bsz, seqlen, self.n_head, self.head_dim) + xk = xk.view(bsz, seqlen, self.n_kv_head, self.head_dim) + xv = xv.view(bsz, seqlen, self.n_kv_head, self.head_dim) + + xq, xk, xv = 
map(lambda x: x.transpose(1, 2), (xq, xk, xv)) + + if self.kv_cache is not None: + keys, values = self.kv_cache.update(input_pos, xk, xv) + else: + keys, values = xk, xv + keys = keys.repeat_interleave(self.n_head // self.n_kv_head, dim=1) + values = values.repeat_interleave(self.n_head // self.n_kv_head, dim=1) + + output = F.scaled_dot_product_attention( + xq, keys, values, + attn_mask=mask, + is_causal=True if mask is None else False, # is_causal=False is for KV cache + dropout_p=self.attn_dropout_p if self.training else 0) + + output = output.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim) + + output = self.resid_dropout(self.wo(output)) + return output + + +class TransformerBlock(nn.Module): + def __init__(self, config: ModelArgs, drop_path: float): + super().__init__() + self.attention = Attention(config) + self.feed_forward = FeedForward(config) + self.attention_norm = RMSNorm(config.dim, eps=config.norm_eps) + self.ffn_norm = RMSNorm(config.dim, eps=config.norm_eps) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def forward( + self, x: torch.Tensor, start_pos: int, mask: Optional[torch.Tensor] = None): + h = x + self.drop_path(self.attention(self.attention_norm(x), start_pos, mask)) + out = h + self.drop_path(self.feed_forward(self.ffn_norm(h))) + return out + + +class LabelEmbedder(nn.Module): + """ + Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance. + """ + def __init__(self, num_classes, hidden_size, dropout_prob): + super().__init__() + use_cfg_embedding = dropout_prob > 0 + self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size) + self.num_classes = num_classes + self.dropout_prob = dropout_prob + + def token_drop(self, labels, force_drop_ids=None): + """ + Drops labels to enable classifier-free guidance. 
+ """ + if force_drop_ids is None: + drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob + else: + drop_ids = force_drop_ids == 1 + labels = torch.where(drop_ids, self.num_classes, labels) + return labels + + def forward(self, labels, train, force_drop_ids=None): + use_dropout = self.dropout_prob > 0 + if (train and use_dropout) or (force_drop_ids is not None): + labels = self.token_drop(labels, force_drop_ids) + + # replace all negative labels with the last class (unconditional class) + labels = torch.where(labels < 0, self.num_classes, labels) + embeddings = self.embedding_table(labels) + return embeddings + + +class ARModel(nn.Module): + def __init__(self, config: ModelArgs): + super().__init__() + self.config = config + self.vocab_size = config.vocab_size + self.n_layer = config.n_layer + self.max_seq_length = config.max_seq_len + self.num_classes = config.num_classes + self.model_type = config.model_type + self.cls_token_num = config.cls_token_num + self.is_sampling = False + self.frame_prediction = config.frame_prediction + + if self.model_type == 'class_cond': + self.cls_embedding = LabelEmbedder(config.num_classes, config.dim, config.class_dropout_prob) + elif self.model_type == 'clip_cond': + self.clip_proj = nn.Linear(config.cond_dim, config.dim) + elif self.model_type == 'indice_cond': + self.clip_proj = LabelEmbedder(config.cond_vocab_size + 1, config.dim, 0.0) + else: + raise Exception("please check model type") + + self.tok_embeddings = nn.Embedding(config.vocab_size, config.dim) + self.tok_dropout = nn.Dropout(config.token_dropout_p) + + # transformer blocks + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.n_layer)] + self.layers = torch.nn.ModuleList() + for layer_id in range(config.n_layer): + self.layers.append(TransformerBlock(config, dpr[layer_id])) + + # output layer + self.norm = RMSNorm(config.dim, eps=config.norm_eps) + self.output = nn.Linear(config.dim, config.vocab_size, bias=False) + + if config.use_fixed_pe: + self.register_buffer('abs_pe', torch.zeros(1, config.max_seq_len + config.cls_token_num - 1, config.dim)) + abs_pe = get_1d_sincos_pos_embed_from_grid(embed_dim=config.dim, pos=np.arange(config.max_seq_len + config.cls_token_num - 1)) + self.abs_pe.copy_(torch.from_numpy(abs_pe).float().reshape_as(self.abs_pe)) + print(f"Using fixed absolute PE") + else: + self.abs_pe = nn.Parameter(torch.randn(1, config.max_seq_len + config.cls_token_num - 1, config.dim) * 0.02) + print(f"Using learned absolute PE") + + self.initialize_weights() + + def initialize_weights(self): + # Initialize nn.Linear and nn.Embedding + self.apply(self._init_weights) + + # Zero-out output layers: + if hasattr(self.output, 'weight') and isinstance(self.output.weight, nn.Parameter): + nn.init.constant_(self.output.weight, 0) + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + + + @property + def device(self): + return next(self.parameters()).device + + @property + def dtype(self): + return next(self.parameters()).dtype + + + @contextmanager + def sampling(self): + self.is_sampling = True + try: + yield + finally: + self.is_sampling = False + + + def setup_caches(self, max_batch_size, max_seq_length, dtype): + assert max_seq_length == self.max_seq_length + self.cls_token_num, 
f'{max_seq_length} != {self.max_seq_length} + {self.cls_token_num=}' + + head_dim = self.config.dim // self.config.n_head + max_seq_length = find_multiple(max_seq_length, 8) + + for b in self.layers: + b.attention.kv_cache = KVCache(max_batch_size, max_seq_length, self.config.n_head, head_dim, dtype) + + causal_mask = torch.tril(torch.ones(max_seq_length, max_seq_length, dtype=torch.bool)) + self.causal_mask = causal_mask.unsqueeze(0).repeat(max_batch_size, 1, 1) + + + def reset_caches(self): + for b in self.layers: + b.attention.kv_cache = None + + def clip_embedding(self, x): + if self.model_type == 'clip_cond': + if self.training and self.config.class_dropout_prob > 0: + drop_ids = torch.rand(x.shape[0], device=x.device) < self.config.class_dropout_prob + x[drop_ids] = 0. + x = self.clip_proj(x.to(self.dtype)) # Linear + elif self.model_type == 'indice_cond': + if self.training and self.config.class_dropout_prob > 0: + drop_ids = torch.rand(x.shape[0], device=x.device) < self.config.class_dropout_prob + x[drop_ids] = self.config.cond_vocab_size + x = self.clip_proj(x, train=self.training) # Embedding + return x + + def forward( + self, + idx: Optional[torch.Tensor], # (b, n) + cond_idx: Optional[torch.Tensor], # cond_idx_or_embed + input_pos: Optional[torch.Tensor] = None, + targets: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None, + valid: Optional[torch.Tensor] = None, + ): + if idx is not None and cond_idx is not None: # training or naive inference + if self.model_type == 'class_cond': + cond_embeddings = self.cls_embedding(cond_idx, train=self.training).unsqueeze(1)[:,:self.cls_token_num] + elif self.model_type in ['clip_cond', 'indice_cond']: + cond_embeddings = self.clip_embedding(cond_idx) + token_embeddings = self.tok_embeddings(idx) # (b, n, d) + token_embeddings = torch.cat((cond_embeddings, token_embeddings), dim=1) # (b, cls_token_num + n, d) + h = self.tok_dropout(token_embeddings) + else: + if cond_idx is not None: # prefill in inference + if self.model_type == 'class_cond': + token_embeddings = self.cls_embedding(cond_idx, train=self.training).unsqueeze(1)[:,:self.cls_token_num] + elif self.model_type in ['clip_cond', 'indice_cond']: + token_embeddings = self.clip_embedding(cond_idx) + else: # decode_n_tokens(kv cache) in inference + token_embeddings = self.tok_embeddings(idx) + + bs = token_embeddings.shape[0] + mask = self.causal_mask[:bs, None, input_pos] + h = self.tok_dropout(token_embeddings) + + if self.is_sampling: + h = h + self.abs_pe[:, input_pos] + else: + h = h + self.abs_pe[:, :h.shape[1]] + + # transformer blocks + for layer in self.layers: + h = layer(h, input_pos, mask) + + # output layers + h = self.norm(h) + logits = self.output(h) + # if self.training or self.is_sampling: + if cond_idx is not None: + # if self.training: + # logits = logits[:, self.cls_token_num - 1:].contiguous() + logits = logits[:, cond_idx.size(1) - 1:].contiguous() + + # if we are given some desired targets also calculate the loss + loss = None + if valid is not None: + loss_all = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), reduction='none') + valid_all = valid[:,None].repeat(1, targets.shape[1]).view(-1) + loss = (loss_all * valid_all).sum() / max(valid_all.sum(), 1) + elif targets is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1)) + return logits, loss + + + @torch.inference_mode() + def sample( + self, + c, + cfg_scale=2.0, + cfg_interval=-1, + temperature=1.0, + top_k=0, + top_p=1.0, + 
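+        # top_k=0 and top_p=1.0 leave the sampling distribution untruncated;
+        # cfg_scale <= 1.0 disables classifier-free guidance in the generation loop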
seq_length=None, + ): + seq_length = self.max_seq_length if seq_length is None else seq_length + with self.sampling(): + sampled_seqs = ar_generate( + self, c, seq_length, + cfg_scale=cfg_scale, cfg_interval=cfg_interval, + temperature=temperature, top_k=top_k, + top_p=top_p, sample_logits=True, + ) + return sampled_seqs + + + @classmethod + def from_checkpoint(cls, ckpt, load_state_dict=True): + if isinstance(ckpt, str): + assert os.path.exists(ckpt), f"checkpoint {ckpt} does not exist" + ckpt = torch.load(ckpt, map_location=lambda storage, loc: storage) + else: + assert isinstance( + ckpt, dict + ), f"checkpoint must be a dict or a path to a checkpoint" + model = models.make(ckpt["model"], load_sd=load_state_dict) + return model + + +################################################################################# +# LLAMA-ABS Configs # +################################################################################# + +def LLAMA_ABS_XXXL(**kwargs): + return ARModel(ModelArgs(n_layer=48, n_head=40, dim=2560, **kwargs)) # 3.9B + +def LLAMA_ABS_XXL(**kwargs): + return ARModel(ModelArgs(n_layer=48, n_head=24, dim=1536, **kwargs)) # 1.4B + +def LLAMA_ABS_XL(**kwargs): + return ARModel(ModelArgs(n_layer=36, n_head=20, dim=1280, **kwargs)) # 775M + +def LLAMA_ABS_LP(**kwargs): + return ARModel(ModelArgs(n_layer=30, n_head=20, dim=1280, **kwargs)) # 632M + +def LLAMA_ABS_L(**kwargs): + return ARModel(ModelArgs(n_layer=24, n_head=16, dim=1024, **kwargs)) # 343M + +def LLAMA_ABS_B(**kwargs): + return ARModel(ModelArgs(n_layer=12, n_head=12, dim=768, **kwargs)) # 111M + +def LLAMA_ABS_S(**kwargs): + return ARModel(ModelArgs(n_layer=12, n_head=6, dim=384, **kwargs)) # 21.7M + +ar_models = { + 'llama-abs-S': LLAMA_ABS_S, + 'llama-abs-B': LLAMA_ABS_B, + 'llama-abs-L': LLAMA_ABS_L, + 'llama-abs-LP': LLAMA_ABS_LP, + 'llama-abs-XL': LLAMA_ABS_XL, + 'llama-abs-XXL': LLAMA_ABS_XXL, + 'llama-abs-XXXL': LLAMA_ABS_XXXL, +} + +models.models.update(ar_models) diff --git a/tok/ar_dtok/bottleneck.py b/tok/ar_dtok/bottleneck.py new file mode 100644 index 0000000000000000000000000000000000000000..533393f4d87969d21e211a7d1f4f256529ea0b20 --- /dev/null +++ b/tok/ar_dtok/bottleneck.py @@ -0,0 +1,184 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + +from .. 
import models +from ..models import register + + +@register("bottleneck") +class Bottleneck(nn.Module): + def __init__( + self, + bottleneck_dim: int, + input_dim: int, + output_dim: int, + token_nums: int, + regularizer=None, + **kwargs + ): + super().__init__() + self.token_nums = token_nums + self.input_dim = input_dim + self.output_dim = output_dim + if bottleneck_dim > 0: + self.bottleneck_dim = bottleneck_dim + else: + assert self.input_dim == self.output_dim, "input_dim and output_dim must be the same when bottleneck_dim is not specified" + self.bottleneck_dim = self.input_dim + + self.project_dim = self.bottleneck_dim + + if self.bottleneck_dim > 0: + self.in_linear = nn.Linear(self.input_dim, self.project_dim) + self.out_linear = nn.Linear(self.bottleneck_dim, self.output_dim) + else: + self.in_linear = self.out_linear = lambda x: x + + regularizer['args']['dim'] = self.bottleneck_dim + regularizer['args']['token_nums'] = self.token_nums + self.regularizer = models.make(regularizer) + + def project_in(self, x): + assert len(x.shape) == 3, "Input shape must be (batch, n_tokens, e_dim)" + z = self.in_linear(x) + return z + + def project_out(self, z_cat): + z = self.out_linear(z_cat) + return z + + def decode(self, bottleneck_rep): + regularized_z = self.regularizer.decode(bottleneck_rep) + return self.project_out(regularized_z) + + def forward(self, x): + z = self.project_in(x) + projected_z = z + regularized_output = self.regularizer(z) + x_hat = self.project_out(regularized_output['regularized_z']) + bottleneck_rep = regularized_output.pop('bottleneck_rep') + return { + 'output': x_hat, + 'bottleneck_rep': bottleneck_rep, + 'projected_z': projected_z, + **regularized_output, + } + + +@register("simvq") +class SimVectorQuantizer(nn.Module): + def __init__( + self, + dim, + codebook_size, + l2_normalized=False, + same_index_shape=True, + stochastic=False, + stochastic_temperature=1.0, + **kwargs, + ): + super().__init__() + self.codebook_size = codebook_size + self.dim = dim + assert isinstance(l2_normalized, bool) + self.l2_normalized = l2_normalized + self.stochastic = stochastic + self.eval_deterministic = False + self.default_stochastic_temperature = stochastic_temperature + + if self.stochastic: + if stochastic_temperature > 0: # fixed temperature + self.stochastic_temperature_inv = 1 / stochastic_temperature + else: # set stochastic_temperature < 0 to use learnable temperature + self.stochastic_temperature_inv = nn.Parameter(torch.tensor(10.0)) + + # for clear inference code, we remove the codebook init from LLM's embedding + self.embedding = nn.Embedding(self.codebook_size, self.dim) + self.embedding_proj = nn.Linear(self.dim, self.dim) + + self.same_index_shape = same_index_shape + + def set_eval_deterministic(self, deterministic=True): + self.eval_deterministic = deterministic + + def set_stochastic_temperature(self, temperature): + self.stochastic_temperature_inv = 1 / temperature + + @torch.autocast(device_type='cuda', enabled=False) + def get_emb(self): + emb = self.embedding_proj(self.embedding.weight) + if self.l2_normalized: + emb = F.normalize(emb, p=2, dim=-1) + # assert emb.dtype == torch.float32, f"Embedding weight dtype is {emb.dtype}, expected float32" + return emb + + @torch.autocast(device_type='cuda', enabled=False) + def forward(self, z): + emb = self.get_emb() + z = z.to(emb) + # z = z.float() + assert len(z.shape) == 3, "Input shape must be (batch, n_tokens, e_dim)" + if self.l2_normalized: + z = F.normalize(z, p=2, dim=-1) + + z_flattened = rearrange(z, 
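+            # flatten the batch and token dimensions so each token is matched
+            # against the codebook independently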
'b n d -> (b n) d') + + if self.stochastic: + # sample the softmaxed cosine similarity + assert self.l2_normalized, "Stochastic sampling requires l2 normalization" + cos_sim = torch.einsum("bd,nd->bn", z_flattened, emb) + probs = F.softmax(cos_sim * self.stochastic_temperature_inv, dim=-1) + if self.eval_deterministic and not self.training: + q_indices = torch.argmax(probs, dim=-1) + else: + q_indices = torch.multinomial(probs, 1).squeeze(-1) + else: + d = ( + torch.sum(z_flattened**2, dim=1, keepdim=True) + + torch.sum(emb**2, dim=1) + - 2 + * torch.einsum( + "bd,dn->bn", z_flattened, rearrange(emb, "n d -> d n") + ) + ) + q_indices = torch.argmin(d, dim=1) + + quantized = F.embedding(q_indices, emb, self.embedding.padding_idx, self.embedding.max_norm, + self.embedding.norm_type, self.embedding.scale_grad_by_freq, self.embedding.sparse).view(z.shape) # (b, n, d) + + # preserve gradients + quantized = z + (quantized - z).detach() + + if self.same_index_shape: + q_indices = q_indices.reshape(quantized.shape[0], quantized.shape[1]) + + return_dict = { + 'unregularized_z': z, # but l2 normalized if l2_normalized=True + 'emb': emb, # but l2 normalized if l2_normalized=True + 'regularized_z': quantized, + 'bottleneck_rep': q_indices + } + return return_dict + + def get_codebook_entry(self, indices, shape=None): + # shape specifying (batch, height, width, channel) + indices_shape = indices.shape + indices_flatten = rearrange(indices, '... -> (...)') + + # get quantized latent vectors + emb = self.get_emb() + z_q = F.embedding(indices_flatten, emb) + # z_q = self.embedding(indices_flatten) + if self.l2_normalized: + z_q = F.normalize(z_q, p=2, dim=-1) + + if shape is not None: + z_q = z_q.reshape(shape) + else: + z_q = z_q.reshape([*indices_shape, self.dim]) + return z_q + + def decode(self, indices): + return self.get_codebook_entry(indices) \ No newline at end of file diff --git a/tok/ar_dtok/generate.py b/tok/ar_dtok/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..54f4034f5bd090fa84d4cc6d9226d95737fa39d6 --- /dev/null +++ b/tok/ar_dtok/generate.py @@ -0,0 +1,188 @@ +# Modified from: +# llamagen: https://github.com/FoundationVision/LlamaGen/blob/main/autoregressive/models/generate.py +# gpt-fast: https://github.com/pytorch-labs/gpt-fast/blob/main/generate.py +# DiT: https://github.com/facebookresearch/DiT/blob/main/models.py + + +import torch +import torch._dynamo.config +import torch._inductor.config +from torch.nn import functional as F + + +### from https://huggingface.co/transformers/v3.2.0/_modules/transformers/generation_utils.html +def top_k_top_p_filtering( + logits, + top_k: int = 0, + top_p: float = 1.0, + filter_value: float = -float("Inf"), + min_tokens_to_keep: int = 1, +): + """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering + Args: + logits: logits distribution shape (batch size, vocabulary size) + if top_k > 0: keep only top k tokens with highest probability (top-k filtering). + if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). + Nucleus filtering is described in Holtzman et al. 
(http://arxiv.org/abs/1904.09751) + Make sure we keep at least min_tokens_to_keep per batch example in the output + From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 + """ + if top_k > 0: + top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits[indices_to_remove] = filter_value + + if top_p < 1.0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold (token with 0 are kept) + sorted_indices_to_remove = cumulative_probs > top_p + if min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) + logits[indices_to_remove] = filter_value + return logits + + +def sample(logits, temperature: float=1.0, top_k: int=0, top_p: float=1.0, sample_logits=True): + logits = logits[:, -1, :] / max(temperature, 1e-5) + if top_k > 0 or top_p < 1.0: + logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p) + + # improve numerical stability of softmax + probs = F.softmax(logits.float(), dim=-1) + if sample_logits: + idx = torch.multinomial(probs, num_samples=1) + else: + _, idx = torch.topk(probs, k=1, dim=-1) + return idx, probs + + +def logits_to_probs(logits, temperature: float = 1.0, top_p: float=1.0, top_k: int = None, **kwargs): + logits = logits / max(temperature, 1e-5) + if top_k > 0 or top_p < 1.0: + logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p) + probs = torch.nn.functional.softmax(logits, dim=-1) + return probs + + +def prefill(model, cond_idx: torch.Tensor, input_pos: torch.Tensor, cfg_scale: float, **sampling_kwargs): + if cfg_scale > 1.0: + logits, _ = model(None, cond_idx, input_pos) + logits_combined = logits + cond_logits, uncond_logits = torch.split(logits_combined, len(logits_combined) // 2, dim=0) + logits = uncond_logits + (cond_logits - uncond_logits) * cfg_scale + else: + logits, _ = model(None, cond_idx, input_pos) + + return sample(logits, **sampling_kwargs)[0] + + +def decode_one_token(model, x: torch.Tensor, input_pos: torch.Tensor, cfg_scale: float, cfg_flag: bool, **sampling_kwargs): + assert input_pos.shape[-1] == 1 + if cfg_scale > 1.0: + x_combined = torch.cat([x, x]) + logits, _ = model(x_combined, cond_idx=None, input_pos=input_pos) + logits_combined = logits + cond_logits, uncond_logits = torch.split(logits_combined, len(logits_combined) // 2, dim=0) + if cfg_flag: + logits = uncond_logits + (cond_logits - uncond_logits) * cfg_scale + else: + logits = cond_logits + else: + logits, _ = model(x, cond_idx=None, input_pos=input_pos) + return sample(logits, **sampling_kwargs) + + +def decode_n_tokens( + model, cur_token: torch.Tensor, input_pos: torch.Tensor, num_new_tokens: int, + cfg_scale: float, cfg_interval: int, + **sampling_kwargs): + new_tokens, new_probs = [], [] + cfg_flag = True + for i in range(num_new_tokens): + 
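+        # one token per iteration: the KV cache already holds all earlier positions,
+        # so only the most recent token and its single position index are fed back in;
+        # once i exceeds cfg_interval (when cfg_interval > -1), the guidance mixing is
+        # switched off and only the conditional logits are used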
with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True): # Actually better for Inductor to codegen attention here + if cfg_interval > -1 and i > cfg_interval: + cfg_flag = False + next_token, next_prob = decode_one_token( + model, cur_token, input_pos, cfg_scale, cfg_flag, **sampling_kwargs + ) + input_pos += 1 + new_tokens.append(next_token.clone()) + new_probs.append(next_prob.clone()) + cur_token = next_token.view(-1, 1) + + return new_tokens, new_probs + + +@torch.no_grad() +def generate(model, cond, max_new_tokens, emb_masks=None, cfg_scale=1.0, cfg_interval=-1, **sampling_kwargs): + if model.frame_prediction: + assert cfg_scale == 1.0, "frame prediction requires cfg_scale=1.0 (no classifier-free guidance)" + cond_combined = cond + T = cond.shape[1] + elif model.model_type == 'class_cond': + if cfg_scale > 1.0: + cond_null = torch.ones_like(cond) * model.num_classes + cond_combined = torch.cat([cond, cond_null]) + else: + cond_combined = cond + T = 1 + elif model.model_type == 'clip_cond': + if cfg_scale > 1.0: + cond_null = torch.zeros_like(cond) + cond_combined = torch.cat([cond, cond_null]) + else: + cond_combined = cond + T = model.cls_token_num + elif model.model_type == 'indice_cond': + if cfg_scale > 1.0: + cond_null = torch.ones_like(cond) * model.cond_vocab_size + cond_combined = torch.cat([cond, cond_null]) + else: + cond_combined = cond + T = model.cls_token_num + else: + raise Exception("please check model type") + + T_new = T + max_new_tokens + max_seq_length = T_new + max_batch_size = cond.shape[0] + + device = cond.device + with torch.device(device): + max_batch_size_cfg = max_batch_size * 2 if cfg_scale > 1.0 else max_batch_size + model.setup_caches(max_batch_size=max_batch_size_cfg, max_seq_length=max_seq_length, dtype=model.tok_embeddings.weight.dtype) + + if emb_masks is not None: + assert emb_masks.shape[0] == max_batch_size + assert emb_masks.shape[-1] == T + if cfg_scale > 1.0: + model.causal_mask[:, :, :T] = model.causal_mask[:, :, :T] * torch.cat([emb_masks, emb_masks]).unsqueeze(1) + else: + model.causal_mask[:, :, :T] = model.causal_mask[:, :, :T] * emb_masks.unsqueeze(1) + + eye_matrix = torch.eye(model.causal_mask.size(1), model.causal_mask.size(2), device=device) + model.causal_mask[:] = model.causal_mask * (1 - eye_matrix) + eye_matrix + + # create an empty tensor of the expected final shape and fill in the current tokens + seq = torch.empty((max_batch_size, T_new), dtype=torch.int, device=device) + + input_pos = torch.arange(0, T, device=device) + + next_token = prefill(model, cond_combined, input_pos, cfg_scale, **sampling_kwargs) + seq[:, T:T+1] = next_token + + input_pos = torch.tensor([T], device=device, dtype=torch.int) + generated_tokens, _ = decode_n_tokens(model, next_token, input_pos, max_new_tokens-1, cfg_scale, cfg_interval, **sampling_kwargs) + seq[:, T+1:] = torch.cat(generated_tokens, dim=1) + + return seq[:, T:] diff --git a/tok/ar_dtok/vqvae.py b/tok/ar_dtok/vqvae.py new file mode 100644 index 0000000000000000000000000000000000000000..e4dde0cbb53e615363c14594d792df3158ff88a2 --- /dev/null +++ b/tok/ar_dtok/vqvae.py @@ -0,0 +1,457 @@ +from dataclasses import dataclass, field +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..models import register +from ..utils import ScalingLayer + + +@register('vqvae') +class VQVAE(nn.Module): + def __init__( + self, + model='VQ-16', + ckpt='', + codebook_size=16384, + codebook_embed_dim=8, + 
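+        # defaults follow the LlamaGen-style VQ-16 setup: 16384 codes of dim 8 over a
+        # 16x16 (256-token) latent grid for 256x256 inputs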
bottleneck_token_num=256, + input_size=256, + *args, + **kwargs, + ): + super().__init__() + self.codebook_size = codebook_size + self.codebook_embed_dim = codebook_embed_dim + self.bottleneck_token_num = bottleneck_token_num + self.input_size = input_size + self.model = VQ_models[model]( + codebook_size=codebook_size, + codebook_embed_dim=codebook_embed_dim) + ckpt = torch.load(ckpt, map_location='cpu') + self.model.load_state_dict(ckpt['model']) + self.model.eval() + + self.scale_layer = ScalingLayer(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + @classmethod + def from_checkpoint(cls, ckpt, **kwargs): + model = cls(ckpt=ckpt, **kwargs) + return model + + def decode_from_bottleneck(self, z): + if z.ndim == 2: + b = z.size(0) + h = w = int(z.size(-1) ** 0.5) + z = self.model.decode_code(z, (b, self.codebook_embed_dim, h, w)) + return self.scale_layer.inv(z) + + +# Adapt from https://github.com/FoundationVision/LlamaGen/blob/main/tokenizer/tokenizer_image/vq_model.py +@dataclass +class ModelArgs: + codebook_size: int = 16384 + codebook_embed_dim: int = 8 + codebook_l2_norm: bool = True + codebook_show_usage: bool = True + commit_loss_beta: float = 0.25 + entropy_loss_ratio: float = 0.0 + + encoder_ch_mult: List[int] = field(default_factory=lambda: [1, 1, 2, 2, 4]) + decoder_ch_mult: List[int] = field(default_factory=lambda: [1, 1, 2, 2, 4]) + z_channels: int = 256 + dropout_p: float = 0.0 + + +class VQModel(nn.Module): + def __init__(self, config: ModelArgs): + super().__init__() + self.config = config + self.encoder = Encoder(ch_mult=config.encoder_ch_mult, z_channels=config.z_channels, dropout=config.dropout_p) + self.decoder = Decoder(ch_mult=config.decoder_ch_mult, z_channels=config.z_channels, dropout=config.dropout_p) + + self.quantize = VectorQuantizer(config.codebook_size, config.codebook_embed_dim, + config.commit_loss_beta, config.entropy_loss_ratio, + config.codebook_l2_norm, config.codebook_show_usage) + self.quant_conv = nn.Conv2d(config.z_channels, config.codebook_embed_dim, 1) + self.post_quant_conv = nn.Conv2d(config.codebook_embed_dim, config.z_channels, 1) + + def encode(self, x): + h = self.encoder(x) + h = self.quant_conv(h) + quant, emb_loss, info = self.quantize(h) + return quant, emb_loss, info + + def decode(self, quant): + quant = self.post_quant_conv(quant) + dec = self.decoder(quant) + return dec + + def decode_code(self, code_b, shape=None, channel_first=True): + quant_b = self.quantize.get_codebook_entry(code_b, shape, channel_first) + dec = self.decode(quant_b) + return dec + + def forward(self, input): + quant, diff, _ = self.encode(input) + dec = self.decode(quant) + return dec, diff + + +class Encoder(nn.Module): + def __init__(self, in_channels=3, ch=128, ch_mult=(1,1,2,2,4), num_res_blocks=2, + norm_type='group', dropout=0.0, resamp_with_conv=True, z_channels=256): + super().__init__() + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.conv_in = nn.Conv2d(in_channels, ch, kernel_size=3, stride=1, padding=1) + + # downsampling + in_ch_mult = (1,) + tuple(ch_mult) + self.conv_blocks = nn.ModuleList() + for i_level in range(self.num_resolutions): + conv_block = nn.Module() + # res & attn + res_block = nn.ModuleList() + attn_block = nn.ModuleList() + block_in = ch*in_ch_mult[i_level] + block_out = ch*ch_mult[i_level] + for _ in range(self.num_res_blocks): + res_block.append(ResnetBlock(block_in, block_out, dropout=dropout, norm_type=norm_type)) + block_in = block_out + if i_level == self.num_resolutions - 1: + 
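+                    # attention is only added at the final (lowest-resolution) stage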
attn_block.append(AttnBlock(block_in, norm_type)) + conv_block.res = res_block + conv_block.attn = attn_block + # downsample + if i_level != self.num_resolutions-1: + conv_block.downsample = Downsample(block_in, resamp_with_conv) + self.conv_blocks.append(conv_block) + + # middle + self.mid = nn.ModuleList() + self.mid.append(ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type)) + self.mid.append(AttnBlock(block_in, norm_type=norm_type)) + self.mid.append(ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type)) + + # end + self.norm_out = Normalize(block_in, norm_type) + self.conv_out = nn.Conv2d(block_in, z_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + h = self.conv_in(x) + # downsampling + for i_level, block in enumerate(self.conv_blocks): + for i_block in range(self.num_res_blocks): + h = block.res[i_block](h) + if len(block.attn) > 0: + h = block.attn[i_block](h) + if i_level != self.num_resolutions - 1: + h = block.downsample(h) + + # middle + for mid_block in self.mid: + h = mid_block(h) + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class Decoder(nn.Module): + def __init__(self, z_channels=256, ch=128, ch_mult=(1,1,2,2,4), num_res_blocks=2, norm_type="group", + dropout=0.0, resamp_with_conv=True, out_channels=3): + super().__init__() + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + + block_in = ch*ch_mult[self.num_resolutions-1] + # z to block_in + self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1) + + # middle + self.mid = nn.ModuleList() + self.mid.append(ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type)) + self.mid.append(AttnBlock(block_in, norm_type=norm_type)) + self.mid.append(ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type)) + + # upsampling + self.conv_blocks = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + conv_block = nn.Module() + # res & attn + res_block = nn.ModuleList() + attn_block = nn.ModuleList() + block_out = ch*ch_mult[i_level] + for _ in range(self.num_res_blocks + 1): + res_block.append(ResnetBlock(block_in, block_out, dropout=dropout, norm_type=norm_type)) + block_in = block_out + if i_level == self.num_resolutions - 1: + attn_block.append(AttnBlock(block_in, norm_type)) + conv_block.res = res_block + conv_block.attn = attn_block + # downsample + if i_level != 0: + conv_block.upsample = Upsample(block_in, resamp_with_conv) + self.conv_blocks.append(conv_block) + + # end + self.norm_out = Normalize(block_in, norm_type) + self.conv_out = nn.Conv2d(block_in, out_channels, kernel_size=3, stride=1, padding=1) + + @property + def last_layer(self): + return self.conv_out.weight + + def forward(self, z): + # z to block_in + h = self.conv_in(z) + + # middle + for mid_block in self.mid: + h = mid_block(h) + + # upsampling + for i_level, block in enumerate(self.conv_blocks): + for i_block in range(self.num_res_blocks + 1): + h = block.res[i_block](h) + if len(block.attn) > 0: + h = block.attn[i_block](h) + if i_level != self.num_resolutions - 1: + h = block.upsample(h) + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class VectorQuantizer(nn.Module): + def __init__(self, n_e, e_dim, beta, entropy_loss_ratio, l2_norm, show_usage): + super().__init__() + self.n_e = n_e + self.e_dim = e_dim + self.beta = beta + self.entropy_loss_ratio = entropy_loss_ratio + self.l2_norm = l2_norm + self.show_usage 
= show_usage + + self.embedding = nn.Embedding(self.n_e, self.e_dim) + self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e) + if self.l2_norm: + self.embedding.weight.data = F.normalize(self.embedding.weight.data, p=2, dim=-1) + if self.show_usage: + self.register_buffer("codebook_used", nn.Parameter(torch.zeros(65536))) + + def forward(self, z): + # reshape z -> (batch, height, width, channel) and flatten + z = torch.einsum('b c h w -> b h w c', z).contiguous() + z_flattened = z.view(-1, self.e_dim) + # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z + + if self.l2_norm: + z = F.normalize(z, p=2, dim=-1) + z_flattened = F.normalize(z_flattened, p=2, dim=-1) + embedding = F.normalize(self.embedding.weight, p=2, dim=-1) + else: + embedding = self.embedding.weight + + d = torch.sum(z_flattened ** 2, dim=1, keepdim=True) + \ + torch.sum(embedding**2, dim=1) - 2 * \ + torch.einsum('bd,dn->bn', z_flattened, torch.einsum('n d -> d n', embedding)) + + min_encoding_indices = torch.argmin(d, dim=1) + z_q = embedding[min_encoding_indices].view(z.shape) + perplexity = None + min_encodings = None + vq_loss = None + commit_loss = None + entropy_loss = None + codebook_usage = 0 + + if self.show_usage and self.training: + cur_len = min_encoding_indices.shape[0] + self.codebook_used[:-cur_len] = self.codebook_used[cur_len:].clone() + self.codebook_used[-cur_len:] = min_encoding_indices + codebook_usage = len(torch.unique(self.codebook_used)) / self.n_e + + # compute loss for embedding + if self.training: + vq_loss = torch.mean((z_q - z.detach()) ** 2) + commit_loss = self.beta * torch.mean((z_q.detach() - z) ** 2) + entropy_loss = self.entropy_loss_ratio * compute_entropy_loss(-d) + + # preserve gradients + z_q = z + (z_q - z).detach() + + # reshape back to match original input shape + z_q = torch.einsum('b h w c -> b c h w', z_q) + + return z_q, (vq_loss, commit_loss, entropy_loss, codebook_usage), (perplexity, min_encodings, min_encoding_indices) + + def get_codebook_entry(self, indices, shape=None, channel_first=True): + # shape = (batch, channel, height, width) if channel_first else (batch, height, width, channel) + if self.l2_norm: + embedding = F.normalize(self.embedding.weight, p=2, dim=-1) + else: + embedding = self.embedding.weight + z_q = embedding[indices] # (b*h*w, c) + + if shape is not None: + if channel_first: + z_q = z_q.reshape(shape[0], shape[2], shape[3], shape[1]) + # reshape back to match original input shape + z_q = z_q.permute(0, 3, 1, 2).contiguous() + else: + z_q = z_q.view(shape) + return z_q + + +class ResnetBlock(nn.Module): + def __init__(self, in_channels, out_channels=None, conv_shortcut=False, dropout=0.0, norm_type='group'): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + + self.norm1 = Normalize(in_channels, norm_type) + self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + self.norm2 = Normalize(out_channels, norm_type) + self.dropout = nn.Dropout(dropout) + self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x): + h = x + h = 
self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + return x+h + + +class AttnBlock(nn.Module): + def __init__(self, in_channels, norm_type='group'): + super().__init__() + self.norm = Normalize(in_channels, norm_type) + self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x): + h_ = x + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + # compute attention + b,c,h,w = q.shape + q = q.reshape(b,c,h*w) + q = q.permute(0,2,1) # b,hw,c + k = k.reshape(b,c,h*w) # b,c,hw + w_ = torch.bmm(q,k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j] + w_ = w_ * (int(c)**(-0.5)) + w_ = F.softmax(w_, dim=2) + + # attend to values + v = v.reshape(b,c,h*w) + w_ = w_.permute(0,2,1) # b,hw,hw (first hw of k, second of q) + h_ = torch.bmm(v,w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j] + h_ = h_.reshape(b,c,h,w) + + h_ = self.proj_out(h_) + + return x+h_ + +def nonlinearity(x): + # swish + return x*torch.sigmoid(x) + +def Normalize(in_channels, norm_type='group'): + assert norm_type in ['group', 'batch'] + if norm_type == 'group': + return nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) + elif norm_type == 'batch': + return nn.SyncBatchNorm(in_channels) + + +class Upsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + x = F.interpolate(x, scale_factor=2.0, mode="nearest") + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + # no asymmetric padding in torch conv, must do it ourselves + self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x): + if self.with_conv: + pad = (0,1,0,1) + x = F.pad(x, pad, mode="constant", value=0) + x = self.conv(x) + else: + x = F.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +def compute_entropy_loss(affinity, loss_type="softmax", temperature=0.01): + flat_affinity = affinity.reshape(-1, affinity.shape[-1]) + flat_affinity /= temperature + probs = F.softmax(flat_affinity, dim=-1) + log_probs = F.log_softmax(flat_affinity + 1e-5, dim=-1) + if loss_type == "softmax": + target_probs = probs + else: + raise ValueError("Entropy loss {} not supported".format(loss_type)) + avg_probs = torch.mean(target_probs, dim=0) + avg_entropy = - torch.sum(avg_probs * torch.log(avg_probs + 1e-5)) + sample_entropy = - torch.mean(torch.sum(target_probs * log_probs, dim=-1)) + loss = sample_entropy - avg_entropy + return loss + + +################################################################################# +# VQ Model Configs # +################################################################################# +def VQ_8(**kwargs): + return VQModel(ModelArgs(encoder_ch_mult=[1, 2, 2, 4], decoder_ch_mult=[1, 2, 2, 4], 
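+        # four resolution levels -> three stride-2 stages, i.e. 8x spatial downsampling (VQ-8)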
**kwargs)) + +def VQ_16(**kwargs): + return VQModel(ModelArgs(encoder_ch_mult=[1, 1, 2, 2, 4], decoder_ch_mult=[1, 1, 2, 2, 4], **kwargs)) + +VQ_models = {'VQ-16': VQ_16, 'VQ-8': VQ_8} \ No newline at end of file diff --git a/tok/mm_autoencoder.py b/tok/mm_autoencoder.py new file mode 100644 index 0000000000000000000000000000000000000000..c3c36d383dc02ba3aec6ea80747b0008d787e50a --- /dev/null +++ b/tok/mm_autoencoder.py @@ -0,0 +1,64 @@ +import torch +import torch.nn as nn + +from tok.ar_dtok.ar_model import ARModel +from tok.ar_dtok.vqvae import VQVAE +from tok.ta_tok import TextAlignedTokenizer + + +class MMAutoEncoder(nn.Module): + def __init__(self, + ar_path, + encoder_path, decoder_path, + encoder_args={}, decoder_args={}): + super().__init__() + self.ar_model = ARModel.from_checkpoint(ar_path) + + self.encoder = TextAlignedTokenizer.from_checkpoint(encoder_path, load_teacher=False, **encoder_args) + self.decoder = VQVAE.from_checkpoint(decoder_path, **decoder_args) + + def ar_sample(self, x, args): + x = self.ar_model.sample( + x, + cfg_scale=args.get('cfg_scale', 1.0), + cfg_interval=args.get('cfg_interval', -1), + temperature=args.get('temperature', 1.0), + top_k=args.get('top_k', 0), + top_p=args.get('top_p', 1.0) + ) + return x + + def post_process(self, x): + x = x.cpu().float().clamp(0., 1.) * 255. + x = x.permute(0, 2, 3, 1) # [b, h, w, c] + x = x.to(torch.uint8) + return x + + def encode(self, x): + return self.encoder(x.to(self.encoder.dtype))['encoded'] + + def get_encoder_indices(self, x): + # img -> encoder -> indices + return self.encoder(x.to(self.encoder.dtype))['bottleneck_rep'] + + @torch.inference_mode() + def decode_from_encoder_indices(self, indices, args={}): + # indices -> encoder feats -> ar -> decoder + encoder_x = self.encoder.decode_from_bottleneck(indices) + ar_indices = self.ar_sample(encoder_x, args) + decoder_x = self.decoder.decode_from_bottleneck(ar_indices) + x = self.post_process(decoder_x) + return x + + def decode_from_vqvae_indices(self, indices): + decoder_x = self.decoder.decode_from_bottleneck(indices) + x = self.post_process(decoder_x) + return x + + @torch.inference_mode() + def forward(self, x, args={}): + encoder_x = self.encoder(x.to(self.encoder.dtype))['encoded'] + ar_indices = self.ar_sample(encoder_x, args) + decoder_x = self.decoder.decode_from_bottleneck(ar_indices) + x = self.post_process(decoder_x) + return x \ No newline at end of file diff --git a/tok/models.py b/tok/models.py new file mode 100644 index 0000000000000000000000000000000000000000..8f70ced3a29789010a8e16a4d1bb0f2812984302 --- /dev/null +++ b/tok/models.py @@ -0,0 +1,31 @@ +import copy +import inspect + +import torch + +models = {} + + +def register(name): + def decorator(cls): + models[name] = cls + return cls + return decorator + + +def make(model_spec, args=None, load_sd=False) -> torch.nn.Module: + if args is not None: + model_args = copy.deepcopy(model_spec['args']) + model_args.update(args) + else: + model_args = model_spec['args'] + model_params = inspect.signature(models[model_spec['name']]).parameters + if 'kwargs' not in model_params: + model_args = {k: v for k, v in model_args.items() if k in model_params} + model = models[model_spec['name']](**model_args) + if load_sd: + if ('abs_pe' in model_spec['sd']) and hasattr(model, 'abs_pe') and model_spec['sd']['abs_pe'].shape != model.abs_pe.shape: + del model_spec['sd']['abs_pe'] + msg = model.load_state_dict(model_spec['sd'], strict=False) + print(msg) + return model diff --git a/tok/ta_tok.py b/tok/ta_tok.py 
new file mode 100644 index 0000000000000000000000000000000000000000..2af1e94b523a08fd44131c25562f940abf59a66c --- /dev/null +++ b/tok/ta_tok.py @@ -0,0 +1,156 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from torchvision.transforms import Resize +from transformers import AutoConfig, AutoModel, Siglip2VisionConfig, Siglip2VisionModel + +from . import models +from .utils import ScalingLayer + + +class TextAlignedTokenizer(nn.Module): + def __init__( + self, + bottleneck, + bottleneck_token_num=256, + input_size=384, + teacher='google/siglip2-so400m-patch14-384', + input_type='quant', # choose from ['quant', 'rec', 'indices'] + pool_scale=1, # choose from [1, 2, 3] + decoder_depth=3, + select_layer_id=-2, + *args, + **kwargs + ): + super().__init__() + self.input_size = input_size + self.bottleneck_token_num = bottleneck_token_num + self.teacher = teacher + self.input_type = input_type + self.pool_scale = pool_scale + self.decoder_depth = decoder_depth + self.select_layer_id = select_layer_id + + self.bottleneck_dim = bottleneck['args']['bottleneck_dim'] + + self.encoder_config = AutoConfig.from_pretrained(teacher) + self.encoder = AutoModel.from_config(self.encoder_config).vision_model + + self.encoder_hidden_dim = self.encoder.config.hidden_size + + self.decoder_config = Siglip2VisionConfig() + self.decoder_config.update({ + 'patch_size': 1, + 'num_hidden_layers': self.decoder_depth, + 'num_channels': self.bottleneck_dim, + 'hidden_size': self.encoder_hidden_dim, + }) + self.decoder = Siglip2VisionModel(self.decoder_config) + + self.encode_task_layer = nn.Sequential( + nn.Linear(self.encoder_hidden_dim, self.encoder_hidden_dim), + nn.Tanh()) + self.decode_task_layer = nn.Sequential( + nn.Linear(self.encoder_hidden_dim, self.encoder_hidden_dim), + nn.Tanh(), + nn.Linear(self.encoder_hidden_dim, self.encoder_hidden_dim)) + + bottleneck_args = { + 'token_nums': self.bottleneck_token_num, + 'input_dim': self.encoder_hidden_dim, + 'output_dim': self.bottleneck_dim} + self.bottleneck = models.make(bottleneck, args=bottleneck_args) + + self.scale_layer = ScalingLayer(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + self.image_resize = Resize((self.input_size, self.input_size)) + + def set_vq_eval_deterministic(self, deterministic=True): + self.bottleneck.regularizer.set_eval_deterministic(deterministic) + + @property + def device(self): + return next(self.parameters()).device + + @property + def dtype(self): + return next(self.parameters()).dtype + + @classmethod + def from_checkpoint(cls, ckpt, load_teacher=True, **kwargs): + ckpt = torch.load(ckpt, map_location='cpu') + ckpt_kwargs = ckpt["model"]["args"] + model = cls(**kwargs, **ckpt_kwargs) + sd = ckpt["model"]["sd"] + if not load_teacher: + sd = {k: v for k, v in sd.items() if not k.startswith('teacher')} + model.load_state_dict(sd, strict=True) + return model + + def encode(self, x, **kwargs): + if x.ndim == 5: + x = rearrange(x, 'b c t h w -> (b t) c h w') + x = self.scale_layer(x) + if tuple(x.shape[-2:]) != (self.input_size, self.input_size): + x = self.image_resize(x) + vq_feats = self.encoder(x, output_hidden_states=True).hidden_states[self.select_layer_id] + + pool_scale = self.pool_scale + pool_scale = kwargs.get("pool_scale", pool_scale) + if pool_scale != 1: + vq_feats = self.avg_pool(vq_feats, pool_scale) + vq_feats = self.encode_task_layer(vq_feats.to(x)) + + bottleneck_out = self.bottleneck(vq_feats) + z = bottleneck_out.pop('output') + + return {'encoded': z, 'pool_scale': 
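+            # besides 'encoded', the returned dict also exposes 'bottleneck_rep'
+            # (the discrete token indices) and 'regularized_z' via **bottleneck_out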
pool_scale, 'vq_feats': vq_feats, **bottleneck_out} + + def avg_pool(self, z, pool_scale=1): + if z.ndim == 3: + b, n, c = z.shape + p = int(n ** 0.5) + z = rearrange(z, 'b (p1 p2) c -> b c p1 p2', p1=p, p2=p) + else: + b, c, p, _ = z.shape + p_s = int(p // pool_scale) + z = F.avg_pool2d( + z, + kernel_size=(pool_scale, pool_scale), + stride=(pool_scale, pool_scale) + ).contiguous() + z = rearrange(z, 'b c p1 p2 -> b (p1 p2) c') + return z + + def decode(self, z): + if z.ndim == 4: + z = rearrange(z, 'b c p1 p2 -> b (p1 p2) c') + attention_mask = torch.ones(z.shape[:2], dtype=torch.int, device=z.device) + p = int(z.shape[1]**0.5) + spatial_shape = torch.tensor([[p, p]]*z.shape[0], device=self.device) + z = self.decoder(z, attention_mask, spatial_shape, output_hidden_states=True).last_hidden_state + z = self.decode_task_layer(z) + return z + + def decode_from_bottleneck(self, bottleneck_rep): + z = self.bottleneck.decode(bottleneck_rep) # (b, n, c) + p = int(z.shape[1]**0.5) + z = rearrange(z, 'b (p1 p2) c -> b c p1 p2', p1=p, p2=p) + return self.decode(z) + + def forward(self, data, **kwargs): + # data: video in shape (b, c, t, h, w) + encode_output = self.encode(data, **kwargs) + vq_feats = encode_output['encoded'] + p = int(vq_feats.shape[1] ** 0.5) + vq_feats = rearrange(vq_feats, 'b (h w) c -> b c h w', h=p, w=p) + pred_feats = self.decode(vq_feats) + + if self.input_type == 'quant': + z = encode_output["regularized_z"] # [b, n, c] + elif self.input_type == 'indices': + z = encode_output["bottleneck_rep"] # [b, n] + elif self.input_type == 'rec': + z = pred_feats # [b, n, c] + encode_output['encoded'] = z + return encode_output \ No newline at end of file diff --git a/tok/utils.py b/tok/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4b9e705f97948b25c5cd8e2b0b427bdba76dfe2a --- /dev/null +++ b/tok/utils.py @@ -0,0 +1,15 @@ +import torch +import torch.nn as nn + + +class ScalingLayer(nn.Module): + def __init__(self, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]): + super().__init__() + self.register_buffer('shift', torch.Tensor(mean)[None, :, None, None]) + self.register_buffer('scale', torch.Tensor(std)[None, :, None, None]) + + def forward(self, inp): + return (inp - self.shift) / self.scale + + def inv(self, inp): + return inp * self.scale + self.shift \ No newline at end of file diff --git a/util/__pycache__/misc.cpython-310.pyc b/util/__pycache__/misc.cpython-310.pyc deleted file mode 100644 index 4caa262729d9934200c8f44f3ca67d0913580474..0000000000000000000000000000000000000000 Binary files a/util/__pycache__/misc.cpython-310.pyc and /dev/null differ diff --git a/util/__pycache__/misc.cpython-39.pyc b/util/__pycache__/misc.cpython-39.pyc deleted file mode 100644 index 9f464da6e5db2871e7f85f496f2a0df542be9804..0000000000000000000000000000000000000000 Binary files a/util/__pycache__/misc.cpython-39.pyc and /dev/null differ diff --git a/util/lr_sched.py b/util/lr_sched.py deleted file mode 100644 index dc4624f4fb441ea7e37e50857813cb149887a0c0..0000000000000000000000000000000000000000 --- a/util/lr_sched.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
- -import math - -def adjust_learning_rate(optimizer, it, args): - """Decay the learning rate with half-cycle cosine after warmup""" - if it < args.warmup_iters: # 1) linear warmup for warmup_iters steps - lr = args.lr * it / args.warmup_iters - elif it > args.lr_decay_iters: # 2) if it > lr_decay_iters, return min learning rate - lr = args.min_lr - else: # 3) in between, use cosine decay down to min learning rate - decay_ratio = (it - args.warmup_iters) / (args.lr_decay_iters - args.warmup_iters) - assert 0 <= decay_ratio <= 1 - coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1 - lr = args.min_lr + (args.lr - args.min_lr) * coeff - - for param_group in optimizer.param_groups: - if "lr_scale" in param_group: - param_group["lr"] = lr * param_group["lr_scale"] - else: - param_group["lr"] = lr - return lr - - -def adjust_learning_rate_epoch(optimizer, epoch, args): - """Decay the learning rate with half-cycle cosine after warmup""" - if epoch < args.warmup_epochs: - lr = args.lr * epoch / args.warmup_epochs - else: - lr = args.min_lr + (args.lr - args.min_lr) * 0.5 * \ - (1. + math.cos(math.pi * (epoch - args.warmup_epochs) / (args.epochs - args.warmup_epochs))) - for param_group in optimizer.param_groups: - if "lr_scale" in param_group: - param_group["lr"] = lr * param_group["lr_scale"] - else: - param_group["lr"] = lr - return lr - diff --git a/util/misc.py b/util/misc.py deleted file mode 100644 index cea0d87e40afd9b8be34ef99da7b1409cb1e43ba..0000000000000000000000000000000000000000 --- a/util/misc.py +++ /dev/null @@ -1,516 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. -# -------------------------------------------------------- -# References: -# DeiT: https://github.com/facebookresearch/deit -# BEiT: https://github.com/microsoft/unilm/tree/master/beit -# -------------------------------------------------------- - -import builtins -import datetime -import os -import glob -import time -from collections import defaultdict, deque -from pathlib import Path -import subprocess - -import torch -import torch.distributed as dist -from torch import inf -from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler -from torch.distributed.fsdp import ( - FullyShardedDataParallel as FSDP, - StateDictType, - FullStateDictConfig, -) -from torch.distributed._shard.api import load_with_process_group - -from fairscale.nn.model_parallel import initialize as fs_init - -from types import TracebackType -from typing import Any, Optional -import torch -import torch.nn as nn - -class SmoothedValue(object): - """Track a series of values and provide access to smoothed values over a - window or the global series average. - """ - - def __init__(self, window_size=20, fmt=None): - if fmt is None: - fmt = "{median:.4f} ({global_avg:.4f})" - self.deque = deque(maxlen=window_size) - self.total = 0.0 - self.count = 0 - self.fmt = fmt - - def update(self, value, n=1): - self.deque.append(value) - self.count += n - self.total += value * n - - def synchronize_between_processes(self): - """ - Warning: does not synchronize the deque! 
- """ - if not is_dist_avail_and_initialized(): - return - t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') - dist.barrier() - dist.all_reduce(t) - t = t.tolist() - self.count = int(t[0]) - self.total = t[1] - - @property - def median(self): - d = torch.tensor(list(self.deque)) - return d.median().item() - - @property - def avg(self): - d = torch.tensor(list(self.deque), dtype=torch.float32) - return d.mean().item() - - @property - def global_avg(self): - return self.total / self.count - - @property - def max(self): - return max(self.deque) - - @property - def value(self): - return self.deque[-1] - - def __str__(self): - return self.fmt.format( - median=self.median, - avg=self.avg, - global_avg=self.global_avg, - max=self.max, - value=self.value) - - -class MetricLogger(object): - def __init__(self, delimiter="\t"): - self.meters = defaultdict(SmoothedValue) - self.delimiter = delimiter - - def update(self, **kwargs): - for k, v in kwargs.items(): - if v is None: - continue - if isinstance(v, torch.Tensor): - v = v.item() - assert isinstance(v, (float, int)) - self.meters[k].update(v) - - def __getattr__(self, attr): - if attr in self.meters: - return self.meters[attr] - if attr in self.__dict__: - return self.__dict__[attr] - raise AttributeError("'{}' object has no attribute '{}'".format( - type(self).__name__, attr)) - - def __str__(self): - loss_str = [] - for name, meter in self.meters.items(): - loss_str.append( - "{}: {}".format(name, str(meter)) - ) - return self.delimiter.join(loss_str) - - def synchronize_between_processes(self): - for meter in self.meters.values(): - meter.synchronize_between_processes() - - def add_meter(self, name, meter): - self.meters[name] = meter - - def log_every(self, iterable, print_freq, header=None, start_iter=0): - i = start_iter - if not header: - header = '' - start_time = time.time() - end = time.time() - iter_time = SmoothedValue(fmt='{avg:.4f}') - data_time = SmoothedValue(fmt='{avg:.4f}') - log_msg = [ - header, - '[{0' + '}/{1}]', - '{meters}', - 'time: {time}', - 'data: {data}' - ] - if torch.cuda.is_available(): - log_msg.append('max mem: {memory:.0f}') - log_msg = self.delimiter.join(log_msg) - MB = 1024.0 * 1024.0 - for obj in iterable: - data_time.update(time.time() - end) - yield obj - iter_time.update(time.time() - end) - if i % print_freq == 0: - try: - total_len = len(iterable) - except: - total_len = "unknown" - if torch.cuda.is_available(): - print(log_msg.format( - i, total_len, - meters=str(self), - time=str(iter_time), data=str(data_time), - memory=torch.cuda.max_memory_allocated() / MB)) - else: - print(log_msg.format( - i, total_len, - meters=str(self), - time=str(iter_time), data=str(data_time))) - i += 1 - end = time.time() - total_time = time.time() - start_time - total_time_str = str(datetime.timedelta(seconds=int(total_time))) - print('{} Total time: {} ({:.4f} s / it)'.format( - header, total_time_str, total_time / len(iterable))) - - -def setup_for_distributed(is_master): - """ - This function disables printing when not in master process - """ - builtin_print = builtins.print - - def print(*args, **kwargs): - force = kwargs.pop('force', False) -# force = force or (get_world_size() > 8) - if is_master or force: - now = datetime.datetime.now().time() - builtin_print('[{}] '.format(now), end='') # print with time stamp - builtin_print(*args, **kwargs) - - builtins.print = print - - -def is_dist_avail_and_initialized(): - if not dist.is_available(): - return False - if not dist.is_initialized(): 
- return False - return True - - -def get_world_size(): - if not is_dist_avail_and_initialized(): - return 1 - return dist.get_world_size() - - -def get_rank(): - if not is_dist_avail_and_initialized(): - return 0 - return dist.get_rank() - - -def is_main_process(): - return get_rank() == 0 - - -def save_on_master(*args, **kwargs): - if is_main_process(): - torch.save(*args, **kwargs) - -def init_distributed_mode(args): - if args.dist_on_itp: - args.rank = int(os.environ['OMPI_COMM_WORLD_RANK']) - args.world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) - args.gpu = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) - args.dist_url = "tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT']) - os.environ['LOCAL_RANK'] = str(args.gpu) - os.environ['RANK'] = str(args.rank) - os.environ['WORLD_SIZE'] = str(args.world_size) - # ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"] - elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: - args.rank = int(os.environ["RANK"]) - args.world_size = int(os.environ['WORLD_SIZE']) - args.gpu = int(os.environ['LOCAL_RANK']) - elif 'SLURM_PROCID' in os.environ: - os.environ['MASTER_PORT'] = '8994' - while 'MASTER_ADDR' not in os.environ or len(os.environ['MASTER_ADDR'].strip()) == 0: - os.environ['MASTER_ADDR'] = subprocess.check_output('sinfo -Nh -n %s | head -n 1 | awk \'{print $1}\'' % os.environ['SLURM_NODELIST'], shell=True, ).decode().strip() - time.sleep(1) - print(os.environ['MASTER_ADDR']) - args.world_size = int(os.environ['SLURM_NPROCS']) - args.rank = int(os.environ['SLURM_PROCID']) - args.gpu = args.rank % torch.cuda.device_count() - args.local_rank = args.gpu - os.environ['LOCAL_RANK'] = str(args.gpu) - os.environ['WORLD_SIZE'] = str(args.world_size) - os.environ['RANK'] = str(args.rank) - else: - print('Not using distributed mode') - setup_for_distributed(is_master=True) # hack - args.distributed = False - return - - args.distributed = True - - torch.cuda.set_device(args.gpu) - args.dist_backend = 'nccl' - print('| distributed init (rank {}): {}, gpu {}'.format( - args.rank, args.dist_url, args.gpu), flush=True) - torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - torch.distributed.barrier() - setup_for_distributed(args.rank == 0) - - -def init_distributed_mode1(args): - if args.dist_on_itp: - args.rank = int(os.environ['OMPI_COMM_WORLD_RANK']) - args.world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) - args.gpu = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) - args.dist_url = "tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT']) - os.environ['LOCAL_RANK'] = str(args.gpu) - os.environ['RANK'] = str(args.rank) - os.environ['WORLD_SIZE'] = str(args.world_size) - # ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"] - elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: - args.rank = int(os.environ["RANK"]) - args.world_size = int(os.environ['WORLD_SIZE']) - args.gpu = int(os.environ['LOCAL_RANK']) - elif 'SLURM_PROCID' in os.environ: - args.rank = int(os.environ['SLURM_PROCID']) - args.gpu = args.rank % torch.cuda.device_count() - else: - print('Not using distributed mode') - setup_for_distributed(is_master=True) # hack - args.distributed = False - return - - args.distributed = True - - torch.cuda.set_device(args.gpu) - args.dist_backend = 'nccl' - print('| distributed init (rank {}): {}, gpu {}'.format( - args.rank, args.dist_url, args.gpu), flush=True) - 
torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - torch.distributed.barrier() - setup_for_distributed(args.rank == 0) - - -class NativeScalerWithGradNormCount: - state_dict_key = "amp_scaler" - - def __init__(self, args): - self._scaler = ShardedGradScaler(enabled=args.precision in ["fp16"]) - - def __call__(self, loss, optimizer, model, clip_grad=None, parameters=None, create_graph=False, update_grad=True): - if update_grad: - self._scaler.scale(loss).backward(create_graph=create_graph) - if clip_grad is not None: - assert parameters is not None - self._scaler.unscale_(optimizer) # unscale the gradients of optimizer's assigned params in-place - # norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad) - norm = model.clip_grad_norm_(clip_grad) - else: - raise NotImplementedError("please set clip_grad to a very large value if you do not want to clip.") - self._scaler.unscale_(optimizer) - norm = get_grad_norm_(parameters) - self._scaler.step(optimizer) - self._scaler.update() - else: - with model.no_sync(): - self._scaler.scale(loss).backward(create_graph=create_graph) - norm = None - return norm - - def state_dict(self): - return self._scaler.state_dict() - - def load_state_dict(self, state_dict): - self._scaler.load_state_dict(state_dict) - - -def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor: - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = [p for p in parameters if p.grad is not None] - norm_type = float(norm_type) - if len(parameters) == 0: - return torch.tensor(0.) - device = parameters[0].grad.device - if norm_type == inf: - total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters) - else: - total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type) - return total_norm - - -def save_model(output_dir, args, epoch, iteration, model, optimizer, loss_scaler, dataset_state): - save_dir = os.path.join(output_dir, f"epoch_{epoch}_iter_{iteration:09d}") - os.makedirs(save_dir, exist_ok=True) - with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT): - to_save = { - "model": model.state_dict(), - "optimizer": optimizer.state_dict(), - "iter": iteration, - "epoch": epoch, - "scaler": loss_scaler.state_dict(), - "args": args, - "dataset_state": dataset_state, - } - save_path = os.path.join( - save_dir, - f"checkpoint.{dist.get_rank():05d}-of-{dist.get_world_size():05d}.pth", - ) - torch.save(to_save, save_path) - - if args.save_consolidated: - mp_rank = fs_init.get_model_parallel_rank() - mp_world_size = fs_init.get_model_parallel_world_size() - consolidated_model_save_path = os.path.join( - save_dir, - f"consolidated.{mp_rank:02d}-of-{mp_world_size:02d}.pth", - ) - with FSDP.state_dict_type( - model, - StateDictType.FULL_STATE_DICT, - FullStateDictConfig(rank0_only=True, offload_to_cpu=True), - ): - save_dtype = { - "fp16": torch.float16, - "bf16": torch.bfloat16, - "tf32": torch.float32, - }[args.precision] - consolidated_model_state_dict = { - k: v.to(save_dtype) for k, v in model.state_dict().items() - } - if fs_init.get_data_parallel_rank() == 0: - torch.save(consolidated_model_state_dict, consolidated_model_save_path) - - # remove previous ckpts - ckpts = glob.glob(os.path.join(output_dir, "iter_*")) + glob.glob(os.path.join(output_dir, "epoch_*")) - ckpts.sort() - if len(ckpts)>2 and not args.keep_all: - for ckpt in ckpts[:-2]: - print('del', ckpt) 
-            os.system(f'rm {ckpt} -rf')
-
-def load_model(args, model, optimizer, loss_scaler):
-    start_iter = 0
-    start_epoch = 0
-    if args.auto_resume:
-        ckpt_dirs = glob.glob(os.path.join(args.output_dir, "iter_*")) + glob.glob(os.path.join(args.output_dir, "epoch_*"))
-        ckpt_dirs.sort()
-        if len(ckpt_dirs) > 0:
-            args.resume = ckpt_dirs[-1]
-    if args.resume:
-        print("Resume checkpoint %s" % args.resume)
-        local_checkpoint_path = os.path.join(
-            args.resume,
-            f"checkpoint.{dist.get_rank():05d}-of-{dist.get_world_size():05d}.pth",
-        )
-        with load_with_process_group(fs_init.get_data_parallel_group()):
-            checkpoint = torch.load(local_checkpoint_path, map_location='cpu')
-        with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
-            model.load_state_dict(checkpoint['model'])
-        optimizer.load_state_dict(checkpoint['optimizer'])
-        loss_scaler.load_state_dict(checkpoint['scaler'])
-        start_iter = int(checkpoint['iter']) + 1
-        if 'epoch' in checkpoint:
-            start_epoch = int(checkpoint['epoch'])
-    return start_epoch, start_iter
-
-def all_reduce_mean(x):
-    world_size = get_world_size()
-    if world_size > 1:
-        if isinstance(x, torch.Tensor):
-            x_reduce = x.clone().cuda()
-        else:
-            x_reduce = torch.tensor(x).cuda()
-        dist.all_reduce(x_reduce)
-        x_reduce /= world_size
-        return x_reduce.item()
-    else:
-        return x
-
-
-def add_weight_decay(model, weight_decay=1e-5, skip_list=()):
-    decay = []
-    no_decay = []
-    for name, param in model.named_parameters():
-        if not param.requires_grad:
-            continue  # frozen weights
-        #if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list:
-        if name.endswith(".bias") or name.endswith("norm.weight"):
-            no_decay.append(param)
-        else:
-            decay.append(param)
-    return [
-        {'params': no_decay, 'weight_decay': 0.},
-        {'params': decay, 'weight_decay': weight_decay}]
-
-
-
-
-class default_tensor_type:
-    _tensor_type_stack = [(torch.float, "cpu")]
-
-    def __init__(
-        self,
-        dtype: Optional[torch.dtype] = None,
-        device: Optional[str] = None,
-    ) -> None:
-        # Only limited combinations are supported.
-        assert device is None or device in ["cpu", "cuda"]
-        assert dtype is None or dtype in [torch.float, torch.bfloat16, torch.half]
-        self.dtype, self.device = dtype, device
-
-    def __enter__(self) -> None:
-        dtype, device = self.dtype, self.device
-        if dtype is None:
-            dtype = default_tensor_type._tensor_type_stack[-1][0]
-        if device is None:
-            device = default_tensor_type._tensor_type_stack[-1][1]
-        default_tensor_type._tensor_type_stack.append((dtype, device))
-
-        # We use all 3 calls since the new apis (set_default_device, set_default_dtype)
-        # seems to be ineffective sometimes (e.g., set_default_device is ineffective to
-        # torch.Tensor calls).
-        torch.set_default_tensor_type(default_tensor_type.get_tensor_type(dtype, device))
-        torch.set_default_device(device)
-        torch.set_default_dtype(dtype)
-
-    def __exit__(
-        self,
-        exc_type: Optional[type[BaseException]],
-        exc_val: Optional[BaseException],
-        exc_tb: Optional[TracebackType],
-    ) -> None:
-        default_tensor_type._tensor_type_stack.pop()
-        dtype, device = default_tensor_type._tensor_type_stack[-1]
-
-        torch.set_default_tensor_type(default_tensor_type.get_tensor_type(dtype, device))
-        torch.set_default_device(device)
-        torch.set_default_dtype(dtype)
-
-    @staticmethod
-    def get_tensor_type(dtype: torch.dtype, device: str) -> Any:
-        return {
-            (torch.float, "cpu"): torch.FloatTensor,
-            (torch.bfloat16, "cpu"): torch.BFloat16Tensor,
-            (torch.half, "cpu"): torch.HalfTensor,
-            (torch.float, "cuda"): torch.cuda.FloatTensor,
-            (torch.bfloat16, "cuda"): torch.cuda.BFloat16Tensor,
-            (torch.half, "cuda"): torch.cuda.HalfTensor,
-        }[(dtype, device)]
-
diff --git a/util/pos_embed.py b/util/pos_embed.py
deleted file mode 100644
index 1924913c1ffe7c73b889a4d3bad586ee8b3d2d7d..0000000000000000000000000000000000000000
--- a/util/pos_embed.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# --------------------------------------------------------
-# Position embedding utils
-# --------------------------------------------------------
-
-import numpy as np
-import torch
-
-# --------------------------------------------------------
-# 2D sine-cosine position embedding
-# References:
-# Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
-# MoCo v3: https://github.com/facebookresearch/moco-v3
-# --------------------------------------------------------
-def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
-    """
-    grid_size: int of the grid height and width
-    return:
-    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
-    """
-    grid_h = np.arange(grid_size, dtype=np.float32)
-    grid_w = np.arange(grid_size, dtype=np.float32)
-    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
-    grid = np.stack(grid, axis=0)
-
-    grid = grid.reshape([2, 1, grid_size, grid_size])
-    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
-    if cls_token:
-        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
-    return pos_embed
-
-
-def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
-    assert embed_dim % 2 == 0
-
-    # use half of dimensions to encode grid_h
-    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
-    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
-
-    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
-    return emb
-
-
-def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
-    """
-    embed_dim: output dimension for each position
-    pos: a list of positions to be encoded: size (M,)
-    out: (M, D)
-    """
-    assert embed_dim % 2 == 0
-    omega = np.arange(embed_dim // 2, dtype=np.float)
-    omega /= embed_dim / 2.
-    omega = 1. / 10000**omega  # (D/2,)
-
-    pos = pos.reshape(-1)  # (M,)
-    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
-
-    emb_sin = np.sin(out)  # (M, D/2)
-    emb_cos = np.cos(out)  # (M, D/2)
-
-    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
-    return emb
-
-
-# --------------------------------------------------------
-# Interpolate position embeddings for high-resolution
-# References:
-# DeiT: https://github.com/facebookresearch/deit
-# --------------------------------------------------------
-def interpolate_pos_embed(model, checkpoint_model):
-    if 'pos_embed' in checkpoint_model:
-        pos_embed_checkpoint = checkpoint_model['pos_embed']
-        embedding_size = pos_embed_checkpoint.shape[-1]
-        num_patches = model.patch_embed.num_patches
-        num_extra_tokens = model.pos_embed.shape[-2] - num_patches
-        # height (== width) for the checkpoint position embedding
-        orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
-        # height (== width) for the new position embedding
-        new_size = int(num_patches ** 0.5)
-        # class_token and dist_token are kept unchanged
-        if orig_size != new_size:
-            print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
-            extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
-            # only the position tokens are interpolated
-            pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
-            pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
-            pos_tokens = torch.nn.functional.interpolate(
-                pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
-            pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
-            new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
-            checkpoint_model['pos_embed'] = new_pos_embed
-
-
-def interpolate_pos_embed_online(
-    pos_embed, orig_size, new_size, num_extra_tokens: int
-):
-    # [257, 1024]
-    extra_tokens = pos_embed[:num_extra_tokens]
-    pos_tokens = pos_embed[num_extra_tokens:]
-    embedding_size = pos_tokens.shape[1]
-    pos_tokens = pos_tokens.reshape(
-        -1, orig_size[0], orig_size[1], embedding_size
-    ).permute(0, 3, 1, 2)
-    pos_tokens = torch.nn.functional.interpolate(
-        pos_tokens, size=new_size, mode="bicubic", align_corners=False,
-    )
-    pos_tokens = pos_tokens.permute(0, 2, 3, 1).reshape(-1, embedding_size)
-    new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=0)
-    return new_pos_embed
\ No newline at end of file
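
For reference: the deleted `util/pos_embed.py` implemented the standard MAE-style 2-D sine-cosine position embedding. Below is a minimal, self-contained sketch of the same recipe, not code from this repo; the helper names `sincos_1d`/`sincos_2d` and the 14x14 grid are illustrative assumptions, and `np.float64` stands in for the deprecated `np.float` alias used in the removed file.

```python
# Sketch of a 2-D sine-cosine position embedding (MAE-style); illustrative only.
import numpy as np

def sincos_1d(embed_dim: int, pos: np.ndarray) -> np.ndarray:
    """Encode positions `pos` (any shape, flattened to (M,)) as (M, embed_dim) sin-cos features."""
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float64) / (embed_dim / 2.0)
    omega = 1.0 / 10000 ** omega                           # (D/2,) frequencies
    out = np.einsum("m,d->md", pos.reshape(-1), omega)     # (M, D/2) outer product
    return np.concatenate([np.sin(out), np.cos(out)], axis=1)  # (M, D)

def sincos_2d(embed_dim: int, grid_size: int) -> np.ndarray:
    """Return (grid_size*grid_size, embed_dim); half the dims encode rows, half columns."""
    assert embed_dim % 2 == 0
    rows, cols = np.meshgrid(np.arange(grid_size, dtype=np.float64),
                             np.arange(grid_size, dtype=np.float64),
                             indexing="ij")
    return np.concatenate([sincos_1d(embed_dim // 2, rows),
                           sincos_1d(embed_dim // 2, cols)], axis=1)

if __name__ == "__main__":
    pe = sincos_2d(embed_dim=768, grid_size=14)  # e.g. a 14x14 ViT patch grid
    print(pe.shape)  # (196, 768)
```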