Spaces: Running on A10G
Jiaming Han committed · Commit 3c55139 · 1 Parent(s): 5b036b0

init
This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
- README.md +4 -4
- app.py +90 -436
- config/llama2/7B.json +0 -1
- config/llama2/tokenizer.model +0 -3
- data/__pycache__/conversation_lib.cpython-310.pyc +0 -0
- data/__pycache__/conversation_lib.cpython-39.pyc +0 -0
- data/__pycache__/fintune_dataset.cpython-310.pyc +0 -0
- data/__pycache__/fintune_dataset.cpython-39.pyc +0 -0
- data/__pycache__/imu_utils.cpython-310.pyc +0 -0
- data/__pycache__/imu_utils.cpython-39.pyc +0 -0
- data/__pycache__/video_utils.cpython-310.pyc +0 -0
- data/__pycache__/video_utils.cpython-39.pyc +0 -0
- data/conversation_lib.py +0 -369
- data/fintune_dataset.py +0 -449
- data/imu_utils.py +0 -257
- data/video_utils.py +0 -204
- demos/multi_turn_mm.py +0 -300
- examples/bell_ring.wav +0 -3
- examples/bird_audio.wav +0 -0
- examples/depth_normal/depth/0084.png +0 -0
- examples/depth_normal/depth/0131.png +0 -0
- examples/depth_normal/depth/0297.png +0 -0
- examples/depth_normal/depth/0331.png +0 -0
- examples/depth_normal/depth/0432.png +0 -0
- examples/depth_normal/depth/0633.png +0 -0
- examples/depth_normal/depth/0663.png +0 -0
- examples/depth_normal/depth/0771.png +0 -0
- examples/depth_normal/depth/0782.png +0 -0
- examples/depth_normal/depth/1001.png +0 -0
- examples/depth_normal/depth/1051.png +0 -0
- examples/depth_normal/depth/1129.png +0 -0
- examples/depth_normal/depth/1205.png +0 -0
- examples/depth_normal/depth/1336.png +0 -0
- examples/depth_normal/depth/1383.png +0 -0
- examples/depth_normal/depth/1386.png +0 -0
- examples/depth_normal/depth/1393.png +0 -0
- examples/depth_normal/depth/1447.png +0 -0
- examples/depth_normal/depth_scaled/0084.png +0 -0
- examples/depth_normal/depth_scaled/0131.png +0 -0
- examples/depth_normal/depth_scaled/0297.png +0 -0
- examples/depth_normal/depth_scaled/0331.png +0 -0
- examples/depth_normal/depth_scaled/0432.png +0 -0
- examples/depth_normal/depth_scaled/0633.png +0 -0
- examples/depth_normal/depth_scaled/0663.png +0 -0
- examples/depth_normal/depth_scaled/0771.png +0 -0
- examples/depth_normal/depth_scaled/0782.png +0 -0
- examples/depth_normal/depth_scaled/1001.png +0 -0
- examples/depth_normal/depth_scaled/1051.png +0 -0
- examples/depth_normal/depth_scaled/1129.png +0 -0
- examples/depth_normal/depth_scaled/1205.png +0 -0
README.md CHANGED
@@ -1,13 +1,13 @@
 ---
-title:
+title: Tar
 emoji: 🚀
 colorFrom: red
 colorTo: indigo
 sdk: gradio
-sdk_version:
+sdk_version: 5.34.0
 app_file: app.py
 pinned: false
-python_version: 3.
+python_version: 3.10.18
 ---

-#
+# Tar: Unifying Visual Understanding and Generation via Text-Aligned Representations
app.py CHANGED
@@ -1,457 +1,111 @@

Previous version (removed). Several line ranges were collapsed in the diff view; they are marked below as "not shown" and left as gaps.

import sys
import os
import argparse
import multiprocessing as mp
import numpy as np
from typing import List, Optional

import torch
import torch.distributed as dist

from fairscale.nn.model_parallel import initialize as fs_init

import gradio as gr
# (two further imports not shown in the diff view)
from model.meta import MetaModel
from data.conversation_lib import conv_templates, SeparatorStyle
from PIL import Image
import torchvision.transforms as transforms
from data.fintune_dataset import make_audio_features
from data import video_utils
from dataclasses import dataclass
from huggingface_hub import hf_hub_download
import plotly.graph_objects as go
from data.fintune_dataset import pc_norm
from functools import partial
import glob
import torchvision.transforms.functional as F

# (the head of this transforms.Compose not shown in the diff view)
    transforms.RandomResizedCrop(size=(224, 224), scale=(0.9, 1.0), ratio=(0.75, 1.3333), interpolation=3,
                                 antialias=None),  # 3 is bicubic
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])])

# (the PairRandomResizedCrop class header not shown in the diff view)
        return [F.resized_crop(img, i, j, h, w, self.size, self.interpolation, antialias=self.antialias) for img in imgs]

class PairToTensor(transforms.ToTensor):
    def __call__(self, pics):
        return [F.to_tensor(pic) for pic in pics]

class PairNormalize(transforms.Normalize):
    def forward(self, tensors):
        return [F.normalize(tensor, self.mean, self.std, self.inplace) for tensor in tensors]

# (the head of the pair-image transforms.Compose not shown in the diff view)
    PairToTensor(),
    PairNormalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])])

def load_audio(audio_path):
    fbank = make_audio_features(audio_path, mel_bins=128)
    fbank = fbank.transpose(0, 1)[None]  # [1, 128, 1024]
    return fbank

# (some lines not shown in the diff view)

def load_point(point_path):
    point_feat = np.load(point_path)
    point_feat = torch.tensor(point_feat)
    point_feat = pc_norm(point_feat)
    return point_feat

def load_fmri(fmri_path):
    data = np.load(fmri_path)
    data = data.mean(axis=0)
    data = torch.tensor(data[None])
    return data

def load_rgbx(image_path, x_image_path):
    # trick: replace path if 'depth_scaled' in path
    x_image_path = x_image_path.replace('depth_scaled', 'depth')

    image = Image.open(image_path).convert('RGB')
    x_image = Image.open(x_image_path).convert('RGB')
    x_image = x_image.resize(image.size[-2:])

    image, x_image = transform_pairimg_train([image, x_image])

    # [2, 3, H, W]
    image = torch.stack([image, x_image], dim=0)
    return image


class Ready: pass


def model_worker(
    rank: int, args: argparse.Namespace, barrier: mp.Barrier,
    request_queue: mp.Queue, response_queue: Optional[mp.Queue] = None,
) -> None:
    """
    The worker function that manipulates the GPU to run the inference.
    Exact n_gpu workers are started, with each one operating on a separate GPU.

    Args:
        rank (int): Distributed rank of the worker.
        args (argparse.Namespace): All command line arguments.
        barrier (multiprocessing.Barrier): A barrier used to delay the start
            of Web UI to be after the start of the model.
    """
    world_size = len(args.gpu_ids)
    gpu_id = args.gpu_ids[rank]
    dist.init_process_group(
        backend="nccl", rank=rank, world_size=world_size,
        init_method=f"tcp://{args.master_addr}:{args.master_port}",
    )
    print(f"| distributed init on worker {rank}/{world_size}. "
          f"using gpu: {gpu_id}")
    fs_init.initialize_model_parallel(world_size)
    torch.cuda.set_device(gpu_id)

    torch.manual_seed(1)
    np.random.seed(1)

    # set the print behavior.
    setup_for_distributed(rank == 0)

    target_dtype = {
        "bf16": torch.bfloat16,
        "fp16": torch.float16
    }[args.dtype]
    with default_tensor_type(dtype=target_dtype, device="cuda"):
        model = MetaModel(args.llama_type, args.llama_config, tokenizer_path=args.tokenizer_path)
    for ckpt_id in range(args.num_ckpts):
        ckpt_path = hf_hub_download(repo_id=args.pretrained_path, filename=args.ckpt_format.format(str(ckpt_id)))
        # ckpt_path = os.path.join(args.pretrained_path, args.ckpt_format.format(str(ckpt_id)))
        print(f"Loading pretrained weights {ckpt_path}")
        checkpoint = torch.load(ckpt_path, map_location='cpu')
        msg = model.load_state_dict(checkpoint, strict=False)
        # print("load result:\n", msg)
    model.cuda()
    model.eval()
    print(f"Model = {str(model)}")

    barrier.wait()

    while True:
        if response_queue is not None:
            response_queue.put(Ready())
        img_path, audio_path, video_path, point_path, fmri_path, depth_path, depth_rgb_path, normal_path, normal_rgb_path, chatbot, max_gen_len, temperature, top_p, modality = request_queue.get()
        try:
            if 'image' in modality and img_path is not None:
                image = Image.open(img_path).convert('RGB')
                inputs = T_random_resized_crop(image)
            elif 'video' in modality and video_path is not None:
                inputs = load_video(video_path)
            elif 'audio' in modality and audio_path is not None:
                inputs = load_audio(audio_path)
            elif 'point' in modality and point_path is not None:
                inputs = load_point(point_path)
            elif 'fmri' in modality and fmri_path is not None:
                inputs = load_fmri(fmri_path)
            elif 'rgbd' in modality and depth_path is not None and depth_rgb_path is not None:
                inputs = load_rgbx(depth_rgb_path, depth_path)
            elif 'rgbn' in modality and normal_path is not None and normal_rgb_path is not None:
                inputs = load_rgbx(normal_rgb_path, normal_path)
            else:
                inputs = None
        except:
            inputs = None

        if inputs is not None:
            inputs = inputs[None].cuda().to(target_dtype)

        # (the generation/streaming loop not shown in the diff view)
            stream_response["end_of_content"] = True

            # keep a few characters if not end_of_content to avoid sending
            # part of conv_sep before all of it is generated.
            if not stream_response["end_of_content"]:
                if len(stream_response["text"]) < len(conv_sep):
                    continue
                stream_response["text"] = (
                    stream_response["text"][:-len(conv_sep)]
                )

            if response_queue is not None:
                response_queue.put(stream_response)

            if stream_response["end_of_content"]:
                break


def gradio_worker(
    request_queues: List[mp.Queue], response_queue: mp.Queue,
    args: argparse.Namespace, barrier: mp.Barrier,
) -> None:
    """
    The gradio worker is responsible for displaying the WebUI and relay the
    requests to model workers. It should be launched only once.

    (some lines not shown in the diff view)
            each model worker).
        args (argparse.Namespace): All command line arguments.
        barrier (multiprocessing.Barrier): A barrier used to delay the start
            of Web UI to be after the start of the model.
    """
    # (some lines not shown in the diff view)
            content_piece = response_queue.get()
            if isinstance(content_piece, Ready):
                break
        for queue in request_queues:
            queue.put((img_path, audio_path, video_path, point_path, fmri_path, depth_path, depth_rgb_path, normal_path, normal_rgb_path, chatbot, max_gen_len, gen_t, top_p, modality))
        while True:
            content_piece = response_queue.get()
            chatbot[-1][1] = content_piece["text"]
            yield chatbot
            if content_piece["end_of_content"]:
                break

    # (some lines not shown in the diff view)
        chatbot = []
        msg = ""
        return chatbot, msg

    def show_point_cloud(file):
        point = load_point(file).numpy()
        fig = go.Figure(
            data=[
                go.Scatter3d(
                    x=point[:, 0], y=point[:, 1], z=point[:, 2],
                    mode='markers',
                    marker=dict(
                        size=1.2,
                        color=['rgb({},{},{})'.format(r, g, b) for r, g, b in zip(point[:, 3], point[:, 4], point[:, 5])]
                    ))],
            layout=dict(
                scene=dict(
                    xaxis=dict(visible=False),
                    yaxis=dict(visible=False),
                    zaxis=dict(visible=False)
                )),)
        return fig

    def change_modality(modal):
        return modal

    # (some lines not shown in the diff view; the following is the demo's custom CSS string)
    .contain { display: flex; flex-direction: column; }
    #component-0 { height: 100%; }
    #chatbot { flex-grow: 1; overflow: auto;}
    """

    # (some lines not shown in the diff view: the gr.Blocks/Row context wrapping the UI below)
    with gr.Column(scale=1):
        # (some lines not shown in the diff view)
            gr.Examples(
                examples=[
                    "examples/new_york.jpg",
                    "examples/food_menu.png",
                ],
                inputs=[img_path],
            )
        with gr.Tab('Video') as video_tab:
            video_path = gr.Video(label='Video Input', max_length=180)
            gr.Examples(
                examples=[
                    "examples/flower.mp4",
                    "examples/star_kun.mp4",
                ],
                inputs=[video_path],
            )
        with gr.Tab('Audio') as audio_tab:
            audio_path = gr.Audio(label='Audio Input', type='filepath', sources=['upload'])
            gr.Examples(
                examples=[
                    "examples/bell_ring.wav",
                    "examples/bird_audio.wav",
                ],
                inputs=[audio_path],
            )
        with gr.Tab('Point Cloud') as point_tab:
            point_path = gr.File(label='Point Cloud Input', elem_id="pointpath", elem_classes="")
            point_vis = gr.Plot()
            btn = gr.Button(value="Show Point Cloud")
            btn.click(show_point_cloud, point_path, point_vis)
            gr.Examples(
                examples=glob.glob("examples/point/*.npy"),
                inputs=[point_path],
                examples_per_page=5,
            )
        with gr.Tab('IMU') as imu_tab:
            gr.Markdown('Coming soon🤗')
        with gr.Tab('fMRI') as fmri_tab:
            fmri_path = gr.File(label='fMRI Input', elem_id="fmripath", elem_classes="")
            fmri_image_path = gr.Image(label='Reference Image', interactive=False)
            gr.Examples(
                examples=[
                    [file.replace('.jpg', '.npy'), file]
                    for file in glob.glob("examples/fmri/*.jpg")
                ],
                inputs=[fmri_path, fmri_image_path],
                examples_per_page=3,
            )
        with gr.Tab('Depth Map') as depth_tab:
            depth_path = gr.Image(label='Depth Map', type='filepath')
            depth_rgb_path = gr.Image(label='RGB Image', type='filepath')
            gr.Examples(
                examples=[
                    [rgb_image.replace('rgb', 'depth_scaled'), rgb_image]
                    for rgb_image in glob.glob("examples/depth_normal/rgb/*.png")[:9]
                ],
                inputs=[depth_path, depth_rgb_path],
                examples_per_page=3,
            )
        with gr.Tab('Normal Map') as normal_tab:
            normal_path = gr.Image(label='Normal Map', type='filepath')
            normal_rgb_path = gr.Image(label='RGB Image', type='filepath')
            gr.Examples(
                examples=[
                    [rgb_image.replace('rgb', 'normal'), rgb_image]
                    for rgb_image in glob.glob("examples/depth_normal/rgb/*.png")[9:]
                ],
                inputs=[normal_path, normal_rgb_path],
                examples_per_page=3,
            )
    with gr.Column(scale=2):
        chatbot = gr.Chatbot(elem_id="chatbot")
        msg = gr.Textbox()

        with gr.Row():
            # (some lines not shown in the diff view)
            max_gen_len = gr.Slider(
                minimum=1, maximum=args.model_max_seq_len // 2,
                value=args.model_max_seq_len // 2, interactive=True,
                label="Single-turn max response length",
            )
            gen_t = gr.Slider(
                minimum=0, maximum=1, value=0.1, interactive=True,
                label="Temperature",
            )
            top_p = gr.Slider(
                minimum=0, maximum=1, value=0.75, interactive=True,
                label="Top-p",
            )

    img_tab.select(partial(change_modality, 'image'), [], [modality])
    video_tab.select(partial(change_modality, 'video'), [], [modality])
    audio_tab.select(partial(change_modality, 'audio'), [], [modality])
    point_tab.select(partial(change_modality, 'point'), [], [modality])
    fmri_tab.select(partial(change_modality, 'fmri'), [], [modality])
    depth_tab.select(partial(change_modality, 'rgbd'), [], [modality])
    normal_tab.select(partial(change_modality, 'rgbn'), [], [modality])

    img_path.change(clear, [], [chatbot, msg])
    audio_path.change(clear, [], [chatbot, msg])
    video_path.change(clear, [], [chatbot, msg])
    point_path.change(clear, [], [chatbot, msg])
    fmri_path.change(clear, [], [chatbot, msg])
    depth_path.change(clear, [], [chatbot, msg])
    normal_path.change(clear, [], [chatbot, msg])

    # (some lines not shown in the diff view)
    )
    submit_button.click(
        show_user_input, [msg, chatbot], [msg, chatbot],
    ).then(
        stream_model_output, [img_path, audio_path, video_path, point_path, fmri_path, depth_path, depth_rgb_path, normal_path, normal_rgb_path, chatbot, max_gen_len, gen_t, top_p, modality], chatbot,
    )
    undo_button.click(undo, chatbot, chatbot)
    barrier.wait()
    demo.queue(api_open=True).launch(share=True, max_threads=1)


@dataclass
class DemoConfig:
    gpu_ids = [0]
    tokenizer_path = "config/llama2/tokenizer.model"
    llama_type = "onellm"
    llama_config = "config/llama2/7B.json"
    model_max_seq_len = 2048
    pretrained_path = "csuhan/OneLLM-7B-hf"
    # pretrained_path = "/home/pgao/jiaming/weights/7B_v20_splits/"
    ckpt_format = "consolidated.00-of-01.s{}.pth"
    num_ckpts = 10
    master_port = 23863
    master_addr = "127.0.0.1"
    dtype = "fp16"

if __name__ == "__main__":
    args = DemoConfig()

    # using the default "fork" method messes up some imported libs (e.g.,
    # pandas)
    # mp.set_start_method("spawn")

    # (some lines not shown in the diff view)
    worker_processes = []
    barrier = mp.Barrier(len(args.gpu_ids) + 1)
    for rank, gpu_id in enumerate(args.gpu_ids):
        request_queue = mp.Queue()
        rank_response_queue = response_queue if rank == 0 else None
        process = mp.Process(
            target=model_worker,
            args=(rank, args, barrier, request_queue, rank_response_queue),
        )
        process.start()
        worker_processes.append(process)
        request_queues.append(request_queue)

    # (some lines not shown in the diff view)
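The removed demo streamed partial responses from per-GPU model workers to the web UI over multiprocessing queues, with a barrier so the UI only comes up after the model has loaded (see the model_worker and gradio_worker docstrings above). Below is a minimal, self-contained sketch of that request/response round trip only; it contains no model code, and the payload strings are made up for illustration.

import multiprocessing as mp

def toy_worker(request_queue: mp.Queue, response_queue: mp.Queue, barrier: mp.Barrier) -> None:
    # Pretend the model loads here, then release the UI side.
    barrier.wait()
    while True:
        prompt = request_queue.get()                     # blocking receive from the UI process
        for piece in ("Hello", " world"):                # stream partial results back
            response_queue.put({"text": piece, "end_of_content": False})
        response_queue.put({"text": "", "end_of_content": True})

if __name__ == "__main__":
    request_queue, response_queue = mp.Queue(), mp.Queue()
    barrier = mp.Barrier(2)                              # one worker + the UI process
    mp.Process(target=toy_worker, args=(request_queue, response_queue, barrier), daemon=True).start()
    barrier.wait()                                       # UI side waits until the worker is ready
    request_queue.put("Describe the image.")
    while True:
        piece = response_queue.get()
        print(piece["text"], end="", flush=True)
        if piece["end_of_content"]:
            break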
New version (added):

import os
import gradio as gr
from torchvision.transforms.functional import to_tensor
from huggingface_hub import hf_hub_download, login

from t2i_inference import T2IConfig, TextToImageInference

def generate_text(self, image: str, prompt: str) -> str:
    image = image.convert('RGB')
    image = to_tensor(image).unsqueeze(0).to(self.device)

    image_code = self.visual_tokenizer.encoder(image)['bottleneck_rep']
    image_text = "".join([f"<I{x}>" for x in image_code[0].cpu().tolist()])

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"{image_text}\n{prompt}"}
    ]

    input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = self.tokenizer(input_text, return_tensors="pt")

    gen_ids = self.model.generate(
        inputs.input_ids.to(self.device),
        max_new_tokens=512,
        do_sample=True)
    return self.tokenizer.batch_decode(gen_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]

login(token=os.getenv('HF_TOKEN'))
config = T2IConfig()
config.ar_path = hf_hub_download("csuhan/TA-Tok", "ar_dtok_lp_512px.pth")
config.encoder_path = hf_hub_download("csuhan/TA-Tok", "ta_tok.pth")
config.decoder_path = hf_hub_download("peizesun/llamagen_t2i", "vq_ds16_t2i.pt")
inference = TextToImageInference(config)

def generate_image(prompt, top_p, top_k, cfg_scale):
    config.top_p = top_p
    config.top_k = top_k
    config.cfg_scale = cfg_scale
    image = inference.generate_image(prompt)
    return image

def clear_inputs_t2i():
    return "", None

def understand_image(image, prompt):
    return generate_text(inference, image, prompt)

def clear_inputs_i2t():
    return None, ""

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        <div align="center">

        ### Tar: Unifying Visual Understanding and Generation via Text-Aligned Representations

        [📄 Paper](https://arxiv.org/abs/xxxx.xxxxx) • [💻 Code](https://github.com/csuhan/Tar) • [📦 Model](https://huggingface.co/csuhan/TA-Tok)

        </div>
        """,
        elem_id="title",
    )
    with gr.Tab("Image Generation"):
        with gr.Row():
            with gr.Column(scale=1):
                prompt = gr.Textbox(label="Prompt", placeholder="Enter a prompt")
                with gr.Accordion("Advanced Settings", open=False):
                    top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")
                    top_k = gr.Slider(1, 2000, value=1200, step=10, label="Top-k")
                    cfg_scale = gr.Slider(1.0, 20.0, value=4.0, step=0.5, label="CFG Scale")
                with gr.Row():
                    generate_btn = gr.Button("Generate")
                    clear_btn = gr.Button("Clear")
            with gr.Column(scale=2):
                output_image = gr.Image(label="Generated Image")

        generate_btn.click(
            generate_image,
            inputs=[prompt, top_p, top_k, cfg_scale],
            outputs=output_image
        )
        clear_btn.click(
            clear_inputs_t2i,
            outputs=[prompt, output_image]
        )

    with gr.Tab("Image Understanding"):
        with gr.Row():
            with gr.Column(scale=1):
                image_input = gr.Image(label="Upload Image", type="pil")
                question_input = gr.Textbox(label="Instruction", value="Describe the image shortly.")
                with gr.Row():
                    qa_btn = gr.Button("Generate")
                    clear_btn_i2t = gr.Button("Clear")
            with gr.Column(scale=1):
                answer_output = gr.Textbox(label="Response", lines=4)

        qa_btn.click(
            understand_image,
            inputs=[image_input, question_input],
            outputs=answer_output
        )

        clear_btn_i2t.click(
            clear_inputs_i2t,
            outputs=[image_input, question_input, answer_output]
        )

demo.launch(share=True)
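Note how the understanding path reuses generate_text: it is written with a self parameter but called as a plain function, generate_text(inference, image, prompt), so the TextToImageInference instance supplies the tokenizer, model, visual_tokenizer and device attributes it reads. A minimal sketch of driving both pipelines without the Gradio UI follows; it assumes it runs alongside the definitions in the new app.py above (T2IConfig, TextToImageInference, inference, generate_image, generate_text) and that generate_image returns a PIL image, as the gr.Image output suggests.

from PIL import Image

# Assumes the checkpoint setup and helper functions from app.py above are already defined.
# Text-to-image: the sampling knobs are written onto the shared config before generation.
img = generate_image("a red bicycle leaning against a brick wall", top_p=0.95, top_k=1200, cfg_scale=4.0)
img.save("sample.png")  # assumption: the inference pipeline returns a PIL.Image

# Image understanding: the image is encoded into <I...> codes and prepended to the prompt.
answer = generate_text(inference, Image.open("sample.png"), "Describe the image shortly.")
print(answer)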
config/llama2/7B.json DELETED
@@ -1 +0,0 @@
-{"dim": 4096, "multiple_of": 256, "n_heads": 32, "n_layers": 32, "norm_eps": 1e-05, "vocab_size": -1}
config/llama2/tokenizer.model DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
-size 499723
data/__pycache__/conversation_lib.cpython-310.pyc DELETED (binary file, 9.14 kB)
data/__pycache__/conversation_lib.cpython-39.pyc DELETED (binary file, 9.15 kB)
data/__pycache__/fintune_dataset.cpython-310.pyc DELETED (binary file, 14.2 kB)
data/__pycache__/fintune_dataset.cpython-39.pyc DELETED (binary file, 14.2 kB)
data/__pycache__/imu_utils.cpython-310.pyc DELETED (binary file, 6.71 kB)
data/__pycache__/imu_utils.cpython-39.pyc DELETED (binary file, 6.71 kB)
data/__pycache__/video_utils.cpython-310.pyc DELETED (binary file, 6.53 kB)
data/__pycache__/video_utils.cpython-39.pyc DELETED (binary file, 6.51 kB)
data/conversation_lib.py DELETED
@@ -1,369 +0,0 @@

Removed file content:

import dataclasses
from enum import auto, Enum
from typing import List, Tuple


class SeparatorStyle(Enum):
    """Different separator style."""
    SINGLE = auto()
    TWO = auto()
    MPT = auto()


@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history."""
    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "###"
    sep2: str = None
    version: str = "Unknown"

    skip_next: bool = False

    def get_prompt(self):
        if self.sep_style == SeparatorStyle.SINGLE:
            ret = self.system + '\n\n' + self.sep
            for role, message in self.messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + '\n' + self.sep
                else:
                    ret += role + ":"
            return ret
        elif self.sep_style == SeparatorStyle.TWO:
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(self.messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
            return ret
        if self.sep_style == SeparatorStyle.MPT:
            ret = self.system + self.sep
            for role, message in self.messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + message + self.sep
                else:
                    ret += role
            return ret
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

    def append_message(self, role, message):
        self.messages.append([role, message])

    def get_images(self, return_pil=False):
        images = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    import base64
                    from io import BytesIO
                    from PIL import Image
                    msg, image, image_process_mode = msg
                    if image_process_mode == "Pad":
                        def expand2square(pil_img, background_color=(122, 116, 104)):
                            width, height = pil_img.size
                            if width == height:
                                return pil_img
                            elif width > height:
                                result = Image.new(pil_img.mode, (width, width), background_color)
                                result.paste(pil_img, (0, (width - height) // 2))
                                return result
                            else:
                                result = Image.new(pil_img.mode, (height, height), background_color)
                                result.paste(pil_img, ((height - width) // 2, 0))
                                return result

                        image = expand2square(image)
                    elif image_process_mode == "Crop":
                        pass
                    elif image_process_mode == "Resize":
                        image = image.resize((224, 224))
                    else:
                        raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
                    max_hw, min_hw = max(image.size), min(image.size)
                    aspect_ratio = max_hw / min_hw
                    max_len, min_len = 800, 400
                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                    longest_edge = int(shortest_edge * aspect_ratio)
                    W, H = image.size
                    if H > W:
                        H, W = longest_edge, shortest_edge
                    else:
                        H, W = shortest_edge, longest_edge
                    image = image.resize((W, H))
                    if return_pil:
                        images.append(image)
                    else:
                        buffered = BytesIO()
                        image.save(buffered, format="JPEG")
                        img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                        images.append(img_b64_str)
        return images

    def to_gradio_chatbot(self):
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    import base64
                    from io import BytesIO
                    msg, image, image_process_mode = msg
                    max_hw, min_hw = max(image.size), min(image.size)
                    aspect_ratio = max_hw / min_hw
                    max_len, min_len = 800, 400
                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                    longest_edge = int(shortest_edge * aspect_ratio)
                    W, H = image.size
                    if H > W:
                        H, W = longest_edge, shortest_edge
                    else:
                        H, W = shortest_edge, longest_edge
                    image = image.resize((W, H))
                    # image = image.resize((224, 224))
                    buffered = BytesIO()
                    image.save(buffered, format="JPEG")
                    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                    img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
                    msg = msg.replace('<image>', img_str)
                ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret

    def copy(self):
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2)

    def dict(self):
        if len(self.get_images()) > 0:
            return {
                "system": self.system,
                "roles": self.roles,
                "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
                "offset": self.offset,
                "sep": self.sep,
                "sep2": self.sep2,
            }
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
        }


conv_v1 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
        ("Human", "Give three tips for staying healthy."),
        ("Assistant",
         "Sure, here are three tips for staying healthy:\n"
         "1. Exercise regularly: Regular physical activity can help improve your overall health and wellbeing. "
         "It can also help reduce your risk of chronic conditions such as obesity, diabetes, heart disease, "
         "and certain cancers. Aim for at least 150 minutes of moderate-intensity aerobic exercise or "
         "75 minutes of vigorous-intensity aerobic exercise per week, along with muscle-strengthening "
         "activities at least two days per week.\n"
         "2. Eat a balanced diet: Eating a balanced diet that is rich in fruits, "
         "vegetables, whole grains, lean proteins, and healthy fats can help support "
         "your overall health. Try to limit your intake of processed and high-sugar foods, "
         "and aim to drink plenty of water throughout the day.\n"
         "3. Get enough sleep: Getting enough quality sleep is essential for your physical "
         "and mental health. Adults should aim for seven to nine hours of sleep per night. "
         "Establish a regular sleep schedule and try to create a relaxing bedtime routine to "
         "help improve the quality of your sleep.")
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

conv_v1_2 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(),

    # (
    #     ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
    #     ("Assistant",
    #      "Renewable energy sources are those that can be replenished naturally in a relatively "
    #      "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
    #      "Non-renewable energy sources, on the other hand, are finite and will eventually be "
    #      "depleted, such as coal, oil, and natural gas. Here are some key differences between "
    #      "renewable and non-renewable energy sources:\n"
    #      "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
    #      "energy sources are finite and will eventually run out.\n"
    #      "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
    #      "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
    #      "and other negative effects.\n"
    #      "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
    #      "have lower operational costs than non-renewable sources.\n"
    #      "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
    #      "locations than non-renewable sources.\n"
    #      "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
    #      "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
    #      "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
    #      "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
    # )
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

conv_vicuna_v1_1 = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

conv_mpt = Conversation(
    system="""<|im_start|>system
- You are a helpful language and vision assistant.
- You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.
- You should follow the instructions carefully and explain your answers in detail.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

conv_mpt_text = Conversation(
    system="""<|im_start|>system
- You are a helpful assistant chatbot trained by MosaicML.
- You answer questions.
- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

conv_bair_v1 = Conversation(
    system="BEGINNING OF CONVERSATION:",
    roles=("USER", "GPT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

simple_conv = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
        ("Human", "Hi!"),
        ("Assistant", "Hi there! How can I help you today?")
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

simple_conv_multimodal = Conversation(
    system="You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab."
           "You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
           "Follow the instructions carefully and explain your answers in detail.",
    roles=("Human", "Assistant"),
    messages=(
        ("Human", "Hi!"),
        ("Assistant", "Hi there! How can I help you today?\n")
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

simple_conv_mpt_multimodal = Conversation(
    system="""<|im_start|>system
- You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab.
- You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.
- You should follow the instructions carefully and explain your answers in detail.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

simple_conv_legacy = Conversation(
    system="You are LLaVA, a large language model trained by UW Madison WAIV Lab."
           "You are designed to assist human with a variety of tasks using natural language."
           "Follow the instructions carefully.",
    roles=("Human", "Assistant"),
    messages=(
        ("Human", "Hi!\n\n### Response:"),
        ("Assistant", "Hi there! How can I help you today?\n")
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

conv_llava_v1 = Conversation(
    system="You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab."
           "You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
           "Follow the instructions carefully and explain your answers in detail.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

default_conversation = conv_v1_2
conv_templates = {
    "default": conv_v1_2,
    "simple": simple_conv,
    "simple_legacy": simple_conv_legacy,
    "multimodal": simple_conv_multimodal,
    "mpt_multimodal": simple_conv_mpt_multimodal,
    "llava_v1": conv_llava_v1,

    # fastchat
    "v1": conv_v1_2,
    "bair_v1": conv_bair_v1,
    "vicuna_v1_1": conv_vicuna_v1_1,
    "mpt": conv_mpt,
    "mpt_text": conv_mpt_text,
}

if __name__ == "__main__":
    print(default_conversation.get_prompt())
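For reference, the Conversation/get_prompt pair in this deleted module is what the old demo used to build "### Human: ... ### Assistant:" style prompts. A short usage sketch against the definitions above, mirroring the module's own __main__ block:

# Build a two-turn prompt with the default "###"-separated template.
conv = conv_templates["v1"].copy()
conv.append_message(conv.roles[0], "What is in this image?")
conv.append_message(conv.roles[1], None)   # leave the assistant slot open for generation
print(conv.get_prompt())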
data/fintune_dataset.py DELETED
@@ -1,449 +0,0 @@

Removed file content (the diff view for this file is cut off partway through __getitem__):

import warnings

import torch
import yaml
from torch.utils.data import Dataset
from PIL import Image
import json
from model.tokenizer import Tokenizer
import os
import torchvision.transforms as transforms
import random
import torchvision.transforms.functional as F
import torchaudio
from . import conversation_lib

import numpy as np
from . import video_utils
from .imu_utils import get_imu_frames


IGNORE_INDEX = -100

DEFAULT_IMAGE_TOKEN = "<image>"
try:
    from torchvision.transforms import InterpolationMode

    BICUBIC = InterpolationMode.BICUBIC
except ImportError:
    BICUBIC = Image.BICUBIC

T_random_resized_crop = transforms.Compose([
    transforms.RandomResizedCrop(size=(224, 224), scale=(0.9, 1.0), ratio=(0.75, 1.3333), interpolation=BICUBIC,
                                 antialias=None),  # 3 is bicubic
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])])


# image transform
transform_img_train = transforms.Compose([
    transforms.RandomResizedCrop(size=(224, 224), scale=(0.9, 1.0), ratio=(
        0.75, 1.3333), interpolation=3, antialias=None),  # 3 is bicubic
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])])


class PairRandomResizedCrop(transforms.RandomResizedCrop):
    def forward(self, imgs):
        i, j, h, w = self.get_params(imgs[0], self.scale, self.ratio)
        return [F.resized_crop(img, i, j, h, w, self.size, self.interpolation, antialias=self.antialias) for img in imgs]


class PairToTensor(transforms.ToTensor):
    def __call__(self, pics):
        return [F.to_tensor(pic) for pic in pics]


class PairNormalize(transforms.Normalize):
    def forward(self, tensors):
        return [F.normalize(tensor, self.mean, self.std, self.inplace) for tensor in tensors]


transform_pairimg_train = transforms.Compose([
    PairRandomResizedCrop(size=(224, 224), scale=(0.9, 1.0), ratio=(
        0.75, 1.3333), interpolation=3, antialias=None),  # 3 is bicubic
    PairToTensor(),
    PairNormalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])])


def pc_norm(pc):
    """ pc: NxC, return NxC """
    xyz = pc[:, :3]
    other_feature = pc[:, 3:]

    centroid = torch.mean(xyz, dim=0)
    xyz = xyz - centroid
    m = torch.max(torch.sqrt(torch.sum(xyz ** 2, dim=1)))
    xyz = xyz / m

    pc = torch.cat((xyz, other_feature), dim=1)
    return pc


def make_audio_features(wav_name, mel_bins=128, target_length=1024, aug=False):
    waveform, sr = torchaudio.load(wav_name)
    # assert sr == 16000, 'input audio sampling rate must be 16kHz'
    if sr != 16000:
        trans = torchaudio.transforms.Resample(sr, 16000)
        waveform = trans(waveform)

    waveform = waveform - waveform.mean()

    fbank = torchaudio.compliance.kaldi.fbank(
        waveform, htk_compat=True, sample_frequency=16000, use_energy=False,
        window_type='hanning', num_mel_bins=mel_bins, dither=0.0, frame_shift=10)

    n_frames = fbank.shape[0]

    p = target_length - n_frames
    if p > 0:
        m = torch.nn.ZeroPad2d((0, 0, 0, p))
        fbank = m(fbank)
    elif p < 0:
        fbank = fbank[0:target_length, :]

    if aug:
        freqm = torchaudio.transforms.FrequencyMasking(48)
        timem = torchaudio.transforms.TimeMasking(192)
        fbank = torch.transpose(fbank, 0, 1)
        fbank = fbank.unsqueeze(0)
        fbank = freqm(fbank)
        fbank = timem(fbank)
        fbank = fbank.squeeze(0)
        fbank = torch.transpose(fbank, 0, 1)

    fbank = (fbank - (-4.2677393)) / (4.5689974 * 2)
    return fbank


class ConversationGenerator:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.header = f"{conversation_lib.default_conversation.system}\n\n"
        self._probe_tokenizer_style()

    def _probe_tokenizer_style(self):
        """
        Given a sentence, e.g. "My darling", some tokenizers will make the space a seperate token,
        while some others will merge the space into the next word, forming a token representing " darling".
        Knowing which style the tokenizer takes is necessary for correct ground-truth label masking.
        """
        probe = "Probe am I"
        sentence1 = self.tokenizer.encode(conversation_lib.default_conversation.roles[1] + ": " + probe,
                                          bos=False, eos=False)
        sentence2 = self.tokenizer.encode(probe,
                                          bos=False, eos=False)
        if sentence1[-len(sentence2):] == sentence2:
            self.space_before_to_predict = False
        else:
            sentence3 = self.tokenizer.encode(" " + probe,
                                              bos=False, eos=False)
            assert sentence1[-len(sentence3):] == sentence3
            self.space_before_to_predict = True

    def add_speaker_and_signal(self, source, get_conversation=True):
        """Add speaker and start/end signal on each round."""
        BEGIN_SIGNAL = "### "
        END_SIGNAL = "\n"
        conversation = self.header

        to_predict_list = []

        for sentence in source:
            from_str = sentence["from"]
            if from_str.lower() in ["human"]:
                from_str = conversation_lib.default_conversation.roles[0]
            elif from_str.lower() in ["gpt", "assistant"]:
                from_str = conversation_lib.default_conversation.roles[1]
            else:
                raise ValueError(f"unknown dialog role: {from_str.lower()}")

            value = sentence["value"]
            if DEFAULT_IMAGE_TOKEN in value:
                value = value.replace(DEFAULT_IMAGE_TOKEN, '').strip()

            sentence_value = BEGIN_SIGNAL + from_str + ": " + value + END_SIGNAL

            if from_str == conversation_lib.default_conversation.roles[1]:
                to_predict_value = value + END_SIGNAL + "###"
                if self.space_before_to_predict:
                    to_predict_value = " " + to_predict_value
                to_predict_list.append(to_predict_value)

            if get_conversation:
                conversation = conversation + sentence_value

        conversation = conversation + BEGIN_SIGNAL
        return conversation, to_predict_list


DATASETS = dict(
    image=[
        dict(path="datasets/InstructionTuning/image/llava_v1_5_mix665k_image.json", type='image'),
        dict(path='datasets/InstructionTuning/image/cococap_train.json', type='image'),
        dict(path="datasets/InstructionTuning/image/llava_v1_5_mix665k_text.json", type='text'),
    ],
    audio=[
        dict(path="datasets/InstructionTuning/audio/audiocap_train.json", type='audio'),
        dict(path="datasets/InstructionTuning/audio/audiocap_val.json", type='audio'),
        dict(path="datasets/InstructionTuning/audio/audio_conversation.json", type='audio'),
    ],
    video=[
        dict(path="datasets/InstructionTuning/video/msrvtt_cap_trainval.json", type='video'),
        dict(path="datasets/InstructionTuning/video/msrvtt_cap_test.json", type='video'),
        dict(path="datasets/InstructionTuning/video/msrvtt_vqa_train.json", type='video'),
        dict(path="datasets/InstructionTuning/video/msrvtt_vqa_val.json", type='video'),
        dict(path="datasets/InstructionTuning/video/msrvtt_vqa_test.json", type='video'),
        dict(path="datasets/InstructionTuning/video/video_complex_reasoning_10k.json", type='video'),
        dict(path="datasets/InstructionTuning/video/video_conversation_10k.json", type='video'),
        dict(path="datasets/InstructionTuning/video/video_detail_10k.json", type='video'),
    ],
    point=[
        dict(path="datasets/InstructionTuning/point/pointllm_70k_formated.json", type='point'),
    ],
    rgbd=[
        dict(path="datasets/InstructionTuning/depth_normal/llava_instruct_50k_depth.json", type='rgbd'),
    ],
    rgbn=[
        dict(path="datasets/InstructionTuning/depth_normal/llava_instruct_50k_normal.json", type='rgbn'),
    ],
    imu=[
        dict(path="datasets/InstructionTuning/imu/imu_fixed_50k.json", type='imu'),
    ],
    fmri=[
        dict(path="datasets/InstructionTuning/fmri/fmri_fixed.json", type='fmri'),
    ],
)
IMU_PATH = "/mnt/petrelfs/share_data/hanjiaming/ego4d/v2/processed_imu/"


class FinetuneDialogDataset(Dataset):
    def __init__(self, dataset=['image'], transform=T_random_resized_crop, max_words=2048, image_words=30, tokenizer_path=None):
        if isinstance(dataset, str):
            dataset = [dataset]

        self.dataset = dataset

        group_ann = {}
        for d in dataset:
            for meta in DATASETS[d]:
                meta_path, meta_type = meta['path'], meta['type']
                meta_ext = os.path.splitext(meta_path)[-1]
                if meta_ext == ".json":
                    with open(meta_path) as f:
                        meta_l = json.load(f)
                        # add data_type
                        # this is a temp solution
                        new_meta_l = []
                        for l in meta_l:
                            l['data_type'] = meta_type
                            new_meta_l.append(l)
                        meta_l = new_meta_l
                elif meta_ext == ".jsonl":
                    meta_l = []
                    with open(meta_path) as f:
                        for i, line in enumerate(f):
                            try:
                                meta_l.append(json.loads(line))
                            except json.decoder.JSONDecodeError as e:
                                print(
                                    f"Error decoding the following jsonl line ({i}):\n{line.rstrip()}", force=True)
                                raise e
                else:
                    raise NotImplementedError(
                        f"Unknown meta file extension: \"{meta_ext}\". "
                        f"Currently, .json, .jsonl are supported. "
                        "If you are using a supported format, please set the file extension so that the proper parsing "
                        "routine can be called."
                    )
                if meta_type not in group_ann:
                    group_ann[meta_type] = []
                print(f"{meta_path}, type {meta_type}: len {len(meta_l)}")
                group_ann[meta_type] += meta_l

        # sort group_ann for higher efficiency (items in one global batch with similar length)
        for meta_type, meta_l in group_ann.items():
            meta_l.sort(key=lambda data_item: sum(
                [len(_['value']) for _ in data_item['conversations']]))

        self.group_ann = group_ann
        self.ann = sum(list(self.group_ann.values()), start=[])

        self.group_indices = {}
        start_pos = 0
        for meta_type, meta_l in self.group_ann.items():
            self.group_indices[meta_type] = list(
                range(start_pos, start_pos + len(meta_l)))
            start_pos = start_pos + len(meta_l)

        print(f"total length: {len(self)}")
        self.transform = transform
        print(f"transform:\n{self.transform}")
        self.max_words = max_words
        self.image_words = image_words
        self.tokenizer = Tokenizer(model_path=tokenizer_path)
        self.conversation_generator = ConversationGenerator(self.tokenizer)

        self.load_funcs = dict(
            image=self.load_image,
            audio=self.load_audio,
            video=self.load_video,
            point=self.load_point,
            rgbd=self.load_rgbx,
            rgbn=self.load_rgbx,
            imu=self.load_imu,
            fmri=self.load_fmri
        )

    def __len__(self):
        return len(self.ann)

    def load_image(self, data):
        filename = data['image']
        image = Image.open(filename).convert('RGB')
        image = self.transform(image)
        return image

    def load_audio(self, data):
        audio_path = data['image']
        fbank = make_audio_features(audio_path, mel_bins=128)
        fbank = fbank.transpose(0, 1)[None]  # [1, 128, 1024]
        return fbank

    def load_video(self, data):
        video_path = data['image']
        video_feats = video_utils.load_and_transform_video_data(
            video_path, video_path, clip_duration=1, clips_per_video=5)
        return video_feats[:, :, 0]

    def load_point(self, data):
        point_path = data['image']
        point_feat = torch.load(point_path, map_location='cpu')
        point_feat = point_feat.transpose(0, 1)
        return point_feat

    def load_rgbx(self, data):
        image_path = data['image']
        x_image_path = data['depth_image'] if 'depth_image' in data else data['normal_image']
        image = Image.open(image_path).convert('RGB')
        x_image = Image.open(x_image_path).convert('RGB')
        x_image = x_image.resize(image.size[-2:])

        image, x_image = transform_pairimg_train([image, x_image])
        # [2, 3, H, W]
        image = torch.stack([image, x_image], dim=0)
        return image

    def load_fmri(self, data):
        fmri_path = data['image']
        data = np.load(fmri_path)
        data = data.mean(axis=0)
        data = torch.tensor(data[None])
        return data

    def load_imu(self, data_dict):
        uid = data_dict["video_uid"]
        w_s = data_dict["window_start"]
        w_e = data_dict["window_end"]

        imu_data = get_imu_frames(
            IMU_PATH, uid,
            video_start_sec=w_s,
            video_end_sec=w_e,
        )
        if imu_data is None:
            raise ValueError
        return imu_data['signal']

    def __getitem__(self, index, expect_type=None):
        if expect_type is None:
            data_item = self.ann[index]
        else:
            # in case we want get data from specific data_type
            data_item = self.group_ann[expect_type][index]

        data_type = data_item['data_type']
        if data_type != 'text':
            if data_type in self.load_funcs:
                try:
                    image = self.load_funcs[data_type](data_item)
                    if image == None:
                        raise ValueError('Data is None')
                except:
                    print('Error', data_item)
                    rand_idx = random.randint(
                        0, len(self.group_ann[data_type]))
                    return self.__getitem__(rand_idx, expect_type=data_type)
            else:
                raise ValueError(f'Does not support {data_type}')
        else:
            image = None
            # warnings.warn("pure black image for examples without image")
            # image = torch.zeros(3, 224, 224)

        source = data_item["conversations"]
        conversation, to_predict_values = self.conversation_generator.add_speaker_and_signal(
            source)
        if len(to_predict_values) == 0:
            warnings.warn(
                f"see dialog data with nothing to predict, data: {data_item}")
            return self[index-1]

        tokenzed_conversation = self.tokenizer.encode(
            conversation, bos=True, eos=True)
        labels = [IGNORE_INDEX for _ in tokenzed_conversation]

        check_pos = 0
        for value in to_predict_values:
            tokenized_value = self.tokenizer.encode(
                value, bos=False, eos=False)
            value_pos = find_sublist(
                tokenzed_conversation[check_pos:], tokenized_value) + check_pos
            if value_pos == -1:
                print(
                    "a sentence mismatches the corresponding piece in the conversation")
                return self[index-1]
            labels[value_pos:value_pos+len(tokenized_value)] = tokenized_value
            assert labels[value_pos:value_pos+len(
                tokenized_value)] == tokenzed_conversation[value_pos:value_pos+len(tokenized_value)]
            check_pos = value_pos+len(tokenized_value)

        input2 = torch.tensor(tokenzed_conversation, dtype=torch.int64)
        labels = torch.tensor(labels, dtype=torch.int64)

        if image is not None:
            max_words = self.max_words - self.image_words
        else:
            max_words = self.max_words
        padding = max_words - input2.shape[0]
        if padding > 0:
            input2 = torch.cat(
                (input2, torch.zeros(padding, dtype=torch.int64) - 1))
            labels = torch.cat(
                (labels, torch.zeros(padding, dtype=torch.int64) - 1))
        elif padding < 0:
            input2 = input2[:max_words]
            labels = labels[:max_words]

        input2_mask = input2.ge(0)
|
430 |
-
label_mask = labels.ge(0)
|
431 |
-
input2[~input2_mask] = 0
|
432 |
-
labels[~label_mask] = 0
|
433 |
-
input2_mask = input2_mask.float()
|
434 |
-
label_mask = label_mask.float()
|
435 |
-
if image is None:
|
436 |
-
return input2, labels, data_item['data_type']
|
437 |
-
else:
|
438 |
-
return input2, labels, image, data_item['data_type']
|
439 |
-
|
440 |
-
def groups(self):
|
441 |
-
return list(self.group_indices.values())
|
442 |
-
|
443 |
-
|
444 |
-
def find_sublist(a: list, b: list):
|
445 |
-
len_a, len_b = len(a), len(b)
|
446 |
-
for i in range(len_a - len_b + 1):
|
447 |
-
if a[i:i+len_b] == b:
|
448 |
-
return i
|
449 |
-
return -1
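
As an aside, a minimal self-contained sketch of the label-masking scheme that the deleted __getitem__ builds with find_sublist: only the assistant replies keep real token ids as labels, every other position gets an ignore index. The token ids and the IGNORE_INDEX value below are invented for illustration; the real values come from the tokenizer and the dataset module.

# Illustration only: mask every position except the assistant reply span.
IGNORE_INDEX = -100  # assumed sentinel; the dataset defines the real value

def find_sublist(a: list, b: list):
    len_a, len_b = len(a), len(b)
    for i in range(len_a - len_b + 1):
        if a[i:i + len_b] == b:
            return i
    return -1

conversation_tokens = [1, 9038, 2501, 366, 263, 8444, 20255, 29889, 2]  # made-up ids
reply_tokens = [8444, 20255, 29889]                                     # one assistant turn
labels = [IGNORE_INDEX] * len(conversation_tokens)

pos = find_sublist(conversation_tokens, reply_tokens)
if pos != -1:
    labels[pos:pos + len(reply_tokens)] = reply_tokens
print(labels)  # only the reply span carries supervised targets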
data/imu_utils.py
DELETED
@@ -1,257 +0,0 @@
import string
import numpy as np
import matplotlib.animation as animation
from matplotlib import pyplot as plt
import json
from collections import defaultdict
from bisect import bisect_left
import os
import torch
import torchaudio
torchaudio.set_audio_backend("sox_io")


def load_json(json_path: str):
    """
    Load a json file
    """
    with open(json_path, "r", encoding="utf-8") as f_name:
        data = json.load(f_name)
    return data


def check_window_signal(info_t, w_s, w_e):
    length = w_e - w_s
    frame_offset = int(w_s * info_t.sample_rate)
    num_frames = int(length * info_t.sample_rate)
    if frame_offset + num_frames > int(info_t.num_frames):
        return False
    else:
        return True


def index_narrations(ann_path):
    narration_raw = load_json(ann_path)

    narration_dict = defaultdict(list)
    summary_dict = defaultdict(list)
    avg_len = []
    for v_id, narr in narration_raw.items():
        narr_list = []
        summ_list = []
        if "narration_pass_1" in narr:
            narr_list += narr["narration_pass_1"]["narrations"]
            summ_list += narr["narration_pass_1"]["summaries"]
        if "narration_pass_2" in narr:
            narr_list += narr["narration_pass_2"]["narrations"]
            summ_list += narr["narration_pass_2"]["summaries"]

        if len(narr_list) > 0:
            narration_dict[v_id] = [
                (
                    float(n_t["timestamp_sec"]),
                    n_t["narration_text"],
                    n_t["annotation_uid"],
                    n_t["timestamp_frame"],
                )
                for n_t in narr_list
            ]
            avg_len.append(len(narration_dict[v_id]))
        else:
            narration_dict[v_id] = []
        if len(summ_list) > 0:
            summary_dict[v_id] = [
                (
                    float(s_t["start_sec"]),
                    float(s_t["end_sec"]),
                    s_t["summary_text"],
                )
                for s_t in summ_list
            ]
        else:
            summary_dict[v_id] = []
    # print(f"Number of Videos with narration {len(narration_dict)}")
    # print(f"Avg. narration length {np.mean(avg_len)}")
    # print(f"Number of Videos with summaries {len(summary_dict)}")
    return narration_dict, summary_dict


def get_signal_info(signal_fn: str):
    return torchaudio.info(signal_fn)


def get_signal_frames(signal_fn: str, video_start_sec: float, video_end_sec: float):
    """
    Given a signal track return the frames between video_start_sec and video_end_sec
    """
    info_t = get_signal_info(signal_fn)

    length = video_end_sec - video_start_sec
    aframes, _ = torchaudio.load(
        signal_fn,
        normalize=True,
        frame_offset=int(video_start_sec * info_t.sample_rate),
        num_frames=int(length * info_t.sample_rate),
    )
    return {"signal": aframes, "meta": info_t}


def tosec(value):
    return value / 1000


def toms(value):
    return value * 1000


def delta(first_num: float, second_num: float):
    """Compute the absolute value of the difference of two numbers"""
    return abs(first_num - second_num)


def padIMU(signal, duration_sec):
    """
    Pad the signal if necessary
    """
    expected_elements = round(duration_sec) * 200

    if signal.shape[0] > expected_elements:
        signal = signal[:expected_elements, :]
    elif signal.shape[0] < expected_elements:
        padding = expected_elements - signal.shape[0]
        padded_zeros = np.zeros((padding, 6))
        signal = np.concatenate([signal, padded_zeros], 0)
        # signal = signal[:expected_elements, :]
    return signal


def resample(
    signals: np.ndarray,
    timestamps: np.ndarray,
    original_sample_rate: int,
    resample_rate: int,
):
    """
    Resamples data to new sample rate
    """
    signals = torch.as_tensor(signals)
    timestamps = torch.from_numpy(timestamps).unsqueeze(-1)
    signals = torchaudio.functional.resample(
        waveform=signals.data.T,
        orig_freq=original_sample_rate,
        new_freq=resample_rate,
    ).T.numpy()

    nsamples = len(signals)

    period = 1 / resample_rate

    # timestamps are expected to be shape (N, 1)
    initital_seconds = timestamps[0] / 1e3

    ntimes = (torch.arange(nsamples) * period).view(-1, 1) + initital_seconds

    timestamps = (ntimes * 1e3).squeeze().numpy()
    return signals, timestamps


def resampleIMU(signal, timestamps):
    sampling_rate = int(1000 * (1 / (np.mean(np.diff(timestamps)))))
    # resample all to 200hz
    if sampling_rate != 200:
        signal, timestamps = resample(signal, timestamps, sampling_rate, 200)
    return signal, timestamps


def get_imu_frames(
    imu_path,
    uid: str,
    video_start_sec: float,
    video_end_sec: float,
):
    """
    Given a IMU signal return the frames between video_start_sec and video_end_sec
    """
    signal = np.load(os.path.join(imu_path, f"{uid}.npy"))
    signal = signal.transpose()
    timestamps = np.load(os.path.join(imu_path, f"{uid}_timestamps.npy"))

    if toms(video_start_sec) > timestamps[-1] or toms(video_end_sec) > timestamps[-1]:
        return None

    start_id = bisect_left(timestamps, toms(video_start_sec))
    end_id = bisect_left(timestamps, toms(video_end_sec))

    # make sure the retrieved window interval are correct by a max of 1 sec margin
    if (
        delta(video_start_sec, tosec(timestamps[start_id])) > 4
        or delta(video_end_sec, tosec(timestamps[end_id])) > 4
    ):
        return None

    # get the window
    if start_id == end_id:
        start_id -= 1
        end_id += 1
    signal, timestamps = signal[start_id:end_id], timestamps[start_id:end_id]

    if len(signal) < 10 or len(timestamps) < 10:
        return None
    # resample the signal at 200hz if necessary
    signal, timestamps = resampleIMU(signal, timestamps)

    # pad the signal if necessary
    signal = padIMU(signal, video_end_sec - video_start_sec)

    sample_dict = {
        "timestamp": timestamps,
        "signal": torch.tensor(signal.T),
        "sampling_rate": 200,
    }

    return sample_dict


def display_animation(frames, title, save_path_gif):
    fig, ax = plt.subplots()
    frames = [[ax.imshow(frames[i])] for i in range(len(frames))]
    plt.title(title)
    ani = animation.ArtistAnimation(fig, frames)
    ani.save(save_path_gif, writer="imagemagick")
    plt.close()


def display_animation_imu(frames, imu, title, save_path_gif):
    fig, (ax1, ax2, ax3) = plt.subplots(3, 1)
    ax1.set_title(title)
    ax2.set_title("Acc.")
    ax3.set_title("Gyro.")
    frames = [[ax1.imshow(frames[i])] for i in range(len(frames))]
    ani = animation.ArtistAnimation(fig, frames)

    ax2.plot(imu[0].cpu().numpy(), color="red")
    ax2.plot(imu[1].cpu().numpy(), color="blue")
    ax2.plot(imu[2].cpu().numpy(), color="green")
    ax3.plot(imu[3].cpu().numpy(), color="red")
    ax3.plot(imu[4].cpu().numpy(), color="blue")
    ax3.plot(imu[5].cpu().numpy(), color="green")
    plt.tight_layout()
    ani.save(save_path_gif, writer="imagemagick")
    plt.close()


def filter_narration(narration_text: str) -> bool:
    if "#c" in narration_text.lower():
        return True
    return False


def clean_narration_text(narration_text: str) -> str:
    return (
        narration_text.replace("#C C ", "")
        .replace("#C", "")
        .replace("#unsure", "something")
        .strip()
        .strip(string.punctuation)
        .lower()[:128]
    )
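
A small, self-contained sketch of the 200 Hz padding convention that get_imu_frames relies on via padIMU; the input window below is synthetic (random values) and only numpy is needed, so it runs without any IMU files on disk.

import numpy as np

def padIMU(signal, duration_sec):
    # Same rule as above: expect duration_sec * 200 samples of a 6-channel (accel + gyro) signal.
    expected_elements = round(duration_sec) * 200
    if signal.shape[0] > expected_elements:
        signal = signal[:expected_elements, :]
    elif signal.shape[0] < expected_elements:
        padding = expected_elements - signal.shape[0]
        signal = np.concatenate([signal, np.zeros((padding, 6))], 0)
    return signal

window = np.random.randn(750, 6)             # a 3.75 s window at 200 Hz (synthetic)
print(padIMU(window, duration_sec=4).shape)  # (800, 6): zero-padded up to 4 s * 200 Hz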
data/video_utils.py
DELETED
@@ -1,204 +0,0 @@
import math
import torch
import torch.nn as nn
from pytorchvideo import transforms as pv_transforms
from pytorchvideo.data.clip_sampling import ConstantClipsPerVideoSampler
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.data.encoded_video_decord import EncodedVideoDecord
from torchvision import transforms
from torchvision.transforms._transforms_video import NormalizeVideo


def get_clip_timepoints(clip_sampler, duration):
    # Read out all clips in this video
    all_clips_timepoints = []
    is_last_clip = False
    end = 0.0
    while not is_last_clip:
        start, end, _, _, is_last_clip = clip_sampler(end, duration, annotation=None)
        all_clips_timepoints.append((start, end))
    return all_clips_timepoints


def crop_boxes(boxes, x_offset, y_offset):
    """
    Perform crop on the bounding boxes given the offsets.
    Args:
        boxes (ndarray or None): bounding boxes to perform crop. The dimension
            is `num boxes` x 4.
        x_offset (int): cropping offset in the x axis.
        y_offset (int): cropping offset in the y axis.
    Returns:
        cropped_boxes (ndarray or None): the cropped boxes with dimension of
            `num boxes` x 4.
    """
    cropped_boxes = boxes.copy()
    cropped_boxes[:, [0, 2]] = boxes[:, [0, 2]] - x_offset
    cropped_boxes[:, [1, 3]] = boxes[:, [1, 3]] - y_offset

    return cropped_boxes


def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None):
    """
    Perform uniform spatial sampling on the images and corresponding boxes.
    Args:
        images (tensor): images to perform uniform crop. The dimension is
            `num frames` x `channel` x `height` x `width`.
        size (int): size of height and weight to crop the images.
        spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width
            is larger than height. Or 0, 1, or 2 for top, center, and bottom
            crop if height is larger than width.
        boxes (ndarray or None): optional. Corresponding boxes to images.
            Dimension is `num boxes` x 4.
        scale_size (int): optinal. If not None, resize the images to scale_size before
            performing any crop.
    Returns:
        cropped (tensor): images with dimension of
            `num frames` x `channel` x `size` x `size`.
        cropped_boxes (ndarray or None): the cropped boxes with dimension of
            `num boxes` x 4.
    """
    assert spatial_idx in [0, 1, 2]
    ndim = len(images.shape)
    if ndim == 3:
        images = images.unsqueeze(0)
    height = images.shape[2]
    width = images.shape[3]

    if scale_size is not None:
        if width <= height:
            width, height = scale_size, int(height / width * scale_size)
        else:
            width, height = int(width / height * scale_size), scale_size
        images = torch.nn.functional.interpolate(
            images,
            size=(height, width),
            mode="bilinear",
            align_corners=False,
        )

    y_offset = int(math.ceil((height - size) / 2))
    x_offset = int(math.ceil((width - size) / 2))

    if height > width:
        if spatial_idx == 0:
            y_offset = 0
        elif spatial_idx == 2:
            y_offset = height - size
    else:
        if spatial_idx == 0:
            x_offset = 0
        elif spatial_idx == 2:
            x_offset = width - size
    cropped = images[:, :, y_offset : y_offset + size, x_offset : x_offset + size]
    cropped_boxes = crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None
    if ndim == 3:
        cropped = cropped.squeeze(0)
    return cropped, cropped_boxes


class SpatialCrop(nn.Module):
    """
    Convert the video into 3 smaller clips spatially. Must be used after the
        temporal crops to get spatial crops, and should be used with
        -2 in the spatial crop at the slowfast augmentation stage (so full
        frames are passed in here). Will return a larger list with the
        3x spatial crops as well.
    """

    def __init__(self, crop_size: int = 224, num_crops: int = 3):
        super().__init__()
        self.crop_size = crop_size
        if num_crops == 3:
            self.crops_to_ext = [0, 1, 2]
            self.flipped_crops_to_ext = []
        elif num_crops == 1:
            self.crops_to_ext = [1]
            self.flipped_crops_to_ext = []
        else:
            raise NotImplementedError("Nothing else supported yet")

    def forward(self, videos):
        """
        Args:
            videos: A list of C, T, H, W videos.
        Returns:
            videos: A list with 3x the number of elements. Each video converted
                to C, T, H', W' by spatial cropping.
        """
        assert isinstance(videos, list), "Must be a list of videos after temporal crops"
        assert all([video.ndim == 4 for video in videos]), "Must be (C,T,H,W)"
        res = []
        for video in videos:
            for spatial_idx in self.crops_to_ext:
                res.append(uniform_crop(video, self.crop_size, spatial_idx)[0])
            if not self.flipped_crops_to_ext:
                continue
            flipped_video = transforms.functional.hflip(video)
            for spatial_idx in self.flipped_crops_to_ext:
                res.append(uniform_crop(flipped_video, self.crop_size, spatial_idx)[0])
        return res


def load_and_transform_video_data(
    video_file,
    video_path,
    clip_duration=2,
    clips_per_video=5,
    sample_rate=16000,
    with_audio=False
):
    video_transform = transforms.Compose(
        [
            pv_transforms.ShortSideScale(224),
            NormalizeVideo(
                mean=(0.48145466, 0.4578275, 0.40821073),
                std=(0.26862954, 0.26130258, 0.27577711),
            ),
        ]
    )

    clip_sampler = ConstantClipsPerVideoSampler(
        clip_duration=clip_duration, clips_per_video=clips_per_video
    )
    frame_sampler = pv_transforms.UniformTemporalSubsample(num_samples=clip_duration)

    if isinstance(video_file, str):
        video = EncodedVideo.from_path(
            video_file,
            decoder="decord",
            decode_audio=with_audio,
            # **{"sample_rate": sample_rate},
        )
    else:
        video = EncodedVideoDecord(video_file, video_name=video_path, decode_video=True, decode_audio=with_audio, sample_rate=sample_rate)

    all_clips_timepoints = get_clip_timepoints(clip_sampler, video.duration)

    all_video = []
    for clip_timepoints in all_clips_timepoints:
        # Read the clip, get frames
        clip = video.get_clip(clip_timepoints[0], clip_timepoints[1])
        if clip is None:
            raise ValueError("No clip found")
        video_clip = frame_sampler(clip["video"])
        video_clip = video_clip / 255.0  # since this is float, need 0-1

        all_video.append(video_clip)

    all_video = [video_transform(clip) for clip in all_video]
    all_video = SpatialCrop(224, num_crops=3)(all_video)

    all_video = torch.stack(all_video, dim=0)

    if not with_audio:
        return all_video
    else:
        return all_video, clip['audio']


if __name__ == '__main__':
    video_path = "datasets/InstructionTuning/video/music_aqa/MUSIC-AVQA-videos-Real/00000002.mp4"
    video, audio = load_and_transform_video_data(video_path, video_path, clip_duration=1, clips_per_video=5, with_audio=True)
    import pdb;pdb.set_trace()
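
For reference, a short sketch of how the clip sampling above enumerates (start, end) windows. It reuses the same get_clip_timepoints loop and assumes pytorchvideo is installed; the clip_duration, clips_per_video and duration values are purely illustrative.

from pytorchvideo.data.clip_sampling import ConstantClipsPerVideoSampler

def get_clip_timepoints(clip_sampler, duration):
    # Same loop as above: collect (start, end) pairs until the sampler flags the last clip.
    all_clips_timepoints = []
    is_last_clip = False
    end = 0.0
    while not is_last_clip:
        start, end, _, _, is_last_clip = clip_sampler(end, duration, annotation=None)
        all_clips_timepoints.append((start, end))
    return all_clips_timepoints

sampler = ConstantClipsPerVideoSampler(clip_duration=1, clips_per_video=5)
print(get_clip_timepoints(sampler, duration=10.0))  # five 1-second windows spread over 10 s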
demos/multi_turn_mm.py
DELETED
@@ -1,300 +0,0 @@
import sys
import os
sys.path.append(os.path.abspath(__file__).rsplit('/', 2)[0])

import argparse
import multiprocessing as mp
import numpy as np
from typing import List, Optional

import torch
import torch.distributed as dist

from fairscale.nn.model_parallel import initialize as fs_init

import gradio as gr
from util.misc import setup_for_distributed
from util.misc import default_tensor_type
from model.meta import MetaModel
from data.conversation_lib import conv_templates, SeparatorStyle
from PIL import Image
import torchvision.transforms as transforms
from data.fintune_dataset import make_audio_features
from data import video_utils


T_random_resized_crop = transforms.Compose([
    transforms.RandomResizedCrop(size=(224, 224), scale=(0.9, 1.0), ratio=(0.75, 1.3333), interpolation=3,
                                 antialias=None),  # 3 is bicubic
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])])


def load_audio(audio_path):
    fbank = make_audio_features(audio_path, mel_bins=128)
    fbank = fbank.transpose(0, 1)[None]  # [1, 128, 1024]
    return fbank

def load_video(video_path):
    video_feats = video_utils.load_and_transform_video_data(video_path, video_path, clip_duration=1, clips_per_video=5)
    return video_feats[:, :, 0]


def model_worker(
    rank: int, args: argparse.Namespace, barrier: mp.Barrier,
    request_queue: mp.Queue, response_queue: Optional[mp.Queue] = None,
) -> None:
    """
    The worker function that manipulates the GPU to run the inference.
    Exact n_gpu workers are started, with each one operating on a separate GPU.

    Args:
        rank (int): Distributed rank of the worker.
        args (argparse.Namespace): All command line arguments.
        barrier (multiprocessing.Barrier): A barrier used to delay the start
            of Web UI to be after the start of the model.
    """

    world_size = len(args.gpu_ids)
    gpu_id = args.gpu_ids[rank]
    dist.init_process_group(
        backend="nccl", rank=rank, world_size=world_size,
        init_method=f"tcp://{args.master_addr}:{args.master_port}",
    )
    print(f"| distributed init on worker {rank}/{world_size}. "
          f"using gpu: {gpu_id}")
    fs_init.initialize_model_parallel(world_size)
    torch.cuda.set_device(gpu_id)

    torch.manual_seed(1)
    np.random.seed(1)

    # set the print behavior.
    setup_for_distributed(rank == 0)

    target_dtype = {
        "bf16": torch.bfloat16,
        "fp16": torch.float16
    }[args.dtype]
    with default_tensor_type(dtype=target_dtype, device="cuda"):
        model = MetaModel(args.llama_type, args.llama_config, tokenizer_path=args.tokenizer_path)
    print("Loading pretrained weights ...")
    checkpoint = torch.load(args.pretrained_path, map_location='cpu')
    msg = model.load_state_dict(checkpoint, strict=False)
    print("load result:\n", msg)
    model.cuda()
    model.eval()
    print(f"Model = {str(model)}")

    barrier.wait()

    while True:
        img_path, audio_path, video_path, chatbot, max_gen_len, temperature, top_p, modality = request_queue.get()
        if 'image' in modality and img_path is not None:
            image = Image.open(img_path).convert('RGB')
            inputs = T_random_resized_crop(image)
        elif 'video' in modality and video_path is not None:
            inputs = load_video(video_path)
        elif 'audio' in modality and audio_path is not None:
            inputs = load_audio(audio_path)
        else:
            inputs = None

        if inputs is not None:
            inputs = inputs[None].cuda().to(target_dtype)

        conv = conv_templates["v1"].copy()
        for user, bot in chatbot:
            conv.append_message(conv.roles[0], user)
            conv.append_message(conv.roles[1], bot)

        with torch.cuda.amp.autocast(dtype=target_dtype):
            print(conv.get_prompt())
            for stream_response in model.stream_generate(
                conv.get_prompt(), inputs,
                max_gen_len=max_gen_len, temperature=temperature, top_p=top_p,
                modal = modality
            ):
                conv_sep = (
                    conv.sep
                    if conv.sep_style == SeparatorStyle.SINGLE
                    else conv.sep2
                )
                end_pos = stream_response["text"].find(conv_sep)
                if end_pos != -1:
                    stream_response["text"] = (
                        stream_response['text'][:end_pos].rstrip() + "\n"
                    )
                    stream_response["end_of_content"] = True

                # keep a few characters if not end_of_content to avoid sending
                # part of conv_sep before all of it is generated.
                if not stream_response["end_of_content"]:
                    if len(stream_response["text"]) < len(conv_sep):
                        continue
                    stream_response["text"] = (
                        stream_response["text"][:-len(conv_sep)]
                    )

                if response_queue is not None:
                    response_queue.put(stream_response)

                if stream_response["end_of_content"]:
                    break


def gradio_worker(
    request_queues: List[mp.Queue], response_queue: mp.Queue,
    args: argparse.Namespace, barrier: mp.Barrier,
) -> None:
    """
    The gradio worker is responsible for displaying the WebUI and relay the
    requests to model workers. It should be launched only once.

    Args:
        request_queues (List[mp.Queue]): A list of request queues (one for
            each model worker).
        args (argparse.Namespace): All command line arguments.
        barrier (multiprocessing.Barrier): A barrier used to delay the start
            of Web UI to be after the start of the model.
    """

    def show_user_input(msg, chatbot):
        return "", chatbot + [[msg, None]]

    def stream_model_output(img_path, audio_path, video_path, chatbot, max_gen_len, gen_t, top_p, modality):
        for queue in request_queues:
            queue.put((img_path, audio_path, video_path, chatbot, max_gen_len, gen_t, top_p, modality))
        while True:
            content_piece = response_queue.get()
            chatbot[-1][1] = content_piece["text"]
            yield chatbot
            if content_piece["end_of_content"]:
                break

    def undo(chatbot):
        if len(chatbot) > 0:
            chatbot = chatbot[:-1]
        return chatbot

    def clear():
        chatbot = []
        msg = ""
        return chatbot, msg

    CSS ="""
    .contain { display: flex; flex-direction: column; }
    #component-0 { height: 100%; }
    #chatbot { flex-grow: 1; overflow: auto;}
    """
    with gr.Blocks(css=CSS) as demo:
        gr.Markdown("## OneLLM: One Framework to Align All Modalities with Language")
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                img_path = gr.Image(label='Image Input', type='filepath')
                video_path = gr.Video(label='Video Input')
                audio_path = gr.Audio(label='Audio Input', type='filepath', sources=['upload'])
                modality = gr.Radio(choices=['image', 'audio', 'video'], value='image', interactive=True, label='Input Modalities')

            with gr.Column(scale=2):
                chatbot = gr.Chatbot(elem_id="chatbot")
                msg = gr.Textbox()

                with gr.Row():
                    submit_button = gr.Button("Submit", variant="primary")
                    undo_button = gr.Button("Undo")
                    clear_button = gr.ClearButton([chatbot, msg, img_path, audio_path, video_path, modality])
                with gr.Row():
                    max_gen_len = gr.Slider(
                        minimum=1, maximum=args.model_max_seq_len // 2,
                        value=args.model_max_seq_len // 2, interactive=True,
                        label="Single-turn max response length",
                    )
                    gen_t = gr.Slider(
                        minimum=0, maximum=1, value=0.1, interactive=True,
                        label="Temperature",
                    )
                    top_p = gr.Slider(
                        minimum=0, maximum=1, value=0.75, interactive=True,
                        label="Top-p",
                    )
        msg.submit(
            show_user_input, [msg, chatbot], [msg, chatbot],
        ).then(
            stream_model_output, [img_path, audio_path, video_path, chatbot, max_gen_len, gen_t, top_p, modality], chatbot,
        )
        submit_button.click(
            show_user_input, [msg, chatbot], [msg, chatbot],
        ).then(
            stream_model_output, [img_path, audio_path, video_path, chatbot, max_gen_len, gen_t, top_p, modality], chatbot,
        )
        undo_button.click(undo, chatbot, chatbot)
        # img_path.change(clear, [], [chatbot, msg])
    barrier.wait()
    demo.queue(api_open=True).launch(share=True, max_threads=1)


if __name__ == "__main__":
    parser = argparse.ArgumentParser("Chat Demo")
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "--gpu_ids", type=int, nargs="+",
        help="A list of space-separated gpu ids to run the model on. "
             "The model will span across GPUs in tensor-parallel mode."
    )
    parser.add_argument(
        "--tokenizer_path", type=str,
        help="Path to the tokenizer.model file provided along with the LLaMA "
             "model."
    )
    parser.add_argument(
        "--llama_type", default="onellm", type=str, metavar="MODEL",
        help="LLaMA model type."
    )
    parser.add_argument(
        "--llama_config", type=str, required=True,
        help="Path to the llama model config json."
    )
    parser.add_argument(
        "--model_max_seq_len", type=int, default=2048,
        help="Max sequence length accepted by the pretrained model."
    )
    parser.add_argument(
        "--pretrained_path", type=str, required=True,
        help="Path to the llama model checkpoints. A list of checkpoints is "
             "supported and will be merged from left to right.")
    parser.add_argument(
        "--master_port", type=int, default=23862,
        help="A port used by the PyTorch distributed module to initialize."
    )
    parser.add_argument(
        "--master_addr", type=str, default="127.0.0.1",
        help="An address used by the PyTorch distributed module to initialize."
    )
    parser.add_argument(
        "--dtype", type=str, choices=["fp16", "bf16"], default="fp16",
        help="The dtype used for model weights and inference."
    )
    args = parser.parse_args()

    # using the default "fork" method messes up some imported libs (e.g.,
    # pandas)
    mp.set_start_method("spawn")

    # setup the queues and start the model workers
    request_queues = []
    response_queue = mp.Queue()
    worker_processes = []
    barrier = mp.Barrier(len(args.gpu_ids) + 1)
    for rank, gpu_id in enumerate(args.gpu_ids):
        request_queue = mp.Queue()
        rank_response_queue = response_queue if rank == 0 else None
        process = mp.Process(
            target=model_worker,
            args=(rank, args, barrier, request_queue, rank_response_queue),
        )
        process.start()
        worker_processes.append(process)
        request_queues.append(request_queue)

    gradio_worker(request_queues, response_queue, args, barrier)
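
A toy sketch of the request/response queue protocol between gradio_worker and model_worker above, with the GPU model replaced by a fake worker that streams two chunks. Everything here (the echo behaviour, the dummy tuple contents) is illustrative only; it just shows how the Gradio side drains responses until "end_of_content".

import multiprocessing as mp

def fake_model_worker(request_queue, response_queue):
    # Consume one request tuple in the same order the real worker expects.
    img, audio, video, chatbot, max_gen_len, temperature, top_p, modality = request_queue.get()
    prompt = chatbot[-1][0]
    response_queue.put({"text": prompt.upper(), "end_of_content": False})
    response_queue.put({"text": prompt.upper() + "!", "end_of_content": True})

if __name__ == "__main__":
    req, resp = mp.Queue(), mp.Queue()
    worker = mp.Process(target=fake_model_worker, args=(req, resp))
    worker.start()
    chatbot = [["hello", None]]
    req.put((None, None, None, chatbot, 128, 0.1, 0.75, "image"))
    while True:
        piece = resp.get()
        chatbot[-1][1] = piece["text"]   # stream partial text into the last chat turn
        if piece["end_of_content"]:
            break
    worker.join()
    print(chatbot)  # [['hello', 'HELLO!']]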
examples/bell_ring.wav
DELETED
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:415b5406175cafa874396f89811121fb306c17084db8ec20753ab5666b4fdcca
size 3630044

examples/bird_audio.wav
DELETED
Binary file (882 kB)

examples/depth_normal/depth/0084.png
DELETED
Binary file (10.9 kB)

examples/depth_normal/depth/0131.png
DELETED
Binary file (5.5 kB)

examples/depth_normal/depth/0297.png
DELETED
Binary file (11.1 kB)

examples/depth_normal/depth/0331.png
DELETED
Binary file (10.6 kB)

examples/depth_normal/depth/0432.png
DELETED
Binary file (7.89 kB)

examples/depth_normal/depth/0633.png
DELETED
Binary file (9.35 kB)

examples/depth_normal/depth/0663.png
DELETED
Binary file (5.68 kB)

examples/depth_normal/depth/0771.png
DELETED
Binary file (9.65 kB)

examples/depth_normal/depth/0782.png
DELETED
Binary file (9.58 kB)

examples/depth_normal/depth/1001.png
DELETED
Binary file (7.17 kB)

examples/depth_normal/depth/1051.png
DELETED
Binary file (6.81 kB)

examples/depth_normal/depth/1129.png
DELETED
Binary file (6.66 kB)

examples/depth_normal/depth/1205.png
DELETED
Binary file (9.24 kB)

examples/depth_normal/depth/1336.png
DELETED
Binary file (11.5 kB)

examples/depth_normal/depth/1383.png
DELETED
Binary file (9.98 kB)

examples/depth_normal/depth/1386.png
DELETED
Binary file (12.2 kB)

examples/depth_normal/depth/1393.png
DELETED
Binary file (8.6 kB)

examples/depth_normal/depth/1447.png
DELETED
Binary file (10.3 kB)

examples/depth_normal/depth_scaled/0084.png
DELETED
Binary file (11.1 kB)

examples/depth_normal/depth_scaled/0131.png
DELETED
Binary file (5.68 kB)

examples/depth_normal/depth_scaled/0297.png
DELETED
Binary file (11.8 kB)

examples/depth_normal/depth_scaled/0331.png
DELETED
Binary file (11.1 kB)

examples/depth_normal/depth_scaled/0432.png
DELETED
Binary file (8.24 kB)

examples/depth_normal/depth_scaled/0633.png
DELETED
Binary file (9.96 kB)

examples/depth_normal/depth_scaled/0663.png
DELETED
Binary file (5.91 kB)

examples/depth_normal/depth_scaled/0771.png
DELETED
Binary file (10.1 kB)

examples/depth_normal/depth_scaled/0782.png
DELETED
Binary file (9.98 kB)

examples/depth_normal/depth_scaled/1001.png
DELETED
Binary file (7.19 kB)

examples/depth_normal/depth_scaled/1051.png
DELETED
Binary file (7.05 kB)

examples/depth_normal/depth_scaled/1129.png
DELETED
Binary file (6.9 kB)

examples/depth_normal/depth_scaled/1205.png
DELETED
Binary file (9.58 kB)