# Tencent HunyuanWorld-1.0 is licensed under TENCENT HUNYUANWORLD-1.0 COMMUNITY LICENSE AGREEMENT
# THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND 
# IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
# By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying 
# any portion or element of the Tencent HunyuanWorld-1.0 Works, including via any Hosted Service, 
# You will be deemed to have recognized and accepted the content of this Agreement, 
# which is effective immediately.

# For avoidance of doubts, Tencent HunyuanWorld-1.0 means the 3D generation models 
# and their software and algorithms, including trained model weights, parameters (including 
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, 
# fine-tuning enabling code and other elements of the foregoing made publicly available 
# by Tencent at [https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0].
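
# Example invocations (a sketch; the script name demo_panogen.py is an
# assumption, substitute this file's actual name):
#   text-to-panorama:
#     python demo_panogen.py --prompt "a sunny tropical beach" --output_path results
#   image-to-panorama:
#     python demo_panogen.py --prompt "a sunny tropical beach" \
#         --image_path input.png --output_path results
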
import os
import torch
import numpy as np 

import cv2
from PIL import Image

import argparse

# hunyuan3d text to panorama
from hy3dworld import Text2PanoramaPipelines

# hunyuan3d image to panorama
from hy3dworld import Image2PanoramaPipelines
from hy3dworld import Perspective


class Text2PanoramaDemo:
    def __init__(self):
        # default output resolution (equirectangular, 2:1 aspect)
        self.height = 960
        self.width = 1920

        # panorama generation parameters; adjust them to your needs
        self.guidance_scale = 30        # classifier-free guidance strength
        self.shifting_extend = 0        # horizontal shifting extension (not passed to the text pipeline below)
        self.num_inference_steps = 50   # diffusion denoising steps
        self.true_cfg_scale = 0.0       # 0 disables true classifier-free guidance
        self.blend_extend = 6           # controls blending across the panorama's left/right wrap-around seam

        # model paths
        self.lora_path = "tencent/HunyuanWorld-1"
        self.model_path = "black-forest-labs/FLUX.1-dev"
        # load the pipeline
        # use bfloat16 to save some VRAM
        self.pipe = Text2PanoramaPipelines.from_pretrained(
            self.model_path,
            torch_dtype=torch.bfloat16
        ).to("cuda")
        # and load the HunyuanWorld panorama LoRA weights on top
        self.pipe.load_lora_weights(
            self.lora_path,
            subfolder="HunyuanWorld-PanoDiT-Text",
            weight_name="lora.safetensors",
            torch_dtype=torch.bfloat16
        )
        # save some VRAM by offloading the model to CPU
        self.pipe.enable_model_cpu_offload()
        self.pipe.enable_vae_tiling()  # and enable vae tiling to save some VRAM

    def run(self, prompt, negative_prompt=None, seed=42, output_path='output_panorama'):
        # generate the panorama
        image = self.pipe(
            prompt,
            height=self.height,
            width=self.width,
            negative_prompt=negative_prompt,
            generator=torch.Generator("cpu").manual_seed(seed),
            num_inference_steps=self.num_inference_steps,
            guidance_scale=self.guidance_scale,
            blend_extend=self.blend_extend,
            true_cfg_scale=self.true_cfg_scale,
        ).images[0]

        # create the output directory if it does not exist
        os.makedirs(output_path, exist_ok=True)
        # convert to PIL if needed and save the panorama
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)
        image.save(os.path.join(output_path, 'panorama.png'))

        return image
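
# Programmatic use (a sketch; assumes a CUDA GPU with enough VRAM to run
# FLUX.1-dev with CPU offload enabled):
#   demo = Text2PanoramaDemo()
#   demo.run("a mountain lake at sunrise", seed=0, output_path="results")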


class Image2PanoramaDemo:
    def __init__(self):
        # default output resolution (equirectangular, 2:1 aspect)
        self.height, self.width = 960, 1920  # alternatively: 768, 1536

        # panorama generation parameters; adjust them to your needs
        self.THETA = 0      # horizontal angle (degrees) at which the input view is placed
        self.PHI = 0        # vertical angle (degrees) at which the input view is placed
        self.FOV = 80       # field of view (degrees) assumed for the input image
        self.guidance_scale = 30        # classifier-free guidance strength
        self.num_inference_steps = 50   # diffusion denoising steps
        self.true_cfg_scale = 2.0       # enables true classifier-free guidance with the negative prompt
        self.shifting_extend = 0        # horizontal shifting extension
        self.blend_extend = 6           # controls blending across the panorama's left/right wrap-around seam

        # model paths
        self.lora_path = "tencent/HunyuanWorld-1"
        self.model_path = "black-forest-labs/FLUX.1-Fill-dev"
        # load the pipeline
        # use bfloat16 to save some VRAM
        self.pipe = Image2PanoramaPipelines.from_pretrained(
            self.model_path,
            torch_dtype=torch.bfloat16
        ).to("cuda")
        # and load the HunyuanWorld panorama LoRA weights on top
        self.pipe.load_lora_weights(
            self.lora_path,
            subfolder="HunyuanWorld-PanoDiT-Image",
            weight_name="lora.safetensors",
            torch_dtype=torch.bfloat16
        )
        # save some VRAM by offloading the model to CPU
        self.pipe.enable_model_cpu_offload()
        self.pipe.enable_vae_tiling()  # and enable vae tiling to save some VRAM

        # general prompt suffixes appended to every request
        self.general_negative_prompt = (
            "human, person, people, messy, "
            "low-quality, blur, noise, low-resolution"
        )
        self.general_positive_prompt = "high-quality, high-resolution, sharp, clear, 8k"

    def run(self, prompt, negative_prompt, image_path, seed=42, output_path='output_panorama'):
        # append the general positive/negative prompt suffixes
        prompt = prompt + ", " + self.general_positive_prompt
        negative_prompt = self.general_negative_prompt + ", " + negative_prompt

        # read the input perspective image (OpenCV loads BGR)
        perspective_img = cv2.imread(image_path)
        if perspective_img is None:
            raise ValueError(f"Could not read image at {image_path}")
        height_fov, width_fov = perspective_img.shape[:2]
        if width_fov > height_fov:
            ratio = width_fov / height_fov
            w = int((self.FOV / 360) * self.width)
            h = int(w / ratio)
            perspective_img = cv2.resize(
                perspective_img, (w, h), interpolation=cv2.INTER_AREA)
        else:
            ratio = height_fov / width_fov
            h = int((self.FOV / 180) * self.height)
            w = int(h / ratio)
            perspective_img = cv2.resize(
                perspective_img, (w, h), interpolation=cv2.INTER_AREA)
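        # NOTE: the resize above maps the input onto the share of the
        # equirectangular canvas its FOV covers: the longer side spans
        # FOV/360 of the panorama width (or FOV/180 of its height), with the
        # shorter side scaled to preserve the input aspect ratio.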

        
        # project the perspective image onto an equirectangular canvas;
        # the returned mask marks the pixels covered by the projection
        equ = Perspective(perspective_img, self.FOV,
                          self.THETA, self.PHI, crop_bound=False)
        img, mask = equ.GetEquirec(self.height, self.width)
        # erode the valid-pixel mask so the generated region slightly overlaps
        # the projected image, which helps hide the seam
        mask = cv2.erode(mask.astype(np.uint8), np.ones(
            (3, 3), np.uint8), iterations=5)

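        # keep projected pixels and zero everything else, then invert the mask
        # so white marks the region for the fill pipeline to synthesize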
        img = img * mask

        mask = mask.astype(np.uint8) * 255
        mask = 255 - mask

        mask = Image.fromarray(mask[:, :, 0])  # single-channel PIL mask
        img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB)  # BGR -> RGB for PIL
        img = Image.fromarray(img)

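        # outpaint the remaining panorama around the projected view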
        image = self.pipe(
            prompt=prompt,
            image=img,
            mask_image=mask,
            height=self.height,
            width=self.width,
            negative_prompt=negative_prompt,
            guidance_scale=self.guidance_scale,
            num_inference_steps=self.num_inference_steps,
            generator=torch.Generator("cpu").manual_seed(seed),
            blend_extend=self.blend_extend,
            shifting_extend=self.shifting_extend,
            true_cfg_scale=self.true_cfg_scale,
        ).images[0]

        # create the output directory if it does not exist, then save
        os.makedirs(output_path, exist_ok=True)
        image.save(os.path.join(output_path, 'panorama.png'))

        return image
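
# Programmatic use (a sketch; assumes an input photo at ./input.png):
#   demo = Image2PanoramaDemo()
#   demo.run("a sunny beach", "watermark, text", "input.png", seed=0,
#            output_path="results")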


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Text/Image to Panorama Demo")
    parser.add_argument("--prompt", type=str,
                        default="", help="Prompt for image generation")
    parser.add_argument("--negative_prompt", type=str,
                        default="", help="Negative prompt for image generation")
    parser.add_argument("--image_path", type=str,
                        default=None, help="Path to the input image")
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed for reproducibility")
    parser.add_argument("--output_path", type=str, default="results",
                        help="Path to save the output results")

    args = parser.parse_args()

    os.makedirs(args.output_path, exist_ok=True)
    print(f"Output will be saved to: {args.output_path}")

    if args.image_path is None:
        print("No image path provided, using text-to-panorama generation.")
        demo_T2P = Text2PanoramaDemo()
        panorama_image = demo_T2P.run(
            args.prompt, args.negative_prompt, args.seed, args.output_path)
    else:
        if not os.path.exists(args.image_path):
            raise FileNotFoundError(
                f"Image path {args.image_path} does not exist.")
        print(f"Using image at {args.image_path} for panorama generation.")
        demo_I2P = Image2PanoramaDemo()
        panorama_image = demo_I2P.run(
            args.prompt, args.negative_prompt, args.image_path, args.seed, args.output_path)