# Tencent HunyuanWorld-1.0 is licensed under TENCENT HUNYUANWORLD-1.0 COMMUNITY LICENSE AGREEMENT
# THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND
# IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
# By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying
# any portion or element of the Tencent HunyuanWorld-1.0 Works, including via any Hosted Service,
# You will be deemed to have recognized and accepted the content of this Agreement,
# which is effective immediately.
# For avoidance of doubts, Tencent HunyuanWorld-1.0 means the 3D generation models
# and their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent at [https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0].

import os
import torch
import numpy as np
import cv2
from PIL import Image
import argparse

# HunyuanWorld text-to-panorama pipeline
from hy3dworld import Text2PanoramaPipelines
# HunyuanWorld image-to-panorama pipeline
from hy3dworld import Image2PanoramaPipelines
from hy3dworld import Perspective


class Text2PanoramaDemo:
    def __init__(self):
        # default panorama resolution
        self.height = 960
        self.width = 1920

        # panorama parameters
        # these parameters control the panorama generation;
        # adjust them according to your needs
        self.guidance_scale = 30
        self.shifting_extend = 0
        self.num_inference_steps = 50
        self.true_cfg_scale = 0.0
        self.blend_extend = 6

        # model paths
        self.lora_path = "tencent/HunyuanWorld-1"
        self.model_path = "black-forest-labs/FLUX.1-dev"

        # load the pipeline
        # use bfloat16 to save some VRAM
        self.pipe = Text2PanoramaPipelines.from_pretrained(
            self.model_path,
            torch_dtype=torch.bfloat16
        ).to("cuda")
        # and load the LoRA weights
        self.pipe.load_lora_weights(
            self.lora_path,
            subfolder="HunyuanWorld-PanoDiT-Text",
            weight_name="lora.safetensors",
            torch_dtype=torch.bfloat16
        )
        # save some VRAM by offloading the model to CPU
        self.pipe.enable_model_cpu_offload()
        # and enable VAE tiling to save some more VRAM
        self.pipe.enable_vae_tiling()

    def run(self, prompt, negative_prompt=None, seed=42, output_path='output_panorama'):
        # generate the panorama
        image = self.pipe(
            prompt,
            height=self.height,
            width=self.width,
            negative_prompt=negative_prompt,
            generator=torch.Generator("cpu").manual_seed(seed),
            num_inference_steps=self.num_inference_steps,
            guidance_scale=self.guidance_scale,
            blend_extend=self.blend_extend,
            true_cfg_scale=self.true_cfg_scale,
        ).images[0]

        # create the output directory if it does not exist
        os.makedirs(output_path, exist_ok=True)
        # make sure we have a PIL image before saving
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)
        # save the panorama to the output path
        image.save(os.path.join(output_path, 'panorama.png'))
        return image
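
# Example usage of Text2PanoramaDemo (illustrative sketch; assumes a CUDA GPU and that
# the FLUX.1-dev base model and HunyuanWorld-1 LoRA weights can be downloaded):
#   demo = Text2PanoramaDemo()
#   demo.run("a quiet lakeside village at sunset", seed=42, output_path="results")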


class Image2PanoramaDemo:
    def __init__(self):
        # default panorama resolution (alternatively 768 x 1536)
        self.height, self.width = 960, 1920

        # panorama parameters
        # these parameters control the panorama generation;
        # adjust them according to your needs
        self.THETA = 0
        self.PHI = 0
        self.FOV = 80
        self.guidance_scale = 30
        self.num_inference_steps = 50
        self.true_cfg_scale = 2.0
        self.shifting_extend = 0
        self.blend_extend = 6

        # model paths
        self.lora_path = "tencent/HunyuanWorld-1"
        self.model_path = "black-forest-labs/FLUX.1-Fill-dev"

        # load the pipeline
        # use bfloat16 to save some VRAM
        self.pipe = Image2PanoramaPipelines.from_pretrained(
            self.model_path,
            torch_dtype=torch.bfloat16
        ).to("cuda")
        # and load the LoRA weights
        self.pipe.load_lora_weights(
            self.lora_path,
            subfolder="HunyuanWorld-PanoDiT-Image",
            weight_name="lora.safetensors",
            torch_dtype=torch.bfloat16
        )
        # save some VRAM by offloading the model to CPU
        self.pipe.enable_model_cpu_offload()
        # and enable VAE tiling to save some more VRAM
        self.pipe.enable_vae_tiling()

        # general prompts appended to every request
        self.general_negative_prompt = (
            "human, person, people, messy, "
            "low-quality, blur, noise, low-resolution"
        )
        self.general_positive_prompt = "high-quality, high-resolution, sharp, clear, 8k"

    def run(self, prompt, negative_prompt, image_path, seed=42, output_path='output_panorama'):
        # append the general prompts
        prompt = prompt + ", " + self.general_positive_prompt
        negative_prompt = self.general_negative_prompt + ", " + negative_prompt

        # read the input perspective image
        perspective_img = cv2.imread(image_path)
        height_fov, width_fov = perspective_img.shape[:2]
        # resize the perspective image so that it spans self.FOV degrees of the
        # panorama while preserving its aspect ratio
        if width_fov > height_fov:
            ratio = width_fov / height_fov
            w = int((self.FOV / 360) * self.width)
            h = int(w / ratio)
            perspective_img = cv2.resize(
                perspective_img, (w, h), interpolation=cv2.INTER_AREA)
        else:
            ratio = height_fov / width_fov
            h = int((self.FOV / 180) * self.height)
            w = int(h / ratio)
            perspective_img = cv2.resize(
                perspective_img, (w, h), interpolation=cv2.INTER_AREA)

        # project the perspective image onto an equirectangular canvas
        equ = Perspective(perspective_img, self.FOV,
                          self.THETA, self.PHI, crop_bound=False)
        img, mask = equ.GetEquirec(self.height, self.width)
        # erode the mask to avoid seams at the projection boundary
        mask = cv2.erode(mask.astype(np.uint8), np.ones(
            (3, 3), np.uint8), iterations=5)
        img = img * mask
        # invert the mask so the region to be generated is white
        mask = mask.astype(np.uint8) * 255
        mask = 255 - mask
        mask = Image.fromarray(mask[:, :, 0])
        img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB)
        img = Image.fromarray(img)

        # outpaint the panorama from the projected image and mask
        image = self.pipe(
            prompt=prompt,
            image=img,
            mask_image=mask,
            height=self.height,
            width=self.width,
            negative_prompt=negative_prompt,
            guidance_scale=self.guidance_scale,
            num_inference_steps=self.num_inference_steps,
            generator=torch.Generator("cpu").manual_seed(seed),
            blend_extend=self.blend_extend,
            shifting_extend=self.shifting_extend,
            true_cfg_scale=self.true_cfg_scale,
        ).images[0]

        # create the output directory if it does not exist, then save the panorama
        os.makedirs(output_path, exist_ok=True)
        image.save(os.path.join(output_path, 'panorama.png'))
        return image
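
# Example usage of Image2PanoramaDemo (illustrative sketch; "input.jpg" is a placeholder path):
#   demo = Image2PanoramaDemo()
#   demo.run("a sunny park", "cartoon, painting", "input.jpg", seed=42, output_path="results")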


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Text/Image to Panorama Demo")
    parser.add_argument("--prompt", type=str,
                        default="", help="Prompt for image generation")
    parser.add_argument("--negative_prompt", type=str,
                        default="", help="Negative prompt for image generation")
    parser.add_argument("--image_path", type=str,
                        default=None, help="Path to the input image")
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed for reproducibility")
    parser.add_argument("--output_path", type=str, default="results",
                        help="Path to save the output results")
    args = parser.parse_args()

    os.makedirs(args.output_path, exist_ok=True)
    print(f"Output will be saved to: {args.output_path}")

    if args.image_path is None:
        print("No image path provided, using text-to-panorama generation.")
        demo_T2P = Text2PanoramaDemo()
        panorama_image = demo_T2P.run(
            args.prompt, args.negative_prompt, args.seed, args.output_path)
    else:
        if not os.path.exists(args.image_path):
            raise FileNotFoundError(
                f"Image path {args.image_path} does not exist.")
        print(f"Using image at {args.image_path} for panorama generation.")
        demo_I2P = Image2PanoramaDemo()
        panorama_image = demo_I2P.run(
            args.prompt, args.negative_prompt, args.image_path, args.seed, args.output_path)
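
# Example invocations (illustrative; the script name "demo_panogen.py" is an assumption):
#   python demo_panogen.py --prompt "an alpine meadow under a clear sky" --output_path results
#   python demo_panogen.py --prompt "extend this scene" --image_path ./input.png --output_path results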